From e4caaa2a77d55d0f5333fc3af37e09fd4e20c9d9 Mon Sep 17 00:00:00 2001 From: Vladislav Sytchenko Date: Mon, 5 Oct 2020 13:20:58 -0400 Subject: [PATCH] Revert "Revert "Merge branch 'amd-master-next' into amd-npi-next"" This reverts commit 28b17d3dbd52180189111503ed46706a6dd31b1c. Reason for revert: Change-Id: I92ceb171e31026ed1864704cef2fc1497b883ef9 [ROCm/hip commit: ad2d55c144f794e4e029fd49f6c5759470a5d705] --- projects/hip/CMakeLists.txt | 52 +- projects/hip/CONTRIBUTING.md | 104 ++- projects/hip/bin/hip_embed_pch.sh | 9 +- projects/hip/bin/hip_gen_pch.sh | 36 - projects/hip/bin/hipcc | 3 +- projects/hip/docs/markdown/hip_faq.md | 2 +- projects/hip/hip-config.cmake.in | 28 +- .../include/hip/hcc_detail/device_functions.h | 138 +++- .../hip/include/hip/hcc_detail/hip_runtime.h | 16 + .../include/hip/hcc_detail/hip_runtime_api.h | 13 +- .../hip/include/hip/hip_cooperative_groups.h | 9 +- projects/hip/include/hip/hip_runtime_api.h | 1 + .../hip/nvcc_detail/hip_cooperative_groups.h | 12 + .../hip/include/hip/nvcc_detail/hip_runtime.h | 4 +- .../include/hip/nvcc_detail/hip_runtime_api.h | 1 + projects/hip/lpl_ca/CMakeLists.txt | 2 + projects/hip/packaging/hip-base.txt | 11 +- projects/hip/packaging/hip-doc.txt | 15 +- projects/hip/packaging/hip-hcc.txt | 13 +- projects/hip/packaging/hip-nvcc.txt | 13 +- projects/hip/packaging/hip-rocclr.txt | 13 +- projects/hip/packaging/hip-samples.txt | 13 +- projects/hip/rocclr/CMakeLists.txt | 30 +- projects/hip/rocclr/hip_code_object.cpp | 25 - projects/hip/rocclr/hip_code_object.hpp | 2 - projects/hip/rocclr/hip_device.cpp | 2 +- projects/hip/rocclr/hip_fatbin.cpp | 7 +- projects/hip/rocclr/hip_global.cpp | 4 +- projects/hip/rocclr/hip_global.hpp | 5 - projects/hip/rocclr/hip_internal.hpp | 2 - projects/hip/rocclr/hip_memory.cpp | 73 +- projects/hip/rocclr/hip_module.cpp | 2 +- projects/hip/rocclr/hip_peer.cpp | 4 + projects/hip/rocclr/hip_platform.cpp | 41 +- projects/hip/rocclr/hip_platform.hpp | 5 - 
.../0_Intro/bit_extract/CMakeLists.txt | 20 + .../hip/samples/0_Intro/bit_extract/Makefile | 10 +- .../samples/0_Intro/module_api/CMakeLists.txt | 36 + .../0_Intro/module_api_global/CMakeLists.txt | 30 + .../hip/samples/0_Intro/square/CMakeLists.txt | 21 + projects/hip/samples/0_Intro/square/Makefile | 7 +- projects/hip/samples/0_Intro/square/README.md | 42 +- .../1_Utils/hipBusBandwidth/CMakeLists.txt | 20 + .../hipBusBandwidth/hipBusBandwidth.cpp | 446 +++++------ .../1_Utils/hipCommander/CMakeLists.txt | 31 + .../1_Utils/hipDispatchLatency/CMakeLists.txt | 35 + .../samples/1_Utils/hipInfo/CMakeLists.txt | 20 + .../0_MatrixTranspose/CMakeLists.txt | 20 + .../2_Cookbook/10_inline_asm/CMakeLists.txt | 20 + .../11_texture_driver/CMakeLists.txt | 30 + .../2_Cookbook/13_occupancy/CMakeLists.txt | 20 + .../2_Cookbook/1_hipEvent/CMakeLists.txt | 20 + .../2_Cookbook/3_shared_memory/CMakeLists.txt | 20 + .../samples/2_Cookbook/4_shfl/CMakeLists.txt | 20 + .../2_Cookbook/5_2dshfl/CMakeLists.txt | 19 + .../6_dynamic_shared/CMakeLists.txt | 19 + .../2_Cookbook/7_streams/CMakeLists.txt | 19 + .../2_Cookbook/8_peer2peer/CMakeLists.txt | 19 + .../2_Cookbook/9_unroll/CMakeLists.txt | 19 + projects/hip/samples/README.md | 27 + projects/hip/tests/hit/HIT.cmake | 1 + .../performance/compute/hipPerfMandelbrot.cpp | 747 ++++++++++++++++++ .../stream/hipPerfDeviceConcurrency.cpp | 289 +++++++ .../hip/tests/src/kernel/hipShflTests.cpp | 17 +- .../tests/src/kernel/hipShflUpDownTest.cpp | 64 +- .../tests/src/p2p/hipPeerToPeer_simple.cpp | 3 + .../cooperativeGrps/api_failure_tests.cpp | 280 +++++++ .../cooperativeGrps/cooperative_streams.cpp | 283 +++++++ .../grid_group_data_sharing.cpp | 303 +++++++ .../cooperativeGrps}/hipCGGridGroupType.cpp | 8 +- .../hipCGGridGroupTypeViaBaseType.cpp | 8 +- .../hipCGGridGroupTypeViaPublicApi.cpp | 8 +- .../hipCGMultiGridGroupType.cpp | 22 +- .../hipCGMultiGridGroupTypeViaBaseType.cpp | 39 +- .../hipCGMultiGridGroupTypeViaPublicApi.cpp | 39 +- 
.../cooperativeGrps}/hipCGThreadBlockType.cpp | 12 +- .../hipCGThreadBlockTypeViaBaseType.cpp | 12 +- .../hipCGThreadBlockTypeViaPublicApi.cpp | 12 +- .../hipLaunchCoopMultiKernel.cpp | 2 +- .../hipLaunchCooperativeKernel.cpp | 7 +- .../multi_gpu_api_failure_tests.cpp | 568 +++++++++++++ .../cooperativeGrps/multi_gpu_streams.cpp | 581 ++++++++++++++ .../multi_grid_group_all_gpus.cpp | 374 +++++++++ .../simple_grid_group_barrier.cpp | 233 ++++++ .../simple_multi_grid_group_barrier.cpp | 374 +++++++++ .../device/hipDeviceGetPCIBusId.cpp | 346 ++++---- .../src/runtimeApi/device/hipSetGetDevice.cpp | 2 +- .../runtimeApi/memory/hipIpcMemAccessTest.cpp | 227 ++++++ .../memory/hipMallocConcurrency.cpp | 487 ++++++++++++ .../memory/hipMallocManaged_MultiScenario.cpp | 423 ++++++++++ .../src/runtimeApi/memory/hipMemcpyDtoD.cpp | 3 + .../runtimeApi/memory/hipMemcpyDtoDAsync.cpp | 3 + .../src/runtimeApi/memory/hipMemcpyPeer.cpp | 3 + .../runtimeApi/memory/hipMemcpyPeerAsync.cpp | 3 + .../runtimeApi/memory/hipMemcpyWithStream.cpp | 2 +- .../memory/hipMemcpyWithStreamMultiThread.cpp | 2 +- .../src/runtimeApi/memory/hipMemset2D.cpp | 2 +- .../hipMultiMemcpyMultiThrdMultiStrm.cpp | 2 +- .../memory/hipMultiMemcpyMultiThread.cpp | 2 +- .../module/hipFuncSetSharedMemConfig.cpp | 27 + .../hipModuleLoadDataMultThreadOnMultGPU.cpp | 2 +- .../module/hipModuleLoadDataMultThreaded.cpp | 2 +- .../stream/hipStreamACb_AltEnqueue.cpp | 2 +- .../stream/hipStreamACb_MStrm_Mgpu.cpp | 2 +- .../stream/hipStreamACb_MultiCalls.cpp | 2 +- .../stream/hipStreamACb_StrmSyncTiming.cpp | 2 +- .../stream/hipStreamACb_ThrdBehaviour.cpp | 2 +- .../runtimeApi/stream/hipStreamACb_order.cpp | 2 +- .../stream/hipStreamGetPriority.cpp | 2 +- projects/hip/tests/src/test_common.h | 14 +- projects/hip/tests/unit/test_common.h | 1 - 111 files changed, 6800 insertions(+), 753 deletions(-) delete mode 100755 projects/hip/bin/hip_gen_pch.sh create mode 100644 projects/hip/include/hip/nvcc_detail/hip_cooperative_groups.h 
create mode 100644 projects/hip/samples/0_Intro/bit_extract/CMakeLists.txt create mode 100644 projects/hip/samples/0_Intro/module_api/CMakeLists.txt create mode 100644 projects/hip/samples/0_Intro/module_api_global/CMakeLists.txt create mode 100644 projects/hip/samples/0_Intro/square/CMakeLists.txt create mode 100644 projects/hip/samples/1_Utils/hipBusBandwidth/CMakeLists.txt create mode 100644 projects/hip/samples/1_Utils/hipCommander/CMakeLists.txt create mode 100644 projects/hip/samples/1_Utils/hipDispatchLatency/CMakeLists.txt create mode 100644 projects/hip/samples/1_Utils/hipInfo/CMakeLists.txt create mode 100644 projects/hip/samples/2_Cookbook/0_MatrixTranspose/CMakeLists.txt create mode 100644 projects/hip/samples/2_Cookbook/10_inline_asm/CMakeLists.txt create mode 100644 projects/hip/samples/2_Cookbook/11_texture_driver/CMakeLists.txt create mode 100644 projects/hip/samples/2_Cookbook/13_occupancy/CMakeLists.txt create mode 100644 projects/hip/samples/2_Cookbook/1_hipEvent/CMakeLists.txt create mode 100644 projects/hip/samples/2_Cookbook/3_shared_memory/CMakeLists.txt create mode 100644 projects/hip/samples/2_Cookbook/4_shfl/CMakeLists.txt create mode 100644 projects/hip/samples/2_Cookbook/5_2dshfl/CMakeLists.txt create mode 100644 projects/hip/samples/2_Cookbook/6_dynamic_shared/CMakeLists.txt create mode 100644 projects/hip/samples/2_Cookbook/7_streams/CMakeLists.txt create mode 100644 projects/hip/samples/2_Cookbook/8_peer2peer/CMakeLists.txt create mode 100644 projects/hip/samples/2_Cookbook/9_unroll/CMakeLists.txt create mode 100644 projects/hip/samples/README.md mode change 100644 => 100755 projects/hip/tests/hit/HIT.cmake create mode 100644 projects/hip/tests/performance/compute/hipPerfMandelbrot.cpp create mode 100644 projects/hip/tests/performance/stream/hipPerfDeviceConcurrency.cpp mode change 100644 => 100755 projects/hip/tests/src/p2p/hipPeerToPeer_simple.cpp create mode 100644 
projects/hip/tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp create mode 100644 projects/hip/tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp create mode 100644 projects/hip/tests/src/runtimeApi/cooperativeGrps/grid_group_data_sharing.cpp rename projects/hip/tests/src/{cg => runtimeApi/cooperativeGrps}/hipCGGridGroupType.cpp (97%) mode change 100644 => 100755 rename projects/hip/tests/src/{cg => runtimeApi/cooperativeGrps}/hipCGGridGroupTypeViaBaseType.cpp (97%) mode change 100644 => 100755 rename projects/hip/tests/src/{cg => runtimeApi/cooperativeGrps}/hipCGGridGroupTypeViaPublicApi.cpp (97%) mode change 100644 => 100755 rename projects/hip/tests/src/{cg => runtimeApi/cooperativeGrps}/hipCGMultiGridGroupType.cpp (92%) mode change 100644 => 100755 rename projects/hip/tests/src/{cg => runtimeApi/cooperativeGrps}/hipCGMultiGridGroupTypeViaBaseType.cpp (83%) rename projects/hip/tests/src/{cg => runtimeApi/cooperativeGrps}/hipCGMultiGridGroupTypeViaPublicApi.cpp (83%) rename projects/hip/tests/src/{cg => runtimeApi/cooperativeGrps}/hipCGThreadBlockType.cpp (95%) mode change 100644 => 100755 rename projects/hip/tests/src/{cg => runtimeApi/cooperativeGrps}/hipCGThreadBlockTypeViaBaseType.cpp (94%) mode change 100644 => 100755 rename projects/hip/tests/src/{cg => runtimeApi/cooperativeGrps}/hipCGThreadBlockTypeViaPublicApi.cpp (94%) mode change 100644 => 100755 rename projects/hip/tests/src/runtimeApi/{module => cooperativeGrps}/hipLaunchCoopMultiKernel.cpp (98%) rename projects/hip/tests/src/runtimeApi/{module => cooperativeGrps}/hipLaunchCooperativeKernel.cpp (94%) create mode 100644 projects/hip/tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp create mode 100644 projects/hip/tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp create mode 100644 projects/hip/tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp create mode 100644 
projects/hip/tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp create mode 100644 projects/hip/tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp create mode 100644 projects/hip/tests/src/runtimeApi/memory/hipIpcMemAccessTest.cpp create mode 100644 projects/hip/tests/src/runtimeApi/memory/hipMallocConcurrency.cpp create mode 100644 projects/hip/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp create mode 100644 projects/hip/tests/src/runtimeApi/module/hipFuncSetSharedMemConfig.cpp mode change 100644 => 100755 projects/hip/tests/src/test_common.h mode change 100644 => 100755 projects/hip/tests/unit/test_common.h diff --git a/projects/hip/CMakeLists.txt b/projects/hip/CMakeLists.txt index c5a49feaa3..0c7156c478 100755 --- a/projects/hip/CMakeLists.txt +++ b/projects/hip/CMakeLists.txt @@ -8,10 +8,15 @@ set(BUILD_SHARED_LIBS ON CACHE BOOL "Build shared library (.so) or static lib ( set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") -if(NOT ${BUILD_SHARED_LIBS} AND NOT DEFINED ENABLE_HIP_PCH) - set(ENABLE_HIP_PCH ON CACHE BOOL "enable/disable pre-compiled hip headers") +if(NOT DEFINED __HIP_ENABLE_PCH) + set(__HIP_ENABLE_PCH ON CACHE BOOL "enable/disable pre-compiled hip headers") endif() +if(${__HIP_ENABLE_PCH}) + set(_pchStatus 1) +else() + set(_pchStatus 0) +endif() ############################# # Options ############################# @@ -80,8 +85,8 @@ if(GIT_FOUND) set(HIP_VERSION_PATCH ${HIP_VERSION_GITDATE}-${HIP_VERSION_GITHASH}) - if(DEFINED ENV{ROCM_BUILD_ID}) - set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}-$ENV{ROCM_BUILD_ID}-${HIP_VERSION_GITHASH}) + if(DEFINED ENV{ROCM_LIBPATCH_VERSION}) + set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}.$ENV{ROCM_LIBPATCH_VERSION}) else() set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}-${HIP_VERSION_GITHASH}) endif() @@ -90,6 +95,36 @@ else() 
set(HIP_PACKAGING_VERSION_PATCH "0") endif() +## Debian package specific variables +if ( DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE} ) + set ( CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE} ) +else() + set ( CPACK_DEBIAN_PACKAGE_RELEASE "local" ) +endif() +message ( "Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}" ) + +## RPM package specific variables +if ( DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE} ) + set ( CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE} ) +else() + set ( CPACK_RPM_PACKAGE_RELEASE "local" ) +endif() + +## 'dist' breaks manual builds on debian systems due to empty Provides +execute_process( COMMAND rpm --eval %{?dist} + RESULT_VARIABLE PROC_RESULT + OUTPUT_VARIABLE EVAL_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE ) + +if ( PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "" ) + string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" ) +endif() +message("CPACK_RPM_PACKAGE_RELEASE: ${CPACK_RPM_PACKAGE_RELEASE}") + +add_to_config(_versionInfo HIP_PACKAGING_VERSION_PATCH) +add_to_config(_versionInfo CPACK_DEBIAN_PACKAGE_RELEASE) +add_to_config(_versionInfo CPACK_RPM_PACKAGE_RELEASE) + add_to_config(_versionInfo HIP_VERSION_MAJOR) add_to_config(_versionInfo HIP_VERSION_MINOR) add_to_config(_versionInfo HIP_VERSION_PATCH) @@ -102,7 +137,6 @@ else () set (HIP_LIB_VERSION_PATCH ${HIP_VERSION_PATCH}) endif () set (HIP_LIB_VERSION_STRING "${HIP_LIB_VERSION_MAJOR}.${HIP_LIB_VERSION_MINOR}.${HIP_LIB_VERSION_PATCH}") - if (DEFINED ENV{ROCM_RPATH}) set (CMAKE_INSTALL_RPATH "$ENV{ROCM_RPATH}") set (CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) @@ -456,6 +490,7 @@ set(_versionInfoHeader #define HIP_VERSION_MINOR ${HIP_VERSION_MINOR} #define HIP_VERSION_PATCH ${HIP_VERSION_GITDATE} #define HIP_VERSION (HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR)\n +#define __HIP_HAS_GET_PCH ${_pchStatus}\n #endif\n ") file(WRITE "${PROJECT_BINARY_DIR}/include/hip/hip_version.h" ${_versionInfoHeader}) @@ -669,8 +704,11 @@ endif() # Testing steps 
############################# # Target: test -set(HIP_ROOT_DIR ${CMAKE_INSTALL_PREFIX}) +set(HIP_ROOT_DIR ${CMAKE_CURRENT_BINARY_DIR}) set(HIP_SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR}) +if(HIP_PLATFORM STREQUAL "nvcc") + execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/include" "${CMAKE_CURRENT_BINARY_DIR}/include" RESULT_VARIABLE RUN_HIT ERROR_QUIET) +endif() execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/cmake" "${HIP_ROOT_DIR}/cmake" RESULT_VARIABLE RUN_HIT ERROR_QUIET) if(${RUN_HIT} EQUAL 0) execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/bin" "${HIP_ROOT_DIR}/bin" RESULT_VARIABLE RUN_HIT ERROR_QUIET) @@ -713,7 +751,7 @@ endif() ############################# # Target: clang if(HIP_HIPCC_EXECUTABLE) - add_custom_target(analyze + add_custom_target(analyze COMMAND ${HIP_HIPCC_EXECUTABLE} -fvisibility=hidden -fvisibility-inlines-hidden --analyze --analyzer-outputtext -isystem /opt/rocm/include ${HIP_HCC_BUILD_FLAGS} -Wno-unused-command-line-argument -I/opt/rocm/include -c src/*.cpp -Iinclude/ -I./ WORKING_DIRECTORY ${HIP_SRC_PATH}) if(CPPCHECK_EXE) diff --git a/projects/hip/CONTRIBUTING.md b/projects/hip/CONTRIBUTING.md index d9d353681d..750e6759c2 100644 --- a/projects/hip/CONTRIBUTING.md +++ b/projects/hip/CONTRIBUTING.md @@ -1,15 +1,15 @@ -# Contributor Guidelines +# Contributor Guidelines ## Make Tips -When building HIP, you will likely want to build and install to a local user-accessible directory (rather than /opt/rocm). -This can be easily be done by setting the -DCMAKE_INSTALL_PREFIX variable when running cmake. Typical use case is to +When building HIP, you will likely want to build and install to a local user-accessible directory (rather than /opt/rocm). +This can be easily be done by setting the -DCMAKE_INSTALL_PREFIX variable when running cmake. Typical use case is to set CMAKE_INSTALL_PREFIX to your HIP git root, and then ensure HIP_PATH points to this directory. 
For example ``` cmake .. -DCMAKE_INSTALL_PREFIX=.. make install -export HIP_PATH= +export HIP_PATH= ``` After making HIP, don't forget the "make install" step ! @@ -21,118 +21,110 @@ After making HIP, don't forget the "make install" step ! - Add a translation to the hipify-clang tool ; many examples abound. - For stat tracking purposes, place the API into an appropriate stat category ("dev", "mem", "stream", etc). - Add a inlined NVCC implementation for the function in include/hip/nvcc_detail/hip_runtime_api.h. - - These are typically headers - - Add an HCC definition and Doxygen comments for the function in include/hcc_detail/hip_runtime_api.h - - Source implementation typically go in src/hcc_detail/hip_hcc.cpp. The implementation may involve - calls to HCC runtime or HSA runtime, or interact with other pieces of the HIP runtime (ie for - hipStream_t). + - These are typically headers + - Add an HIP_ROCclr definition and Doxygen comments for the function in include/hcc_detail/hip_runtime_api.h + - Source implementation typically go in hip/rocclr/hip_*.cpp. The implementation involve calls to HIP runtime (ie for hipStream_t). -#### Testing HCC version -In some cases new HIP features are tied to specified releases of HCC, and it can be useful to determine at compile-time -if the current HCC compiler is sufficiently new enough to support the desired feature. The `__hcc_workweek__` compiler -define is a monotonically increasing integer value that combines the year + workweek + day-of-week (0-6, Sunday is 0) -(ie 15403, 16014, etc). -The granularity is one day, so __hcc_workweek__ can only be used to distinguish compiler builds that are at least one day apart. +## Check HIP-Clang version +In some cases new HIP-Clang features are tied to specified releases, and it can be useful to check the current version is sufficiently new enough to support the desired feature. 
+ +HIP runtime version ``` -#ifdef __hcc_workweek_ > 16014 -// use cool new HCC feature here -#endif +> cat /opt/rocm/hip/bin/.hipVersion +# Auto-generated by cmake +HIP_VERSION_MAJOR=3 +HIP_VERSION_MINOR=9 +HIP_VERSION_PATCH=20345-519ef3f2 ``` -Additionally, hcc binary can print the work-week to stdout: ("16014" in the version info below.)4 +HIP-Clang compiler version + ``` -> /opt/rocm/hcc/bin/hcc -v -HCC clang version 3.5.0 (based on HCC 0.8.16014-81f8a3f-f155163-5a1009a LLVM 3.5.0svn) +$ /opt/rocm/llvm/bin/clang -v +clang version 11.0.0 (/src/external/llvm-project/clang 075fedd3fd2f4d9d8cca79d0cd51f64c5ef21432) Target: x86_64-unknown-linux-gnu Thread model: posix -Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8 -Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8.4 -Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.9 -Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.9.1 -Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8 +InstalledDir: /opt/rocm/llvm/bin +Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7 +Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7.5.0 +Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/8 +Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/9 +Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/9 Candidate multilib: .;@m64 Candidate multilib: 32;@m32 Candidate multilib: x32;@mx32 Selected multilib: .;@m64 ``` -The unix `date` command can print the HCC-format work-week for a specific date , ie: -``` -> date --utc +%y%U%w -d 2015-11-09 -15451 -``` - ## Unit Testing Environment -HIP includes unit tests in the tests/src directory. +HIP includes unit tests in the tests/src directory. When adding a new HIP feature, add a new unit test as well. See [tests/README.md](README.md) for more information. 
## Development Flow -It is recommended that developers set the flag HIP_BUILD_LOCAL=1 so that the unit testing environment automatically rebuilds libhip_hcc.a and the tests when a change it made to the HIP source. -Directed tests provide a great place to develop new features alongside the associated test. + +Directed tests provide a great place to develop new features alongside the associated test. For applications and benchmarks outside the directed test environment, developments should use a two-step development flow: -- #1. Compile, link, and install HCC. See [Installation](README.md#Installation) notes. -- #2. Relink the target application to include changes in the libhip_hcc.a file. +- #1. Compile, link, and install HIP/ROCclr. See [Installation](README.md#Installation) notes. +- #2. Relink the target application to include changes in HIP runtime file. ## Environment Variables -- **HIP_PATH** : Location of HIP include, src, bin, lib directories. -- **HCC_HOME** : Path to HCC compiler. Default /opt/rocm/hcc. +- **HIP_PATH** : Location of HIP include, src, bin, lib directories. +- **HCC_ROCCLR_HOME** : Path to HIP/ROCclr directory, used on AMD platforms. Default /opt/rocm/rocclr. - **HSA_PATH** : Path to HSA include, lib. Default /opt/rocm/hsa. - **CUDA_PATH* : On nvcc system, this points to root of CUDA installation. -### Contribution guidelines ### +## Contribution guidelines ## Features (ie functions, classes, types) defined in hip*.h should resemble CUDA APIs. The HIP interface is designed to be very familiar for CUDA programmers. -Differences or limitations of HIP APIs as compared to CUDA APIs should be clearly documented and described. +Differences or limitations of HIP APIs as compared to CUDA APIs should be clearly documented and described. -## Coding Guidelines (in brief) +### Coding Guidelines (in brief) - Code Indentation: - Tabs should be expanded to spaces. - Use 4 spaces indentation. 
- Capitalization and Naming - - Prefer camelCase for HIP interfaces and internal symbols. Note HCC uses _ for separator. + - Prefer camelCase for HIP interfaces and internal symbols. Note HCC uses _ for separator. This guideline is not yet consistently followed in HIP code - eventual compliance is aspirational. - Member variables should begin with a leading "_". This allows them to be easily distinguished from other variables or functions. - - {} placement - For functions, the opening { should be placed on a new line. - For if/else blocks, the opening { is placed on same line as the if/else. Use a space to separate {/" from if/else. Example ''' if (foo) { - doFoo() - } else { + doFoo() + } else { doFooElse(); } ''' - namespace should be on same line as { and separated by a space. - Single-line if statement should still use {/} pair (even though C++ does not require). - Miscellaneous - - All references in function parameter lists should be const. + - All references in function parameter lists should be const. - "ihip" = internal hip structures. These should not be exposed through the HIP API. - Keyword TODO refers to a note that should be addressed in long-term. Could be style issue, software architecture, or known bugs. - FIXME refers to a short-term bug that needs to be addressed. - HIP_INIT_API() should be placed at the start of each top-level HIP API. This function will make sure the HIP runtime is initialized, and also constructs an appropriate API string for tracing and CodeXL marker tracing. The arguments to HIP_INIT_API should match - those of the parent function. -- ihipLogStatus should only be called from top-level HIP APIs,and should be called to log and return the error code. The error code + those of the parent function. +- ihipLogStatus should only be called from top-level HIP APIs,and should be called to log and return the error code. 
The error code is used by the GetLastError and PeekLastError functions - if a HIP API simply returns, then the error will not be logged correctly. - All HIP environment variables should begin with the keyword HIP_ Environment variables should be long enough to describe their purpose but short enough so they can be remembered - perhaps 10-20 characters, with 3-4 parts separated by underscores. To see the list of current environment variables, along with their values, set HIP_PRINT_ENV and run any hip applications on ROCm platform . - HIPCC or other tools may support additional environment variables which should follow the above convention. + HIPCC or other tools may support additional environment variables which should follow the above convention. - -#### Presubmit Testing: -Before checking in or submitting a pull request, run all directed tests (see tests/README.md) and all Rodinia tests. +### Presubmit Testing: +Before checking in or submitting a pull request, run all directed tests (see tests/README.md) and all Rodinia tests. Ensure pass results match starting point: ```shell @@ -141,13 +133,13 @@ Ensure pass results match starting point: ``` -#### Checkin messages +### Checkin messages Follow existing best practice for writing a good Git commit message. Some tips: http://chris.beams.io/posts/git-commit/ https://robots.thoughtbot.com/5-useful-tips-for-a-better-commit-message -In particular : - - Use imperative voice, ie "Fix this bug", "Refactor the XYZ routine", "Update the doc". +In particular : + - Use imperative voice, ie "Fix this bug", "Refactor the XYZ routine", "Update the doc". Not : "Fixing the bug", "Fixed the bug", "Bug fix", etc. - Subject should summarize the commit. Do not end subject with a period. Use a blank line after the subject. 
diff --git a/projects/hip/bin/hip_embed_pch.sh b/projects/hip/bin/hip_embed_pch.sh index 8fe3c20f98..0f2cbabd84 100755 --- a/projects/hip/bin/hip_embed_pch.sh +++ b/projects/hip/bin/hip_embed_pch.sh @@ -1,8 +1,7 @@ #!/bin/bash #set -x - -ROCM_PATH=${ROCM_PATH:-/opt/rocm} +LLVM_DIR="$1/../../../" tmp=/tmp/hip_pch.$$ mkdir -p $tmp @@ -47,12 +46,12 @@ __hip_pch_size: .long __hip_pch_size - __hip_pch EOF -$ROCM_PATH/llvm/bin/clang -O3 -c -std=c++17 -isystem /opt/rocm/llvm/lib/clang/11.0.0/include/.. -isystem /opt/rocm/include -nogpulib --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch.cui +$LLVM_DIR/bin/clang -O3 -c -std=c++17 -isystem $LLVM_DIR/lib/clang/11.0.0/include/.. -isystem /opt/rocm/include -nogpulib --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch.cui cat $tmp/hip_macros.h >> $tmp/pch.cui -$ROCM_PATH/llvm/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip.pch -x hip-cpp-output - <$tmp/pch.cui +$LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip.pch -x hip-cpp-output - <$tmp/pch.cui -$ROCM_PATH/llvm/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj +$LLVM_DIR/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj rm -rf $tmp diff --git a/projects/hip/bin/hip_gen_pch.sh b/projects/hip/bin/hip_gen_pch.sh deleted file mode 100755 index b212177119..0000000000 --- a/projects/hip/bin/hip_gen_pch.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -#set -x - -cat >/tmp/hip_macros.h </tmp/hip_pch.h </tmp/pch.cui - -cat /tmp/hip_macros.h >> /tmp/pch.cui - -/opt/rocm/llvm/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o /tmp/hip.pch -x hip-cpp-output - (tmp[1]) << 32ull) | static_cast(tmp[0]); + unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, 
sizeof(tmp0)); + return tmp1; + #else + static_assert(sizeof(unsigned long) == sizeof(unsigned int), ""); + return static_cast(__shfl(static_cast(var), src_lane, width)); + #endif +} +__device__ +inline long long __shfl(long long var, int src_lane, int width = warpSize) { static_assert(sizeof(long long) == 2 * sizeof(int), ""); @@ -378,8 +397,22 @@ long long __shfl(long long var, int src_lane, int width = warpSize) long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } +__device__ +inline +unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) { + static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), ""); + static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); - __device__ + unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl(tmp[0], src_lane, width); + tmp[1] = __shfl(tmp[1], src_lane, width); + + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; +} + +__device__ inline int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) { int self = __lane_id(); @@ -435,6 +468,28 @@ long __shfl_up(long var, unsigned int lane_delta, int width = warpSize) return static_cast(__shfl_up(static_cast(var), lane_delta, width)); #endif } + +__device__ +inline +unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize) +{ + #ifndef _MSC_VER + static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), ""); + static_assert(sizeof(unsigned long) == sizeof(uint64_t), ""); + + unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_up(tmp[0], lane_delta, width); + tmp[1] = __shfl_up(tmp[1], lane_delta, width); + + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; + #else + 
static_assert(sizeof(unsigned long) == sizeof(unsigned int), ""); + return static_cast(__shfl_up(static_cast(var), lane_delta, width)); + #endif +} + __device__ inline long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize) @@ -449,6 +504,20 @@ long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize return tmp1; } +__device__ +inline +unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize) +{ + static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), ""); + static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); + unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_up(tmp[0], lane_delta, width); + tmp[1] = __shfl_up(tmp[1], lane_delta, width); + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; +} + __device__ inline int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) { @@ -507,6 +576,26 @@ long __shfl_down(long var, unsigned int lane_delta, int width = warpSize) } __device__ inline +unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize) +{ + #ifndef _MSC_VER + static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), ""); + static_assert(sizeof(unsigned long) == sizeof(uint64_t), ""); + + unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_down(tmp[0], lane_delta, width); + tmp[1] = __shfl_down(tmp[1], lane_delta, width); + + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; + #else + static_assert(sizeof(unsigned long) == sizeof(unsigned int), ""); + return static_cast(__shfl_down(static_cast(var), lane_delta, width)); + #endif +} +__device__ +inline long long __shfl_down(long long var, unsigned int lane_delta, int 
width = warpSize) { static_assert(sizeof(long long) == 2 * sizeof(int), ""); @@ -518,6 +607,19 @@ long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSi long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } +__device__ +inline +unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize) +{ + static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), ""); + static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); + unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_down(tmp[0], lane_delta, width); + tmp[1] = __shfl_down(tmp[1], lane_delta, width); + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; +} __device__ inline @@ -577,6 +679,26 @@ long __shfl_xor(long var, int lane_mask, int width = warpSize) } __device__ inline +unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize) +{ + #ifndef _MSC_VER + static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), ""); + static_assert(sizeof(unsigned long) == sizeof(uint64_t), ""); + + unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_xor(tmp[0], lane_mask, width); + tmp[1] = __shfl_xor(tmp[1], lane_mask, width); + + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; + #else + static_assert(sizeof(unsigned long) == sizeof(unsigned int), ""); + return static_cast(__shfl_xor(static_cast(var), lane_mask, width)); + #endif +} +__device__ +inline long long __shfl_xor(long long var, int lane_mask, int width = warpSize) { static_assert(sizeof(long long) == 2 * sizeof(int), ""); @@ -588,7 +710,19 @@ long long __shfl_xor(long long var, int lane_mask, int width = warpSize) long long tmp1; __builtin_memcpy(&tmp1, &tmp0, 
sizeof(tmp0)); return tmp1; } - +__device__ +inline +unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize) +{ + static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), ""); + static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); + unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_xor(tmp[0], lane_mask, width); + tmp[1] = __shfl_xor(tmp[1], lane_mask, width); + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; +} #define MASK1 0x00ff00ff #define MASK2 0xff00ff00 diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime.h b/projects/hip/include/hip/hcc_detail/hip_runtime.h index 0e5820a016..0a173bb466 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime.h @@ -487,6 +487,22 @@ struct __HIP_Coordinates { #endif }; +template +#if !defined(_MSC_VER) +__attribute__((weak)) +#endif +constexpr typename __HIP_Coordinates::X __HIP_Coordinates::x; +template +#if !defined(_MSC_VER) +__attribute__((weak)) +#endif +constexpr typename __HIP_Coordinates::Y __HIP_Coordinates::y; +template +#if !defined(_MSC_VER) +__attribute__((weak)) +#endif +constexpr typename __HIP_Coordinates::Z __HIP_Coordinates::z; + extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_size(uint); inline __device__ diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h index 37fcccf192..74c0fb5f69 100755 --- a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h @@ -345,13 +345,16 @@ typedef struct hipLaunchParams_t { hipStream_t stream; ///< Stream identifier } hipLaunchParams; -// Pre-Compiled header for online compilation -#ifdef ENABLE_HIP_PCH -extern const char* __hip_pch; -extern unsigned 
__hip_pch_size; -void __hipGetPCH(const char** pch, unsigned int*size); +#if __HIP_HAS_GET_PCH +/** + * Internal use only. This API may change in the future + * Pre-Compiled header for online compilation + * + */ + void __hipGetPCH(const char** pch, unsigned int*size); #endif + // Doxygen end group GlobalDefs /** @} */ diff --git a/projects/hip/include/hip/hip_cooperative_groups.h b/projects/hip/include/hip/hip_cooperative_groups.h index d919e83c7f..41f36378bb 100644 --- a/projects/hip/include/hip/hip_cooperative_groups.h +++ b/projects/hip/include/hip/hip_cooperative_groups.h @@ -28,14 +28,17 @@ THE SOFTWARE. */ #ifndef HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H -#define HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H +#define HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H + +#include +#include #if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__) -#if __cplusplus +#if __cplusplus && defined(__clang__) && defined(__HIP__) #include #endif #elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__) -#include +#include #else #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__"); #endif diff --git a/projects/hip/include/hip/hip_runtime_api.h b/projects/hip/include/hip/hip_runtime_api.h index 4412bbd7da..3a26fb74f4 100644 --- a/projects/hip/include/hip/hip_runtime_api.h +++ b/projects/hip/include/hip/hip_runtime_api.h @@ -32,6 +32,7 @@ THE SOFTWARE. 
#include // for getDeviceProp +#include #include enum { diff --git a/projects/hip/include/hip/nvcc_detail/hip_cooperative_groups.h b/projects/hip/include/hip/nvcc_detail/hip_cooperative_groups.h new file mode 100644 index 0000000000..113e600eec --- /dev/null +++ b/projects/hip/include/hip/nvcc_detail/hip_cooperative_groups.h @@ -0,0 +1,12 @@ +#ifndef HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H +#define HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H + +// Include CUDA headers +#include +#include + +// Include HIP wrapper headers around CUDA +#include +#include + +#endif // HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H diff --git a/projects/hip/include/hip/nvcc_detail/hip_runtime.h b/projects/hip/include/hip/nvcc_detail/hip_runtime.h index c13540df54..e7c3eaf32a 100644 --- a/projects/hip/include/hip/nvcc_detail/hip_runtime.h +++ b/projects/hip/include/hip/nvcc_detail/hip_runtime.h @@ -104,13 +104,13 @@ typedef int hipLaunchParm; #define HIP_DYNAMIC_SHARED_ATTRIBUTE #ifdef __HIP_DEVICE_COMPILE__ -#define abort() \ +#define abort_() \ { asm("trap;"); } #undef assert #define assert(COND) \ { \ if (!COND) { \ - abort(); \ + abort_(); \ } \ } #endif diff --git a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h index faa0bf7d7b..ce1469804e 100755 --- a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h @@ -26,6 +26,7 @@ THE SOFTWARE. 
#include #include #include +#include #ifdef __cplusplus extern "C" { diff --git a/projects/hip/lpl_ca/CMakeLists.txt b/projects/hip/lpl_ca/CMakeLists.txt index c272273c09..2473fbc254 100644 --- a/projects/hip/lpl_ca/CMakeLists.txt +++ b/projects/hip/lpl_ca/CMakeLists.txt @@ -20,6 +20,7 @@ target_include_directories(lpl target_compile_options(lpl PUBLIC -Wall) target_link_libraries(lpl PUBLIC pthread) +add_custom_command(TARGET lpl POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/lpl ${PROJECT_BINARY_DIR}/bin/lpl) install(TARGETS lpl RUNTIME DESTINATION bin) #-------------------------------------LPL--------------------------------------# @@ -43,6 +44,7 @@ find_package(hsa-runtime64 REQUIRED CONFIG target_link_libraries(ca PUBLIC hsa-runtime64::hsa-runtime64 ) target_compile_options(ca PUBLIC -DDISABLE_REDUCED_GPU_BLOB_COPY -Wall) +add_custom_command(TARGET ca POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/ca ${PROJECT_BINARY_DIR}/bin/ca) install(TARGETS ca RUNTIME DESTINATION bin) #-------------------------------------CA---------------------------------------# diff --git a/projects/hip/packaging/hip-base.txt b/projects/hip/packaging/hip-base.txt index 7ba7d3b93a..9b10ec2c3f 100644 --- a/projects/hip/packaging/hip-base.txt +++ b/projects/hip/packaging/hip-base.txt @@ -21,22 +21,23 @@ set(CPACK_PACKAGE_NAME "hip-base") set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [BASE]") set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.") set(CPACK_PACKAGE_CONTACT "Maneesh Gupta ") -set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@) set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@) set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@) -set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@) -set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}) 
+set(CPACK_PACKAGE_VERSION_PATCH @HIP_PACKAGING_VERSION_PATCH@) +set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@) set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") -set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb) +set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@) +set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT") set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm") set(CPACK_DEBIAN_PACKAGE_DEPENDS "perl (>= 5.0),libfile-which-perl") set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-base") set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_base") set(CPACK_BINARY_RPM "ON") -set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm) +set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@) +set(CPACK_RPM_FILE_NAME "RPM-DEFAULT") set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") diff --git a/projects/hip/packaging/hip-doc.txt b/projects/hip/packaging/hip-doc.txt index 911f2486fd..30f05cb6e6 100644 --- a/projects/hip/packaging/hip-doc.txt +++ b/projects/hip/packaging/hip-doc.txt @@ -24,25 +24,26 @@ set(CPACK_PACKAGE_NAME "hip-doc") set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [DOCUMENTATION]") set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.") set(CPACK_PACKAGE_CONTACT "Maneesh Gupta ") -set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@) set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@) set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@) -set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@) -set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}) +set(CPACK_PACKAGE_VERSION_PATCH @HIP_PACKAGING_VERSION_PATCH@) 
+set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@) set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") -set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb) -set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})") +set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@) +set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE})") set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-doc") set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_doc") set(CPACK_BINARY_RPM "ON") -set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm) +set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@) +set(CPACK_RPM_FILE_NAME "RPM-DEFAULT") set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION}) -set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}") +set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}") set(CPACK_RPM_PACKAGE_OBSOLETES "hip_doc") set(CPACK_RPM_PACKAGE_CONFLICTS "hip_doc") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") diff --git a/projects/hip/packaging/hip-hcc.txt b/projects/hip/packaging/hip-hcc.txt index d084e8d966..a17bd8ca86 100644 --- a/projects/hip/packaging/hip-hcc.txt +++ b/projects/hip/packaging/hip-hcc.txt @@ -28,24 +28,29 @@ endif() set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [HCC]") set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.") set(CPACK_PACKAGE_CONTACT "Maneesh Gupta ") -set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@) set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@) set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@) set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@) -set(CPACK_PACKAGE_FILE_NAME 
${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}) +set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@) set(CPACK_GENERATOR "TGZ;DEB;RPM") + set(CPACK_BINARY_DEB "ON") +set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@) +set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT") set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm") -set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)") set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc") set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_hcc") + set(CPACK_BINARY_RPM "ON") +set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@) +set(CPACK_RPM_FILE_NAME "RPM-DEFAULT") set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION}) -set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, comgr >= 1.1") +set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, comgr >= 1.1") set(CPACK_RPM_PACKAGE_OBSOLETES "hip_hcc") set(CPACK_RPM_PACKAGE_CONFLICTS "hip_hcc") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") diff --git a/projects/hip/packaging/hip-nvcc.txt b/projects/hip/packaging/hip-nvcc.txt index 5d3d91ffb6..f5d43533dc 100644 --- a/projects/hip/packaging/hip-nvcc.txt +++ b/projects/hip/packaging/hip-nvcc.txt @@ -10,28 +10,29 @@ 
set(CPACK_PACKAGE_NAME "hip-nvcc") set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [NVCC]") set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.") set(CPACK_PACKAGE_CONTACT "Maneesh Gupta ") -set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@) set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@) set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@) set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@) -set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}) +set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@) set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") -set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb) +set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@) +set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT") set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm") -set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), cuda (>= 7.5)") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), cuda (>= 7.5)") set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-nvcc") set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_nvcc") set(CPACK_BINARY_RPM "ON") -set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm) +set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@) +set(CPACK_RPM_FILE_NAME "RPM-DEFAULT") set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION}) -set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, cuda >= 7.5") +set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = 
${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, cuda >= 7.5") set(CPACK_RPM_PACKAGE_OBSOLETES "hip_nvcc") set(CPACK_RPM_PACKAGE_CONFLICTS "hip_nvcc") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") diff --git a/projects/hip/packaging/hip-rocclr.txt b/projects/hip/packaging/hip-rocclr.txt index 6f5c16bb96..ee5ec0c3db 100644 --- a/projects/hip/packaging/hip-rocclr.txt +++ b/projects/hip/packaging/hip-rocclr.txt @@ -33,27 +33,28 @@ set(HCC_PACKAGE_NAME "rocclr") set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [ROCClr]") set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.") set(CPACK_PACKAGE_CONTACT "Maneesh Gupta ") -set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@) set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@) set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@) set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@) -set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}) +set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@) set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") -set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb) +set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@) +set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT") set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm") -set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocminfo, hip-base (= ${CPACK_PACKAGE_VERSION}), comgr (>= 1.1), llvm-amdgpu") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocminfo, hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), comgr (>= 1.1), llvm-amdgpu") set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc (= ${CPACK_PACKAGE_VERSION})") set(CPACK_BINARY_RPM "ON") -set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm) +set(CPACK_RPM_PACKAGE_RELEASE 
@CPACK_RPM_PACKAGE_RELEASE@) +set(CPACK_RPM_FILE_NAME "RPM-DEFAULT") set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION}) -set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocminfo, hip-base = ${HIP_BASE_VERSION}, comgr >= 1.1, llvm-amdgpu") +set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocminfo, hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, comgr >= 1.1, llvm-amdgpu") set(CPACK_RPM_PACKAGE_PROVIDES "hip-hcc = ${HIP_BASE_VERSION}") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") diff --git a/projects/hip/packaging/hip-samples.txt b/projects/hip/packaging/hip-samples.txt index 6481cf7bde..34f0dddd2e 100644 --- a/projects/hip/packaging/hip-samples.txt +++ b/projects/hip/packaging/hip-samples.txt @@ -12,25 +12,26 @@ set(CPACK_PACKAGE_NAME "hip-samples") set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [SAMPLES]") set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.") set(CPACK_PACKAGE_CONTACT "Maneesh Gupta ") -set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@) set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@) set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@) set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@) -set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}) +set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@) set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") -set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb) -set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})") 
+set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@) +set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE})") set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-samples") set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_samples") set(CPACK_BINARY_RPM "ON") -set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm) +set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@) +set(CPACK_RPM_FILE_NAME "RPM-DEFAULT") set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION}) -set(CPACK_RPM_PACKAGE_REQUIRES "hip-rocclr = ${HIP_BASE_VERSION}") +set(CPACK_RPM_PACKAGE_REQUIRES "hip-rocclr = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}") set(CPACK_RPM_PACKAGE_OBSOLETES "hip_samples") set(CPACK_RPM_PACKAGE_CONFLICTS "hip_samples") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") diff --git a/projects/hip/rocclr/CMakeLists.txt b/projects/hip/rocclr/CMakeLists.txt index 187edab746..ec1dc50407 100755 --- a/projects/hip/rocclr/CMakeLists.txt +++ b/projects/hip/rocclr/CMakeLists.txt @@ -96,6 +96,14 @@ find_package(amd_comgr REQUIRED CONFIG message(STATUS "Code Object Manager found at ${amd_comgr_DIR}.") +find_package(LLVM REQUIRED CONFIG + PATHS + /opt/rocm/llvm + PATH_SUFFIXES + lib/cmake/llvm) + +message(STATUS "llvm found at ${LLVM_DIR}.") + add_library(hip64 OBJECT hip_context.cpp hip_code_object.cpp @@ -148,10 +156,9 @@ endif() # Short-Term solution for pre-compiled headers for online compilation # Enable pre compiled header -if(${ENABLE_HIP_PCH}) - execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_gen_pch.sh") - execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_embed_pch.sh") - add_definitions(-DENABLE_HIP_PCH) +if(${__HIP_ENABLE_PCH}) + execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_embed_pch.sh 
${LLVM_DIR}") + add_definitions(-D__HIP_ENABLE_PCH) endif() # Enable profiling API @@ -216,7 +223,7 @@ add_library(device INTERFACE) target_link_libraries(device INTERFACE host) # Short-Term solution for pre-compiled headers for online compilation -if(${ENABLE_HIP_PCH}) +if(${__HIP_ENABLE_PCH}) target_link_libraries(amdhip64 PRIVATE ${CMAKE_BINARY_DIR}/hip_pch.o) endif() @@ -227,6 +234,18 @@ endif() # filename. if(${BUILD_SHARED_LIBS}) target_link_libraries(amdhip64 PRIVATE amdrocclr_static Threads::Threads dl hsa-runtime64::hsa-runtime64) + + add_custom_command(TARGET amdhip64 POST_BUILD COMMAND + ${CMAKE_COMMAND} -E create_symlink ${PROJECT_BINARY_DIR}/lib/libamdhip64.so.${HIP_LIB_VERSION_STRING} + ${PROJECT_BINARY_DIR}/lib/libhip_hcc.so.${HIP_LIB_VERSION_MAJOR}) + add_custom_command(TARGET amdhip64 POST_BUILD COMMAND + ${CMAKE_COMMAND} -E create_symlink ${PROJECT_BINARY_DIR}/lib/libhip_hcc.so.${HIP_LIB_VERSION_MAJOR} + ${PROJECT_BINARY_DIR}/lib/libhip_hcc.so) + add_custom_command(TARGET amdhip64 POST_BUILD COMMAND + ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/.hipInfo ${PROJECT_BINARY_DIR}/lib/.hipInfo) + add_custom_command(TARGET amdhip64 POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_directory + ${PROJECT_SOURCE_DIR}/include ${PROJECT_BINARY_DIR}/include) + INSTALL(PROGRAMS $ DESTINATION lib COMPONENT MAIN) else() target_link_libraries(amdhip64 PRIVATE Threads::Threads dl hsa-runtime64::hsa-runtime64 amd_comgr) @@ -244,6 +263,7 @@ else() INSTALL(PROGRAMS $ DESTINATION lib COMPONENT MAIN) endif() + INSTALL(TARGETS amdhip64 host device EXPORT hip-targets DESTINATION ${LIB_INSTALL_DIR}) INSTALL(EXPORT hip-targets DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} NAMESPACE hip::) diff --git a/projects/hip/rocclr/hip_code_object.cpp b/projects/hip/rocclr/hip_code_object.cpp index c6a866c9c4..b0979c5246 100755 --- a/projects/hip/rocclr/hip_code_object.cpp +++ b/projects/hip/rocclr/hip_code_object.cpp @@ -202,19 +202,10 @@ hipError_t DynCO::populateDynGlobalVars() { return 
hipErrorSharedObjectSymbolNotFound; } - if (!dev_program->getUndefinedVarFromCodeObj(&undef_var_names)) { - DevLogPrintfError("Could not get undefined Variables for Module: 0x%x \n", module()); - return hipErrorSharedObjectSymbolNotFound; - } - for (auto& elem : var_names) { vars_.insert(std::make_pair(elem, new Var(elem, Var::DeviceVarKind::DVK_Variable, 0, 0, 0, nullptr))); } - for (auto& elem : undef_var_names) { - vars_.insert(std::make_pair(elem, new Var(elem, Var::DeviceVarKind::DVK_Texture, 0, 0, 0, nullptr))); - } - return hipSuccess; } @@ -377,20 +368,4 @@ hipError_t StatCO::getStatGlobalVar(const void* hostVar, int deviceId, hipDevice *size_ptr = dvar->size(); return hipSuccess; } - -hipError_t StatCO::getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod, - hipDeviceptr_t* dev_ptr, size_t* size_ptr) { - amd::ScopedLock lock(sclock_); - - for (auto& elem : vars_) { - if ((elem.second->name() == hostVar) - && (elem.second->module(deviceId) == hmod)) { - *dev_ptr = elem.second->device_ptr(deviceId); - *size_ptr = elem.second->device_size(deviceId); - return hipSuccess; - } - } - - return hipErrorNotFound; -} }; //namespace: hip diff --git a/projects/hip/rocclr/hip_code_object.hpp b/projects/hip/rocclr/hip_code_object.hpp index f5f179570b..0cc2a7051a 100755 --- a/projects/hip/rocclr/hip_code_object.hpp +++ b/projects/hip/rocclr/hip_code_object.hpp @@ -118,8 +118,6 @@ public: hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId); hipError_t getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr, size_t* size_ptr); - hipError_t getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod, - hipDeviceptr_t* dev_ptr, size_t* size_ptr); private: friend class ::PlatformState; diff --git a/projects/hip/rocclr/hip_device.cpp b/projects/hip/rocclr/hip_device.cpp index 70548d5328..c0dbc89970 100644 --- a/projects/hip/rocclr/hip_device.cpp +++ 
b/projects/hip/rocclr/hip_device.cpp @@ -155,7 +155,7 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device ) ::strncpy(deviceProps.name, info.boardName_, 128); deviceProps.totalGlobalMem = info.globalMemSize_; deviceProps.sharedMemPerBlock = info.localMemSizePerCU_; - deviceProps.regsPerBlock = info.availableSGPRs_; + deviceProps.regsPerBlock = info.availableRegistersPerCU_; deviceProps.warpSize = info.wavefrontWidth_; deviceProps.maxThreadsPerBlock = info.maxWorkGroupSize_; deviceProps.maxThreadsDim[0] = info.maxWorkItemSizes_[0]; diff --git a/projects/hip/rocclr/hip_fatbin.cpp b/projects/hip/rocclr/hip_fatbin.cpp index 95a91063a2..8072c18b36 100755 --- a/projects/hip/rocclr/hip_fatbin.cpp +++ b/projects/hip/rocclr/hip_fatbin.cpp @@ -12,7 +12,7 @@ FatBinaryDeviceInfo::~FatBinaryDeviceInfo() { } FatBinaryInfo::FatBinaryInfo(const char* fname, const void* image) - : fdesc_(-1), fsize_(0), image_(image), uri_(std::string()) { + : fdesc_(amd::Os::FDescInit()), fsize_(0), image_(image), uri_(std::string()) { guarantee(fname || image); if (fname != nullptr) { @@ -41,7 +41,7 @@ FatBinaryInfo::~FatBinaryInfo() { } fname_ = std::string(); - fdesc_ = -1; + fdesc_ = amd::Os::FDescInit(); fsize_ = 0; image_ = nullptr; uri_ = std::string(); @@ -64,6 +64,9 @@ hipError_t FatBinaryInfo::ExtractFatBinary(const std::vector& devi if (!amd::Os::GetFileHandle(fname_.c_str(), &fdesc_, &fsize_)) { return hipErrorFileNotFound; } + if (fsize_ == 0) { + return hipErrorInvalidKernelFile; + } // Extract the code object from file hip_error = CodeObject::ExtractCodeObjectFromFile(fdesc_, fsize_, diff --git a/projects/hip/rocclr/hip_global.cpp b/projects/hip/rocclr/hip_global.cpp index bed2dcd850..46e6efcf52 100755 --- a/projects/hip/rocclr/hip_global.cpp +++ b/projects/hip/rocclr/hip_global.cpp @@ -5,7 +5,9 @@ #include "hip_code_object.hpp" #include "platform/program.hpp" -#ifdef ENABLE_HIP_PCH +#ifdef __HIP_ENABLE_PCH +extern const char __hip_pch[]; +extern 
unsigned __hip_pch_size; void __hipGetPCH(const char** pch, unsigned int *size) { *pch = __hip_pch; *size = __hip_pch_size; diff --git a/projects/hip/rocclr/hip_global.hpp b/projects/hip/rocclr/hip_global.hpp index 3888daf30b..fd57ecfb50 100755 --- a/projects/hip/rocclr/hip_global.hpp +++ b/projects/hip/rocclr/hip_global.hpp @@ -95,11 +95,6 @@ public: hipError_t getStatDeviceVar(DeviceVar** dvar, int deviceId); void resize_dVar(size_t size) { dVar_.resize(size); } - //Accessor for device_ptrs. - std::string name() const { return name_; } - hipModule_t module(int deviceId) const { return nullptr; } - hipDeviceptr_t device_ptr(int deviceId) const { return dVar_[deviceId]->device_ptr(); } - size_t device_size(int deviceId) const { return dVar_[deviceId]->size(); } FatBinaryInfo** moduleInfo() { return modules_; }; private: diff --git a/projects/hip/rocclr/hip_internal.hpp b/projects/hip/rocclr/hip_internal.hpp index a950961ea7..7e0cc8b9a2 100755 --- a/projects/hip/rocclr/hip_internal.hpp +++ b/projects/hip/rocclr/hip_internal.hpp @@ -252,8 +252,6 @@ extern int ihipGetDevice(); extern hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags); extern amd::Memory* getMemoryObject(const void* ptr, size_t& offset); extern amd::Memory* getMemoryObjectWithOffset(const void* ptr, const size_t size); -extern bool CL_CALLBACK getSvarInfo(cl_program program, std::string var_name, void** var_addr, - size_t* var_size); constexpr bool kOptionChangeable = true; constexpr bool kNewDevProg = false; diff --git a/projects/hip/rocclr/hip_memory.cpp b/projects/hip/rocclr/hip_memory.cpp index b0e1d6abdd..8fd9b05cdb 100755 --- a/projects/hip/rocclr/hip_memory.cpp +++ b/projects/hip/rocclr/hip_memory.cpp @@ -124,7 +124,7 @@ hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags) if (*ptr == nullptr) { size_t free = 0, total =0; hipMemGetInfo(&free, &total); - LogPrintfError("Allocation failed : Device memory : required :%u | free :%u | total :%u \n", sizeBytes, 
free, total); + LogPrintfError("Allocation failed : Device memory : required :%zu | free :%zu | total :%zu \n", sizeBytes, free, total); return hipErrorOutOfMemory; } @@ -202,14 +202,14 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin } } else { amd::HostQueue* pQueue = &queue; - if (queueDevice != srcMemory->getContext().devices()[0]) { + if ((srcMemory->getContext().devices()[0] == dstMemory->getContext().devices()[0]) && + (queueDevice != srcMemory->getContext().devices()[0])) { pQueue = hip::getNullStream(srcMemory->getContext()); amd::Command* cmd = queue.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); } } - command = new amd::CopyMemoryCommand(*pQueue, CL_COMMAND_COPY_BUFFER, waitList, *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes); } @@ -1850,18 +1850,27 @@ hipError_t ihipMemset3D(hipPitchedPtr pitchedDevPtr, hipExtent extent, hipStream_t stream, bool isAsync = false) { - if (pitchedDevPtr.pitch == extent.width) { - return ihipMemset(pitchedDevPtr.ptr, value, sizeof(int8_t), extent.width * extent.height * extent.depth, stream, isAsync); - } - - // Workaround for cases when pitch > row untill fill kernel will be updated to support pitch. - // Fallback to filling one row at a time. - - amd::HostQueue* queue = hip::getQueue(stream); - size_t offset = 0; amd::Memory* memory = getMemoryObject(pitchedDevPtr.ptr, offset); + auto sizeBytes = extent.width * extent.height * extent.depth; + + if (memory == nullptr) { + return hipErrorInvalidValue; + } + if (sizeBytes > memory->getSize()) { + return hipErrorInvalidValue; + } + + if (pitchedDevPtr.pitch == extent.width) { + return ihipMemset(pitchedDevPtr.ptr, value, sizeof(int8_t), static_cast(sizeBytes), stream, isAsync); + } + + // Workaround for cases when pitch > row until fill kernel will be updated to support pitch. + // Fall back to filling one row at a time. 
+ + amd::HostQueue* queue = hip::getQueue(stream); + amd::Coord3D origin(offset); amd::Coord3D region(pitchedDevPtr.xsize, pitchedDevPtr.ysize, extent.depth); amd::BufferRect rect; @@ -1870,34 +1879,26 @@ hipError_t ihipMemset3D(hipPitchedPtr pitchedDevPtr, return hipErrorInvalidValue; } - if (memory != nullptr) { - std::vector commands; + std::vector commands; - for (size_t slice = 0; slice < extent.depth; slice++) { - for (size_t row = 0; row < extent.height; row++) { - const size_t rowOffset = rect.offset(0, row, slice); - amd::FillMemoryCommand* command = new amd::FillMemoryCommand(*queue, - CL_COMMAND_FILL_BUFFER, - amd::Command::EventWaitList{}, - *memory->asBuffer(), - &value, - sizeof(int8_t), - amd::Coord3D{rowOffset, 0, 0}, - amd::Coord3D{extent.width, 1, 1}); + for (size_t slice = 0; slice < extent.depth; slice++) { + for (size_t row = 0; row < extent.height; row++) { + const size_t rowOffset = rect.offset(0, row, slice); + amd::FillMemoryCommand *command = new amd::FillMemoryCommand(*queue, + CL_COMMAND_FILL_BUFFER, amd::Command::EventWaitList { }, + *memory->asBuffer(), &value, sizeof(int8_t), amd::Coord3D { rowOffset, + 0, 0 }, amd::Coord3D { extent.width, 1, 1 }); - command->enqueue(); - commands.push_back(command); - } + command->enqueue(); + commands.push_back(command); } + } - for (auto &command: commands) { - if (!isAsync) { - command->awaitCompletion(); - } - command->release(); + for (auto &command : commands) { + if (!isAsync) { + command->awaitCompletion(); } - } else { - return hipErrorInvalidValue; + command->release(); } return hipSuccess; @@ -2038,7 +2039,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void memset(attributes, 0, sizeof(hipPointerAttribute_t)); if (memObj != nullptr) { - attributes->memoryType = (CL_MEM_SVM_FINE_GRAIN_BUFFER & memObj->getMemFlags())? 
hipMemoryTypeHost : hipMemoryTypeDevice; + attributes->memoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & memObj->getMemFlags())? hipMemoryTypeHost : hipMemoryTypeDevice; if (attributes->memoryType == hipMemoryTypeHost) { attributes->hostPointer = static_cast(memObj->getSvmPtr()) + offset; } diff --git a/projects/hip/rocclr/hip_module.cpp b/projects/hip/rocclr/hip_module.cpp index 4a09cc6ed0..b72ee1a5a2 100755 --- a/projects/hip/rocclr/hip_module.cpp +++ b/projects/hip/rocclr/hip_module.cpp @@ -537,7 +537,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL if (result != hipSuccess) { break; } - prevGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z; + prevGridSize += globalWorkSizeX * globalWorkSizeY * globalWorkSizeZ; } // Sync the execution streams on all devices diff --git a/projects/hip/rocclr/hip_peer.cpp b/projects/hip/rocclr/hip_peer.cpp index ded6843957..fe22803c33 100755 --- a/projects/hip/rocclr/hip_peer.cpp +++ b/projects/hip/rocclr/hip_peer.cpp @@ -97,6 +97,10 @@ hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype, uint32_t* hopcount) { HIP_INIT_API(hipExtGetLinkTypeAndHopCount, device1, device2, linktype, hopcount); + if (linktype == nullptr || hopcount == nullptr || + device1 == device2 || device1 < 0 || device2 < 0) { + HIP_RETURN(hipErrorInvalidValue); + } // Fill out the list of LinkAttributes std::vector link_attrs; link_attrs.push_back(std::make_pair(amd::Device::LinkAttribute::kLinkLinkType, 0)); diff --git a/projects/hip/rocclr/hip_platform.cpp b/projects/hip/rocclr/hip_platform.cpp index 6e6f08bf44..6abea0df4e 100755 --- a/projects/hip/rocclr/hip_platform.cpp +++ b/projects/hip/rocclr/hip_platform.cpp @@ -80,27 +80,6 @@ extern "C" hip::FatBinaryInfo** __hipRegisterFatBinary(const void* data) return PlatformState::instance().addFatBinary(fbwrapper->binary); } -bool PlatformState::getShadowVarInfo(std::string var_name, hipModule_t hmod, - void** 
var_addr, size_t* var_size) { - - amd::ScopedLock lock(lock_); - if (hipSuccess == getDynGlobalVar(var_name.c_str(), ihipGetDevice(), hmod, var_addr, var_size)) { - return true; - } - - if (hipSuccess == getStatGlobalVarByName(var_name, ihipGetDevice(), hmod, var_addr, var_size)) { - return true; - } - - return false; -} - -bool CL_CALLBACK getSvarInfo(cl_program program, std::string var_name, void** var_addr, - size_t* var_size) { - return PlatformState::instance().getShadowVarInfo(var_name, reinterpret_cast(program), - var_addr, var_size); -} - extern "C" void __hipRegisterFunction( hip::FatBinaryInfo** modules, const void* hostFunction, @@ -686,11 +665,19 @@ static inline std::uint32_t __convert_float_to_half(float a) noexcept { return s | v; } -extern "C" __attribute__((weak)) float __gnu_h2f_ieee(unsigned short h){ +extern "C" +#if !defined(_MSC_VER) +__attribute__((weak)) +#endif +float __gnu_h2f_ieee(unsigned short h){ return __convert_half_to_float((std::uint32_t) h); } -extern "C" __attribute__((weak)) unsigned short __gnu_f2h_ieee(float f){ +extern "C" +#if !defined(_MSC_VER) +__attribute__((weak)) +#endif +unsigned short __gnu_f2h_ieee(float f){ return (unsigned short)__convert_float_to_half(f); } @@ -765,6 +752,9 @@ hipError_t PlatformState::getDynFunc(hipFunction_t* hfunc, hipModule_t hmod, DevLogPrintfError("Cannot find the module: 0x%x", hmod); return hipErrorNotFound; } + if (0 == strlen(func_name)) { + return hipErrorNotFound; + } return it->second->getDynFunc(hfunc, func_name); } @@ -868,11 +858,6 @@ hipError_t PlatformState::getStatGlobalVar(const void* hostVar, int deviceId, hi return statCO_.getStatGlobalVar(hostVar, deviceId, dev_ptr, size_ptr); } -hipError_t PlatformState::getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod, - hipDeviceptr_t* dev_ptr, size_t* size_ptr) { - return statCO_.getStatGlobalVarByName(hostVar, deviceId, hmod, dev_ptr, size_ptr); -} - void PlatformState::setupArgument(const void *arg, size_t 
size, size_t offset) { auto& arguments = execStack_.top().arguments_; diff --git a/projects/hip/rocclr/hip_platform.hpp b/projects/hip/rocclr/hip_platform.hpp index 2bcf620f6d..51fea0841e 100755 --- a/projects/hip/rocclr/hip_platform.hpp +++ b/projects/hip/rocclr/hip_platform.hpp @@ -77,11 +77,6 @@ public: hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId); hipError_t getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr, size_t* size_ptr); - hipError_t getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod, - hipDeviceptr_t* dev_ptr, size_t* size_ptr); - - bool getShadowVarInfo(std::string var_name, hipModule_t hmod, - void** var_addr, size_t* var_size); //Exec Functions void setupArgument(const void *arg, size_t size, size_t offset); diff --git a/projects/hip/samples/0_Intro/bit_extract/CMakeLists.txt b/projects/hip/samples/0_Intro/bit_extract/CMakeLists.txt new file mode 100644 index 0000000000..c9b13be812 --- /dev/null +++ b/projects/hip/samples/0_Intro/bit_extract/CMakeLists.txt @@ -0,0 +1,20 @@ +project(bit_extract) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) + +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + +# Create the excutable +add_executable(bit_extract bit_extract.cpp) + +# Link with HIP +target_link_libraries(bit_extract hip::host) \ No newline at end of file diff --git a/projects/hip/samples/0_Intro/bit_extract/Makefile b/projects/hip/samples/0_Intro/bit_extract/Makefile index 4a3a0bb4fe..3427815ffc 100644 --- a/projects/hip/samples/0_Intro/bit_extract/Makefile +++ b/projects/hip/samples/0_Intro/bit_extract/Makefile @@ -9,19 +9,15 @@ HIPCC=$(HIP_PATH)/bin/hipcc # Show how to use PLATFORM to specify different options for each compiler: ifeq (${HIP_PLATFORM}, nvcc) 
- HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 + HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 endif EXE=bit_extract -EXE_STATIC=bit_extract_static $(EXE): bit_extract.cpp $(HIPCC) $(HIPCC_FLAGS) $< -o $@ -$(EXE_STATIC): bit_extract.cpp - $(HIPCC) -use-staticlib $(HIPCC_FLAGS) $< -o $@ - -all: $(EXE) $(EXE_STATIC) +all: $(EXE) clean: - rm -f *.o $(EXE) $(EXE_STATIC) + rm -f *.o $(EXE) diff --git a/projects/hip/samples/0_Intro/module_api/CMakeLists.txt b/projects/hip/samples/0_Intro/module_api/CMakeLists.txt new file mode 100644 index 0000000000..0f5cc32f91 --- /dev/null +++ b/projects/hip/samples/0_Intro/module_api/CMakeLists.txt @@ -0,0 +1,36 @@ +project(module_api) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) + +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + +# Create the excutable +add_executable(runKernel.hip.out runKernel.cpp) +add_executable(launchKernelHcc.hip.out launchKernelHcc.cpp) +add_executable(defaultDriver.hip.out defaultDriver.cpp) + +# Generate code object +add_custom_target( + codeobj + ALL + COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../vcpy_kernel.cpp -o vcpy_kernel.code + COMMENT "codeobj generated" +) + +add_dependencies(runKernel.hip.out codeobj) +add_dependencies(launchKernelHcc.hip.out codeobj) +add_dependencies(defaultDriver.hip.out codeobj) + +# Link with HIP +target_link_libraries(runKernel.hip.out hip::host) +target_link_libraries(launchKernelHcc.hip.out hip::host) +target_link_libraries(defaultDriver.hip.out hip::host) diff --git a/projects/hip/samples/0_Intro/module_api_global/CMakeLists.txt b/projects/hip/samples/0_Intro/module_api_global/CMakeLists.txt new file mode 100644 index 0000000000..00caa79cfa --- /dev/null +++ b/projects/hip/samples/0_Intro/module_api_global/CMakeLists.txt @@ -0,0 +1,30 @@ +project(modile_api_global) 
+ +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) + +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + +# Create the executable +add_executable(runKernel.hip.out runKernel.cpp) + +# Generate code object +add_custom_target( + codeobj + ALL + COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../vcpy_kernel.cpp -o vcpy_kernel.code + COMMENT "codeobj generated" +) + +add_dependencies(runKernel.hip.out codeobj) + +# Link with HIP +target_link_libraries(runKernel.hip.out hip::host) \ No newline at end of file diff --git a/projects/hip/samples/0_Intro/square/CMakeLists.txt b/projects/hip/samples/0_Intro/square/CMakeLists.txt new file mode 100644 index 0000000000..845c43fd1f --- /dev/null +++ b/projects/hip/samples/0_Intro/square/CMakeLists.txt @@ -0,0 +1,21 @@ +#Follow "README.md" to generate square.cpp if it's missing + +project(square) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + +# Create the executable +add_executable(square square.cpp) + +# Link with HIP +target_link_libraries(square hip::host) \ No newline at end of file diff --git a/projects/hip/samples/0_Intro/square/Makefile b/projects/hip/samples/0_Intro/square/Makefile index aa046eeaaa..9bb0dd8205 100644 --- a/projects/hip/samples/0_Intro/square/Makefile +++ b/projects/hip/samples/0_Intro/square/Makefile @@ -11,7 +11,7 @@ else SOURCES=square.cpp endif -all: square.out square.out.static +all: square.out # Step square.cpp: square.cu @@ -20,8 +20,5 @@ square.cpp: square.cu square.out: $(SOURCES) $(HIPCC) $(CXXFLAGS) $(SOURCES) -o $@ -square.out.static: $(SOURCES) - $(HIPCC)
-use-staticlib $(CXXFLAGS) $(SOURCES) -o $@ - clean: - rm -f *.o *.out *.out.static square.cpp + rm -f *.o *.out square.cpp diff --git a/projects/hip/samples/0_Intro/square/README.md b/projects/hip/samples/0_Intro/square/README.md index c185903993..0bbb2f7e39 100644 --- a/projects/hip/samples/0_Intro/square/README.md +++ b/projects/hip/samples/0_Intro/square/README.md @@ -1,13 +1,39 @@ # Square.md -Simple test which shows how to use hipify-perl to port CUDA code to HIP. -See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example. +Simple test which shows how to use hipify-perl to port CUDA code to HIP. +See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example. Now it is even simpler and requires no manual modification to the hipified source code - just hipify and compile: -1. Add hip/bin path to the PATH : - export PATH=$PATH:[MYHIP]/bin +- Add hip/bin path to the PATH -2. $ make - Make runs these steps. This can be performed on either CUDA or AMD platform: - hipify-perl square.cu > square.cpp # convert cuda code to hip code - hipcc square.cpp # compile into executable +``` +$ export PATH=$PATH:[MYHIP]/bin +``` + +- Define environment variable + +``` +$ export HIP_PATH=[MYHIP] +``` + +- Build executable file + +``` +$ cd ~/hip/samples/0_Intro/square +$ make +/home/user/hip/bin/hipify-perl square.cu > square.cpp +/home/user/hip/bin/hipcc square.cpp -o square.out +/home/user/hip/bin/hipcc -use-staticlib square.cpp -o square.out.static +``` +- Execute file +``` +$ ./square.out +info: running on device Navi 14 [Radeon Pro W5500] +info: allocate host mem ( 7.63 MB) +info: allocate device mem ( 7.63 MB) +info: copy Host2Device +info: launch 'vector_square' kernel +info: copy Device2Host +info: check result +PASSED!
+``` diff --git a/projects/hip/samples/1_Utils/hipBusBandwidth/CMakeLists.txt b/projects/hip/samples/1_Utils/hipBusBandwidth/CMakeLists.txt new file mode 100644 index 0000000000..df01c31d97 --- /dev/null +++ b/projects/hip/samples/1_Utils/hipBusBandwidth/CMakeLists.txt @@ -0,0 +1,20 @@ +project(hipBusBandwidth) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the excutable +add_executable(hipBusBandwidth hipBusBandwidth.cpp ResultDatabase.cpp) + +# Link with HIP +target_link_libraries(hipBusBandwidth hip::host) \ No newline at end of file diff --git a/projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index 6181c49afe..8032bd0a20 100644 --- a/projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -12,7 +12,7 @@ enum MallocMode { MallocPinned, MallocUnpinned, MallocRegistered }; bool p_verbose = false; MallocMode p_malloc_mode = MallocPinned; int p_numa_ctl = -1; -int p_iterations = 10; +int p_iterations = 0; int p_beatsperiteration = 1; int p_device = 0; int p_detailed = 0; @@ -89,7 +89,9 @@ hipError_t memcopy(void* dst, const void* src, size_t sizeBytes, enum hipMemcpyK int sizes[] = {-64, -256, -512, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288}; int nSizes = sizeof(sizes) / sizeof(int); - +// iterations to be run for the corresponding sizes, less number as the size increases +int iterations[] = {1000, 1000, 1000, 1000, 500, 500, 500, 500, 500, 200, 200, 200, + 200, 200, 100, 100, 100, 100, 50, 50, 50, 20, 20}; // 
**************************************************************************** // Function: RunBenchmark_H2D @@ -174,53 +176,48 @@ void RunBenchmark_H2D(ResultDatabase& resultDB) { hipEventCreate(&stop); CHECK_HIP_ERROR(); - // Three passes, forward and backward both - for (int pass = 0; pass < p_iterations; pass++) { - // store the times temporarily to estimate latency - // float times[nSizes]; - // Step through sizes forward on even passes and backward on odd - for (int i = 0; i < nSizes; i++) { - int sizeIndex; - if ((pass % 2) == 0) - sizeIndex = i; - else - sizeIndex = (nSizes - 1) - i; + // store the times temporarily to estimate latency + // float times[nSizes]; + for (int i = 0; i < nSizes; i++) { + int sizeIndex, iterIndex; + sizeIndex = i; + iterIndex = i; - const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; - const int nbytes = sizeToBytes(thisSize); + const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; + const int nbytes = sizeToBytes(thisSize); + const int niter = p_iterations ? 
p_iterations : iterations[iterIndex]; + for (int pass = 0; pass < niter; pass++) { - hipEventRecord(start, 0); - for (int j = 0; j < p_beatsperiteration; j++) { - memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice); - } - hipEventRecord(stop, 0); - hipEventSynchronize(stop); - float t = 0; - hipEventElapsedTime(&t, start, stop); - // times[sizeIndex] = t; - - // Convert to GB/sec - if (p_verbose) { - std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; - } - - double speed = - (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t; - char sizeStr[256]; - if (p_beatsperiteration > 1) { - sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); - } else { - sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); - } - resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode), - sizeStr, "GB/sec", speed); - resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr, - "ms", t); - - if (p_onesize) { - break; - } + hipEventRecord(start, 0); + for (int j = 0; j < p_beatsperiteration; j++) { + memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice); } + hipEventRecord(stop, 0); + hipEventSynchronize(stop); + float t = 0; + hipEventElapsedTime(&t, start, stop); + // times[sizeIndex] = t; + // Convert to GB/sec + if (p_verbose) { + std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; + } + + double speed = + (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t; + char sizeStr[256]; + if (p_beatsperiteration > 1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } + resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode), + sizeStr, "GB/sec", speed); + resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr, "ms", t); + + } + if 
(p_onesize) { + break; + } } if (p_onesize) { @@ -347,53 +344,50 @@ void RunBenchmark_D2H(ResultDatabase& resultDB) { hipEventCreate(&stop); CHECK_HIP_ERROR(); - // Three passes, forward and backward both - for (int pass = 0; pass < p_iterations; pass++) { - // store the times temporarily to estimate latency - // float times[nSizes]; - // Step through sizes forward on even passes and backward on odd - for (int i = 0; i < nSizes; i++) { - int sizeIndex; - if ((pass % 2) == 0) - sizeIndex = i; - else - sizeIndex = (nSizes - 1) - i; + // store the times temporarily to estimate latency + // float times[nSizes]; + for (int i = 0; i < nSizes; i++) { + int sizeIndex, iterIndex; + sizeIndex = i; + iterIndex = i; - const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; - const int nbytes = sizeToBytes(thisSize); + const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; + const int nbytes = sizeToBytes(thisSize); + const int niter = p_iterations ? p_iterations : iterations[iterIndex]; + for (int pass = 0; pass < niter; pass++) { - hipEventRecord(start, 0); - for (int j = 0; j < p_beatsperiteration; j++) { - memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost); - } - hipEventRecord(stop, 0); - hipEventSynchronize(stop); - float t = 0; - hipEventElapsedTime(&t, start, stop); - // times[sizeIndex] = t; - - // Convert to GB/sec - if (p_verbose) { - std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; - } - - double speed = - (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t; - char sizeStr[256]; - sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); - if (p_beatsperiteration > 1) { - sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); - } else { - sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); - } - resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode), - sizeStr, "GB/sec", speed); - resultDB.AddResult(std::string("D2H_Time") + "_" + 
mallocModeString(p_malloc_mode), - sizeStr, "ms", t); - if (p_onesize) { - break; - } + hipEventRecord(start, 0); + for (int j = 0; j < p_beatsperiteration; j++) { + memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost); } + hipEventRecord(stop, 0); + hipEventSynchronize(stop); + float t = 0; + hipEventElapsedTime(&t, start, stop); + // times[sizeIndex] = t; + // Convert to GB/sec + if (p_verbose) { + std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; + } + + double speed = + (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t; + char sizeStr[256]; + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + if (p_beatsperiteration > 1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } + resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode), + sizeStr, "GB/sec", speed); + resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode), + sizeStr, "ms", t); + + } + if (p_onesize) { + break; + } } if (p_onesize) { @@ -522,43 +516,43 @@ void RunBenchmark_Bidir(ResultDatabase& resultDB) { hipStreamCreate(&stream[0]); hipStreamCreate(&stream[1]); - // Three passes, forward and backward both - for (int pass = 0; pass < p_iterations; pass++) { - // store the times temporarily to estimate latency - // float times[nSizes]; - // Step through sizes forward on even passes and backward on odd - for (int i = 0; i < nSizes; i++) { - int sizeIndex; - if ((pass % 2) == 0) - sizeIndex = i; - else - sizeIndex = (nSizes - 1) - i; + // store the times temporarily to estimate latency + // float times[nSizes]; + for (int i = 0; i < nSizes; i++) { + int sizeIndex, iterIndex; + sizeIndex = i; + iterIndex = i; - const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; - const int nbytes = sizeToBytes(thisSize); + const int thisSize = p_onesize ? 
p_onesize : sizes[sizeIndex]; + const int nbytes = sizeToBytes(thisSize); + const int niter = p_iterations ? p_iterations : iterations[iterIndex]; + for (int pass = 0; pass < niter; pass++) { - hipEventRecord(start, 0); - hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]); - hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]); - hipEventRecord(stop, 0); - hipEventSynchronize(stop); - float t = 0; - hipEventElapsedTime(&t, start, stop); + hipEventRecord(start, 0); + hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]); + hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]); + hipEventRecord(stop, 0); + hipEventSynchronize(stop); + float t = 0; + hipEventElapsedTime(&t, start, stop); - // Convert to GB/sec - if (p_verbose) { - std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; - } - - double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t; - char sizeStr[256]; - sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); - resultDB.AddResult( - std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr, - "GB/sec", speed); - resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode), - sizeStr, "ms", t); + // Convert to GB/sec + if (p_verbose) { + std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; } + + double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t; + char sizeStr[256]; + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + resultDB.AddResult( + std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr, + "GB/sec", speed); + resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode), + sizeStr, "ms", t); + } + if (p_onesize) { + break; + } } // Cleanup @@ -708,66 +702,63 @@ void RunBenchmark_P2P_Unidir(ResultDatabase& resultDB) { hipEventCreate(&stop); 
CHECK_HIP_ERROR(); - // Three passes, forward and backward both - for (int pass = 0; pass < p_iterations; pass++) { - // store the times temporarily to estimate latency - // float times[nSizes]; - // Step through sizes forward on even passes and backward on odd - for (int i = 0; i < nSizes; i++) { - int sizeIndex; - if ((pass % 2) == 0) - sizeIndex = i; - else - sizeIndex = (nSizes - 1) - i; + // store the times temporarily to estimate latency + // float times[nSizes]; + for (int i = 0; i < nSizes; i++) { + int sizeIndex, iterIndex; + sizeIndex = i; + iterIndex = i; - const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; - const int nbytes = sizeToBytes(thisSize); + const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; + const int nbytes = sizeToBytes(thisSize); + const int niter = p_iterations ? p_iterations : iterations[iterIndex]; + for (int pass = 0; pass < niter; pass++) { - hipDeviceSynchronize(); + hipDeviceSynchronize(); - hipEventRecord(start, 0); + hipEventRecord(start, 0); - for (int j = 0; j < p_beatsperiteration; j++) { - hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice); - } + for (int j = 0; j < p_beatsperiteration; j++) { + hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice); + } - hipEventRecord(stop, 0); + hipEventRecord(stop, 0); - hipEventSynchronize(stop); + hipEventSynchronize(stop); - float t = 0; - hipEventElapsedTime(&t, start, stop); - // times[sizeIndex] = t; + float t = 0; + hipEventElapsedTime(&t, start, stop); + // times[sizeIndex] = t; - // Convert to GB/sec - if (p_verbose) { - std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; - } + // Convert to GB/sec + if (p_verbose) { + std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; + } - double speed = - (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t; - char sizeStr[256]; - if (p_beatsperiteration > 1) { - sprintf(sizeStr, "%9sx%d", 
sizeToString(thisSize).c_str(), - p_beatsperiteration); - } else { - sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); - } + double speed = + (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t; + char sizeStr[256]; + if (p_beatsperiteration > 1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), + p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } - string cGpu, pGpu; - cGpu = gpuIDToString(currentGpu); - pGpu = gpuIDToString(peerGpu); + string cGpu, pGpu; + cGpu = gpuIDToString(currentGpu); + pGpu = gpuIDToString(peerGpu); - resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) + - "_gpu" + std::string(pGpu), + resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) + + "_gpu" + std::string(pGpu), sizeStr, "GB/sec", speed); - resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) + - "_gpu" + std::string(pGpu), + resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) + + "_gpu" + std::string(pGpu), sizeStr, "ms", t); - if (p_onesize) { - break; - } + } + if (p_onesize) { + break; } } @@ -829,71 +820,68 @@ void RunBenchmark_P2P_Bidir(ResultDatabase& resultDB) { hipStreamCreate(&stream[0]); hipStreamCreate(&stream[1]); - // Three passes, forward and backward both - for (int pass = 0; pass < p_iterations; pass++) { - // store the times temporarily to estimate latency - // float times[nSizes]; - // Step through sizes forward on even passes and backward on odd - for (int i = 0; i < nSizes; i++) { - int sizeIndex; - if ((pass % 2) == 0) - sizeIndex = i; - else - sizeIndex = (nSizes - 1) - i; + // store the times temporarily to estimate latency + // float times[nSizes]; + for (int i = 0; i < nSizes; i++) { + int sizeIndex, iterIndex; + sizeIndex = i; + iterIndex = i; - const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; - const int nbytes = sizeToBytes(thisSize); + const int thisSize = p_onesize ? 
p_onesize : sizes[sizeIndex]; + const int nbytes = sizeToBytes(thisSize); + const int niter = p_iterations ? p_iterations : iterations[iterIndex]; + for (int pass = 0; pass < niter; pass++) { - hipDeviceSynchronize(); + hipDeviceSynchronize(); - hipEventRecord(start, 0); + hipEventRecord(start, 0); - for (int j = 0; j < p_beatsperiteration; j++) { - hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes, - hipMemcpyDeviceToDevice, stream[0]); - hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes, - hipMemcpyDeviceToDevice, stream[1]); - } - - hipEventRecord(stop, 0); - - hipEventSynchronize(stop); - - float t = 0; - hipEventElapsedTime(&t, start, stop); - // times[sizeIndex] = t; - - // Convert to GB/sec - if (p_verbose) { - std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; - } - - double speed = - (double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) / - t; - char sizeStr[256]; - if (p_beatsperiteration > 1) { - sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), - p_beatsperiteration); - } else { - sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); - } - - string cGpu, pGpu; - cGpu = gpuIDToString(currentGpu); - pGpu = gpuIDToString(peerGpu); - - resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" + - std::string(pGpu), - sizeStr, "GB/sec", speed); - resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" + - std::string(pGpu), - sizeStr, "ms", t); - - if (p_onesize) { - break; - } + for (int j = 0; j < p_beatsperiteration; j++) { + hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes, + hipMemcpyDeviceToDevice, stream[0]); + hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes, + hipMemcpyDeviceToDevice, stream[1]); } + + hipEventRecord(stop, 0); + + hipEventSynchronize(stop); + + float t = 0; + hipEventElapsedTime(&t, start, stop); + // times[sizeIndex] = t; + + // Convert to GB/sec + if (p_verbose) { + std::cerr << "size " << 
sizeToString(thisSize) << " took " << t << " ms\n"; + } + + double speed = + (double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) / + t; + char sizeStr[256]; + if (p_beatsperiteration > 1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), + p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } + + string cGpu, pGpu; + cGpu = gpuIDToString(currentGpu); + pGpu = gpuIDToString(peerGpu); + + resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" + + std::string(pGpu), + sizeStr, "GB/sec", speed); + resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" + + std::string(pGpu), + sizeStr, "ms", t); + + } + if (p_onesize) { + break; + } } if (p_onesize) { diff --git a/projects/hip/samples/1_Utils/hipCommander/CMakeLists.txt b/projects/hip/samples/1_Utils/hipCommander/CMakeLists.txt new file mode 100644 index 0000000000..2592020c66 --- /dev/null +++ b/projects/hip/samples/1_Utils/hipCommander/CMakeLists.txt @@ -0,0 +1,31 @@ +project(hipCommander) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the excutable +add_executable(hipCommander hipCommander.cpp) + +# Generate code object +add_custom_target( + codeobj + ALL + COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../nullkernel.hip.cpp -o nullkernel.hsaco + COMMENT "codeobj generated" +) + +add_dependencies(hipCommander codeobj) + +# Link with HIP +target_link_libraries(hipCommander hip::host) +set_property(TARGET hipCommander PROPERTY CXX_STANDARD 11) diff --git a/projects/hip/samples/1_Utils/hipDispatchLatency/CMakeLists.txt b/projects/hip/samples/1_Utils/hipDispatchLatency/CMakeLists.txt new file mode 100644 index 
0000000000..b267f91905 --- /dev/null +++ b/projects/hip/samples/1_Utils/hipDispatchLatency/CMakeLists.txt @@ -0,0 +1,35 @@ +project(hipDispatchLatency) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the executable +add_executable(hipDispatchLatency hipDispatchLatency.cpp) +add_executable(hipDispatchEnqueueRateMT hipDispatchEnqueueRateMT.cpp) + +# Generate code object +add_custom_target( + codeobj + ALL + COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../test_kernel.cpp -o test_kernel.code + COMMENT "codeobj generated" +) + +add_dependencies(hipDispatchLatency codeobj) +add_dependencies(hipDispatchEnqueueRateMT codeobj) + +# Link with HIP +target_link_libraries(hipDispatchLatency hip::host) +target_link_libraries(hipDispatchEnqueueRateMT hip::host) +set_property(TARGET hipDispatchLatency PROPERTY CXX_STANDARD 11) +set_property(TARGET hipDispatchEnqueueRateMT PROPERTY CXX_STANDARD 11) diff --git a/projects/hip/samples/1_Utils/hipInfo/CMakeLists.txt b/projects/hip/samples/1_Utils/hipInfo/CMakeLists.txt new file mode 100644 index 0000000000..f3678d3160 --- /dev/null +++ b/projects/hip/samples/1_Utils/hipInfo/CMakeLists.txt @@ -0,0 +1,20 @@ +project(hipInfo) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the executable +add_executable(hipInfo hipInfo.cpp) + +# Link with HIP +target_link_libraries(hipInfo hip::host) diff --git a/projects/hip/samples/2_Cookbook/0_MatrixTranspose/CMakeLists.txt
b/projects/hip/samples/2_Cookbook/0_MatrixTranspose/CMakeLists.txt new file mode 100644 index 0000000000..de5bb0b5ea --- /dev/null +++ b/projects/hip/samples/2_Cookbook/0_MatrixTranspose/CMakeLists.txt @@ -0,0 +1,20 @@ +project(MatrixTranspose) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the executable +add_executable(MatrixTranspose MatrixTranspose.cpp) + +# Link with HIP +target_link_libraries(MatrixTranspose hip::host) diff --git a/projects/hip/samples/2_Cookbook/10_inline_asm/CMakeLists.txt b/projects/hip/samples/2_Cookbook/10_inline_asm/CMakeLists.txt new file mode 100644 index 0000000000..7adb51f5de --- /dev/null +++ b/projects/hip/samples/2_Cookbook/10_inline_asm/CMakeLists.txt @@ -0,0 +1,20 @@ +project(inline_asm) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the executable +add_executable(inline_asm inline_asm.cpp) + +# Link with HIP +target_link_libraries(inline_asm hip::host) diff --git a/projects/hip/samples/2_Cookbook/11_texture_driver/CMakeLists.txt b/projects/hip/samples/2_Cookbook/11_texture_driver/CMakeLists.txt new file mode 100644 index 0000000000..8ff242c993 --- /dev/null +++ b/projects/hip/samples/2_Cookbook/11_texture_driver/CMakeLists.txt @@ -0,0 +1,30 @@ +project(texture2dDrv) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and
linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the executable +add_executable(texture2dDrv texture2dDrv.cpp) + +# Generate code object +add_custom_target( + codeobj + ALL + COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../tex2dKernel.cpp -o tex2dKernel.code + COMMENT "codeobj generated" +) + +add_dependencies(texture2dDrv codeobj) + +# Link with HIP +target_link_libraries(texture2dDrv hip::host) diff --git a/projects/hip/samples/2_Cookbook/13_occupancy/CMakeLists.txt b/projects/hip/samples/2_Cookbook/13_occupancy/CMakeLists.txt new file mode 100644 index 0000000000..6cad76a395 --- /dev/null +++ b/projects/hip/samples/2_Cookbook/13_occupancy/CMakeLists.txt @@ -0,0 +1,20 @@ +project(occupancy) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the executable +add_executable(occupancy occupancy.cpp) + +# Link with HIP +target_link_libraries(occupancy hip::host) diff --git a/projects/hip/samples/2_Cookbook/1_hipEvent/CMakeLists.txt b/projects/hip/samples/2_Cookbook/1_hipEvent/CMakeLists.txt new file mode 100644 index 0000000000..6f6ee4e050 --- /dev/null +++ b/projects/hip/samples/2_Cookbook/1_hipEvent/CMakeLists.txt @@ -0,0 +1,20 @@ +project(hipEvent) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the executable +add_executable(hipEvent hipEvent.cpp) + +# Link with HIP +target_link_libraries(hipEvent hip::host)
diff --git a/projects/hip/samples/2_Cookbook/3_shared_memory/CMakeLists.txt b/projects/hip/samples/2_Cookbook/3_shared_memory/CMakeLists.txt new file mode 100644 index 0000000000..6401488628 --- /dev/null +++ b/projects/hip/samples/2_Cookbook/3_shared_memory/CMakeLists.txt @@ -0,0 +1,20 @@ +project(sharedMemory) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the executable +add_executable(sharedMemory sharedMemory.cpp) + +# Link with HIP +target_link_libraries(sharedMemory hip::host) diff --git a/projects/hip/samples/2_Cookbook/4_shfl/CMakeLists.txt b/projects/hip/samples/2_Cookbook/4_shfl/CMakeLists.txt new file mode 100644 index 0000000000..9d142eeb02 --- /dev/null +++ b/projects/hip/samples/2_Cookbook/4_shfl/CMakeLists.txt @@ -0,0 +1,20 @@ +project(shfl) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the executable +add_executable(shfl shfl.cpp) + +# Link with HIP +target_link_libraries(shfl hip::host) diff --git a/projects/hip/samples/2_Cookbook/5_2dshfl/CMakeLists.txt b/projects/hip/samples/2_Cookbook/5_2dshfl/CMakeLists.txt new file mode 100644 index 0000000000..adc0e3595d --- /dev/null +++ b/projects/hip/samples/2_Cookbook/5_2dshfl/CMakeLists.txt @@ -0,0 +1,19 @@ +project(2dshfl) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + +# Create the executable +add_executable(2dshfl 2dshfl.cpp) + +# Link with HIP +target_link_libraries(2dshfl hip::host) diff --git a/projects/hip/samples/2_Cookbook/6_dynamic_shared/CMakeLists.txt b/projects/hip/samples/2_Cookbook/6_dynamic_shared/CMakeLists.txt new file mode 100644 index 0000000000..f177952d5a --- /dev/null +++ b/projects/hip/samples/2_Cookbook/6_dynamic_shared/CMakeLists.txt @@ -0,0 +1,19 @@ +project(dynamic_shared) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + +# Create the executable +add_executable(dynamic_shared dynamic_shared.cpp) + +# Link with HIP +target_link_libraries(dynamic_shared hip::host) diff --git a/projects/hip/samples/2_Cookbook/7_streams/CMakeLists.txt b/projects/hip/samples/2_Cookbook/7_streams/CMakeLists.txt new file mode 100644 index 0000000000..fac4187b47 --- /dev/null +++ b/projects/hip/samples/2_Cookbook/7_streams/CMakeLists.txt @@ -0,0 +1,19 @@ +project(stream) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + +# Create the executable +add_executable(stream stream.cpp) + +# Link with HIP +target_link_libraries(stream hip::host) diff --git a/projects/hip/samples/2_Cookbook/8_peer2peer/CMakeLists.txt b/projects/hip/samples/2_Cookbook/8_peer2peer/CMakeLists.txt new file mode 100644 index 0000000000..7c38373911 --- /dev/null +++ b/projects/hip/samples/2_Cookbook/8_peer2peer/CMakeLists.txt @@ -0,0 +1,19 @@ +project(peer2peer) +
+cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + +# Create the executable +add_executable(peer2peer peer2peer.cpp) + +# Link with HIP +target_link_libraries(peer2peer hip::host) diff --git a/projects/hip/samples/2_Cookbook/9_unroll/CMakeLists.txt b/projects/hip/samples/2_Cookbook/9_unroll/CMakeLists.txt new file mode 100644 index 0000000000..fc1b740e33 --- /dev/null +++ b/projects/hip/samples/2_Cookbook/9_unroll/CMakeLists.txt @@ -0,0 +1,19 @@ +project(unroll) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + +# Create the executable +add_executable(unroll unroll.cpp) + +# Link with HIP +target_link_libraries(unroll hip::host) diff --git a/projects/hip/samples/README.md b/projects/hip/samples/README.md new file mode 100644 index 0000000000..739045382e --- /dev/null +++ b/projects/hip/samples/README.md @@ -0,0 +1,27 @@ +Build procedure + +We provide a Makefile and a CMakeLists.txt to build the samples separately. + +1. The Makefile supports the shared lib of the hip-rocclr runtime and nvcc. + +To build a sample, run the following in the sample folder: + +make + + + +2. The CMakeLists.txt supports both shared and static libs of the hip-rocclr runtime. + +To build a sample, run the following in the sample folder: + +mkdir build (if build folder is missing) + +cd build + +cmake .. + +make + +If you want a debug build, use: + +cmake -DCMAKE_BUILD_TYPE=Debug ..
\ No newline at end of file diff --git a/projects/hip/tests/hit/HIT.cmake b/projects/hip/tests/hit/HIT.cmake old mode 100644 new mode 100755 index 1677d93a20..839b90befb --- a/projects/hip/tests/hit/HIT.cmake +++ b/projects/hip/tests/hit/HIT.cmake @@ -303,6 +303,7 @@ macro(MAKE_TEST _config exe) add_test(NAME ${testname} CONFIGURATIONS ${_config} COMMAND ${PROJECT_BINARY_DIR}/${exe} ${ARGN}) endif() set_tests_properties(${testname} PROPERTIES PASS_REGULAR_EXPRESSION "PASSED" ENVIRONMENT HIP_PATH=${HIP_ROOT_DIR}) + set_tests_properties(${testname} PROPERTIES SKIP_RETURN_CODE 127 ENVIRONMENT HIP_PATH=${HIP_ROOT_DIR}) endmacro() macro(MAKE_NAMED_TEST _config exe testname) diff --git a/projects/hip/tests/performance/compute/hipPerfMandelbrot.cpp b/projects/hip/tests/performance/compute/hipPerfMandelbrot.cpp new file mode 100644 index 0000000000..c4234d8c37 --- /dev/null +++ b/projects/hip/tests/performance/compute/hipPerfMandelbrot.cpp @@ -0,0 +1,747 @@ +/* + Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved. + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + */ + +/* HIT_START + * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * TEST: %t + * HIT_END + */ + +#include +#include +#include "test_common.h" +#include +#include +#include +#include +#include + +typedef struct { + double x; + double y; + double width; +} coordRec; + +coordRec coords[] = { + {0.0, 0.0, 4.0}, // Whole set + {0.0, 0.0, 0.00001}, // All black + {-0.0180789661868, 0.6424294066162, 0.00003824140}, // Hit detail +}; + +static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); + +template +__global__ void float_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep, + uint maxIter) { + +#pragma FP_CONTRACT ON + int tid = (blockIdx.x * blockDim.x + threadIdx.x); + int i = tid % width; + int j = tid / width; + float x0 = (float)(xPos + xStep*i); + float y0 = (float)(yPos + yStep*j); + + float x = x0; + float y = y0; + + uint iter = 0; + float tmp; + for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { + tmp = x; + x = fma(-y,y,fma(x,x,x0)); + y = fma(2.0f*tmp,y,y0); + } + + out[tid] = iter; +}; + +template +__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos, + T yPos, T xStep, T yStep, uint maxIter) { + +#pragma FP_CONTRACT ON + int tid = (blockIdx.x * blockDim.x + threadIdx.x); + int i = tid % width; + int j = tid / width; + float x0 = (float)(xPos + xStep*(float)i); + float y0 = (float)(yPos + yStep*(float)j); + + float x = x0; + float y = y0; + +#define FAST + uint iter = 0; + float tmp; + int stay; + int ccount = 0; + stay = (x*x+y*y) <= 4.0; + float savx = x; + float savy = y; +#ifdef FAST + for (iter = 0; (iter < maxIter); iter+=16) { +#else + for (iter = 0; stay && (iter < maxIter); iter+=16) { +#endif + x = 
savx; + y = savy; + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + stay = (x*x+y*y) <= 4.0; + savx = (stay ? x : savx); + savy = (stay ? y : savy); + ccount += stay*16; +#ifdef FAST + if (!stay) + break; +#endif + } + // Handle remainder + if (!stay) { + iter = 16; + do { + x = savx; + y = savy; + stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter); + tmp = x; + x = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*tmp,y,y0); + ccount += stay; + iter--; + savx = (stay ? x : savx); + savy = (stay ? 
y : savy); + } while (stay && iter); + } + + + out[tid] = (uint)ccount; + +}; + + +template +__global__ void double_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep, + uint maxIter) { + +#pragma FP_CONTRACT ON + int tid = (blockIdx.x * blockDim.x + threadIdx.x); + int i = tid % width; + int j = tid / width; + double x0 = (double)(xPos + xStep*i); + double y0 = (double)(yPos + yStep*j); + + double x = x0; + double y = y0; + + uint iter = 0; + double tmp; + for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { + tmp = x; + x = fma(-y,y,fma(x,x,x0)); + y = fma(2.0f*tmp,y,y0); + } + out[tid] = iter; +}; + + +template +__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos, + T yPos, T xStep, T yStep, uint maxIter) { + +#pragma FP_CONTRACT ON + int tid = (blockIdx.x * blockDim.x + threadIdx.x); + + int i = tid % width; + int j = tid / width; + double x0 = (double)(xPos + xStep*(double)i); + double y0 = (double)(yPos + yStep*(double)j); + + double x = x0; + double y = y0; + +#define FAST + uint iter = 0; + double tmp; + int stay; + int ccount = 0; + stay = (x*x+y*y) <= 4.0; + double savx = x; + double savy = y; +#ifdef FAST + for (iter = 0; (iter < maxIter); iter+=16) +#else + for (iter = 0; stay && (iter < maxIter); iter+=16) +#endif + { + x = savx; + y = savy; + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = 
fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + stay = (x*x+y*y) <= 4.0; + savx = (stay ? x : savx); + savy = (stay ? y : savy); + ccount += stay*16; +#ifdef FAST + if (!stay) + break; +#endif + } + // Handle remainder + if (!stay) { + iter = 16; + do { + x = savx; + y = savy; + stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter); + tmp = x; + x = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*tmp,y,y0); + ccount += stay; + iter--; + savx = (stay ? x : savx); + savy = (stay ? y : savy); + } + while (stay && iter); + + } + out[tid] = (uint)ccount; +}; + +static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15; + +// Expected results for each kernel run at each coord +unsigned long long expectedIters[] = { + 203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull, + 120254651ull, 203277748ull, 2147483648ull, 120254651ull, 203315114ull, + 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull, + 203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull, + 120485704ull, 203280620ull, 2147483648ull, 120485704ull, 203315114ull, + 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull}; + +class hipPerfMandelBrot { + public: + hipPerfMandelBrot(); + ~hipPerfMandelBrot(); + + void setNumKernels(unsigned int num) { + numKernels = num; + } + + unsigned int getNumKernels() { + return numKernels; + } + + void setNumStreams(unsigned int num) { + numStreams = num; + } + unsigned int getNumStreams() { + return numStreams; + } + + void open(int deviceID); + void run(unsigned int testCase, unsigned int deviceId); + void printResults(void); + + // array 
of funtion pointers + typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos, float yPos, + float xStep, float yStep, uint maxIter, hipStream_t* streams, int blocks, + int threads_per_block, int kernelCnt); + + // Wrappers + void float_mad(uint *out, uint width, float xPos, float yPos, + float xStep, float yStep, uint maxIter, hipStream_t* streams, + int blocks, int threads_per_block, int kernelCnt); + + void float_mandel_unroll(uint *out, uint width, float xPos, float yPos, + float xStep, float yStep, uint maxIter, hipStream_t* streams, + int blocks, int threads_per_block, int kernelCnt); + + void double_mad(uint *out, uint width, float xPos, float yPos, float xStep, + float yStep, uint maxIter, hipStream_t* streams, int blocks, + int threads_per_block, int kernelCnt); + + void double_mandel_unroll(uint *out, uint width, float xPos, float yPos, float xStep, + float yStep, uint maxIter, hipStream_t* streams, int blocks, + int threads_per_block, int kernelCnt); + + hipStream_t streams[2]; + + private: + void setData(void *ptr, unsigned int value); + void checkData(uint *ptr); + + unsigned int numKernels; + unsigned int numStreams; + + std::map> results; + unsigned int width_; + unsigned int bufSize; + unsigned int maxIter; + unsigned int coordIdx; + volatile unsigned long long totalIters = 0; + int numCUs; + static const unsigned int numLoops = 10; +}; + + +hipPerfMandelBrot::hipPerfMandelBrot() {} + +hipPerfMandelBrot::~hipPerfMandelBrot() {} + +void hipPerfMandelBrot::open(int deviceId) { + + + int nGpu = 0; + HIPCHECK(hipGetDeviceCount(&nGpu)); + if (nGpu < 1) { + std::cout << "info: didn't find any GPU! 
skipping the test!\n"; + passed(); + return; + } + + + HIPCHECK(hipSetDevice(deviceId)); + hipDeviceProp_t props = {0}; + HIPCHECK(hipGetDeviceProperties(&props, deviceId)); + std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name + << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId + << std::endl; + + numCUs = props.multiProcessorCount; +} + + +void hipPerfMandelBrot::printResults() { + + int numkernels = getNumKernels(); + int numStreams = getNumStreams(); + + std::cout << "\n" <<"Measured perf for kernels in GFLOPS on " + << numStreams << " streams (s)" << std::endl; + + std::map>:: iterator itr; + for (itr = results.begin(); itr != results.end(); itr++) { + std::cout << "\n" << std::setw(20) << itr->first << " "; + for(auto i : results[itr->first]) { + std::cout << std::setw(10) << i << " "; + } + } + results.clear(); + + std::cout << std::endl; +} + + +// Wrappers for the kernel launches +void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos, float yPos, float xStep, + float yStep, uint maxIter, hipStream_t* streams, + int blocks, int threads_per_block, int kernelCnt) { + + int streamCnt = getNumStreams(); + hipLaunchKernelGGL(float_mad_kernel, dim3(blocks), dim3(threads_per_block), 0, + streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, + maxIter); + + +} + + +void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos, float yPos, + float xStep, float yStep, uint maxIter, hipStream_t * streams, + int blocks, int threads_per_block, int kernelCnt) { + + int streamCnt = getNumStreams(); + hipLaunchKernelGGL(float_mandel_unroll_kernel, dim3(blocks), dim3(threads_per_block), 0, + streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter); + +} + + +void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos, float yPos, + float xStep, float yStep, uint maxIter, hipStream_t * streams, + int blocks, int threads_per_block, 
int kernelCnt) { + + int streamCnt = getNumStreams(); + hipLaunchKernelGGL(double_mad_kernel, dim3(blocks), dim3(threads_per_block), 0, + streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter); + +} + + +void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos, float yPos, + float xStep, float yStep, uint maxIter, hipStream_t * streams, + int blocks, int threads_per_block, int kernelCnt) { + + int streamCnt = getNumStreams(); + hipLaunchKernelGGL(float_mandel_unroll_kernel, dim3(blocks), dim3(threads_per_block), 0, + streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter); + +} + + +void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) { + + unsigned int numStreams = getNumStreams(); + + funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll, + &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll}; + + // Maximum iteration count + maxIter = 32768; + + uint * hPtr[numKernels]; + uint * dPtr[numKernels]; + + // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once. 
+ width_ = 256; + + bufSize = width_ * width_ * sizeof(uint); + + // Create streams for concurrency + for (uint i = 0; i < numStreams; i++) { + HIPCHECK(hipStreamCreate(&streams[i])); + } + + + // Allocate memory on the host and device + for (uint i = 0; i < numKernels; i++) { + HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault)); + setData(hPtr[i], 0xdeadbeef); + HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize)) + } + + + // Prepare kernel launch parameters + int threads = (bufSize/sizeof(uint)); + int threads_per_block = 64; + int blocks = (threads/threads_per_block) + (threads % threads_per_block); + + float xStep = (float)(coords[coordIdx].width / (double)width_); + float yStep = (float)(-coords[coordIdx].width / (double)width_); + float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); + float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); + + // Copy memory asynchronously and concurrently from host to device + for (uint i = 0; i < numKernels; i++) { + HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice)); + } + + // Synchronize to make sure all the copies are completed + HIPCHECK(hipStreamSynchronize(0)); + + int kernelIdx; + if(testCase == 0 || testCase == 5 || testCase == 10) { + kernelIdx = 0; + } + + else if(testCase == 1 || testCase == 6 || testCase == 11) { + kernelIdx = 1; + } + else if(testCase == 2 || testCase == 7 || testCase == 12) { + kernelIdx = 2; + } + else if(testCase == 3 || testCase == 8 || testCase == 13){ + kernelIdx = 3; + } + + + double totalTime = 0.0; + + for (unsigned int k = 0; k < numLoops; k++) { + + coordIdx = testCase % numCoords; + + if ((testCase == 0 || testCase == 1 || testCase == 2 || + testCase == 5 || testCase == 6 || testCase == 7 || + testCase == 10 || testCase == 11 || testCase == 12)) { + float xStep = (float)(coords[coordIdx].width / (double)width_); + float yStep = (float)(-coords[coordIdx].width / (double)width_); + float xPos = 
(float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); + float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); + + // Time the kernel execution + auto all_start = std::chrono::steady_clock::now(); + + for (uint i = 0; i < numKernels; i++) { + (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks, + threads_per_block, i); + } + + + // Synchronize all the concurrent streams to have completed execution + HIPCHECK(hipStreamSynchronize(0)); + + auto all_end = std::chrono::steady_clock::now(); + std::chrono::duration all_kernel_time = all_end - all_start; + totalTime += all_kernel_time.count(); + + } + + + else { + double xStep = coords[coordIdx].width / (double)width_; + double yStep = -coords[coordIdx].width / (double)width_; + double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width; + double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width; + + // Time the kernel execution + auto all_start = std::chrono::steady_clock::now(); + + for (uint i = 0; i < numKernels; i++) { + (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks, + threads_per_block, i); + } + + + // Synchronize all the concurrent streams to have completed execution + HIPCHECK(hipStreamSynchronize(0)); + + auto all_end = std::chrono::steady_clock::now(); + std::chrono::duration all_kernel_time = all_end - all_start; + totalTime += all_kernel_time.count(); + } + + + } + + // Copy data back from device to the host + for(uint i = 0; i < numKernels; i++) { + HIPCHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost)); + } + + + for(uint i = 0; i < numKernels; i++) { + checkData(hPtr[i]); + + int j =0; + while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) { + j++; + } + + if(j==30) { + std::cout << "Incorrect iteration count detected. "; + } + + } + + + // Compute GFLOPS. 
There are 7 FLOPs per iteration + double perf = ((double)(totalIters*numKernels) * 7 * (double)(1e-09)) / + (totalTime / (double)numLoops); + + + std::vector kernelName = {"float", "float_unroll", + "double", "double_unroll"}; + + // Print results except for Warm-up kernel + if(testCase!=100) { + results[kernelName[testCase % 4]].push_back(perf); + } + + + for(uint i = 0 ; i < numStreams; i++) { + HIPCHECK(hipStreamDestroy(streams[i])); + } + + + // Free host and device memory + for (uint i = 0; i < numKernels; i++) { + HIPCHECK(hipFree(hPtr[i])); + HIPCHECK(hipFree(dPtr[i])); + } + + +} + + +void hipPerfMandelBrot::setData(void *ptr, unsigned int value) { + unsigned int *ptr2 = (unsigned int *)ptr; + for (unsigned int i = 0; i < width_ * width_; i++) { + ptr2[i] = value; + } +} + + +void hipPerfMandelBrot::checkData(uint *ptr) { + totalIters = 0; + for (unsigned int i = 0; i < width_ * width_; i++) { + totalIters += ptr[i]; + } +} + + +int main(int argc, char* argv[]) { + hipPerfMandelBrot mandelbrotCompute; + int deviceId = 0; + + mandelbrotCompute.open(deviceId); + + for (unsigned int testCase = 0; testCase < 3; testCase++) { + + + switch (testCase) { + + + case 0: { + // Warmup-kernel - default stream executes serially + mandelbrotCompute.setNumStreams(1); + mandelbrotCompute.setNumKernels(1); + mandelbrotCompute.run(100/*Random number*/, deviceId); + break; + } + + + case 1: { + // run all - sync + int i = 0; + do { + mandelbrotCompute.setNumStreams(1); + mandelbrotCompute.setNumKernels(1); + mandelbrotCompute.run(i, deviceId); + i++; + }while(i < 12); + mandelbrotCompute.printResults(); + + break; + } + + + case 2: { + // run all - async + int i = 0; + do { + mandelbrotCompute.setNumStreams(2); + mandelbrotCompute.setNumKernels(2); + mandelbrotCompute.run(i, deviceId); + i++; + }while(i < 12); + mandelbrotCompute.printResults(); + + break; + + } + + + default: { + break; + } + + + } + + + + } + + + passed(); +} diff --git 
a/projects/hip/tests/performance/stream/hipPerfDeviceConcurrency.cpp b/projects/hip/tests/performance/stream/hipPerfDeviceConcurrency.cpp new file mode 100644 index 0000000000..7d6699a9a2 --- /dev/null +++ b/projects/hip/tests/performance/stream/hipPerfDeviceConcurrency.cpp @@ -0,0 +1,289 @@ +/* + Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved. + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+ */ + +/* HIT_START + * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * TEST: %t + * HIT_END + */ + +#include +#include +#include "test_common.h" + +typedef struct { + double x; + double y; + double width; +} coordRec; + +static coordRec coords[] = { + {0.0, 0.0, 0.00001}, // All black +}; + +static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); + +__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos, float xStep, + float yStep, uint maxIter) { + + int tid = (blockIdx.x * blockDim.x + threadIdx.x); + int i = tid % width; + int j = tid / width; + float x0 = (float)(xPos + xStep*i); + float y0 = (float)(yPos + yStep*j); + + float x = x0; + float y = y0; + + uint iter = 0; + float tmp; + for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { + tmp = x; + x = fma(-y,y,fma(x,x,x0)); + y = fma(2.0f*tmp,y,y0); + } + + out[tid] = iter; +}; + +class hipPerfDeviceConcurrency { + public: + hipPerfDeviceConcurrency(); + ~hipPerfDeviceConcurrency(); + + void setNumGpus(unsigned int num) { + numDevices = num; + } + unsigned int getNumGpus() { + return numDevices; + } + + void open(void); + void close(void); + void run(unsigned int testCase, int numGpus); + + private: + void setData(void *ptr, unsigned int value); + void checkData(uint *ptr); + + unsigned int numDevices; + unsigned int width_; + unsigned int bufSize; + unsigned int coordIdx; + unsigned long long totalIters = 0; +}; + + +hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() {} + +hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() {} + +void hipPerfDeviceConcurrency::open(void) { + + + int nGpu = 0; + HIPCHECK(hipGetDeviceCount(&nGpu)); + setNumGpus(nGpu); + if (nGpu < 1) { + std::cout << "info: didn't find any GPU! 
skipping the test!\n"; + passed(); + } + + +} + + +void hipPerfDeviceConcurrency::close() { +} + +void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { + + + static int deviceId; + uint * hPtr[numGpus]; + uint * dPtr[numGpus]; + hipStream_t streams[numGpus]; + int numCUs[numGpus]; + unsigned int maxIter[numGpus]; + unsigned long long expectedIters[numGpus]; + + int threads, threads_per_block, blocks; + float xStep, yStep, xPos, yPos; + + for(int i = 0; i < numGpus; i++) { + + if(testCase != 0) { + deviceId = i; + } + + HIPCHECK(hipSetDevice(deviceId)); + + hipDeviceProp_t props = {0}; + HIPCHECK(hipGetDeviceProperties(&props, i)); + + if (testCase != 0) { + std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name + << " with " << props.multiProcessorCount << " CUs" << " and device ID: " + << i << std::endl; + } + + numCUs[i] = props.multiProcessorCount; + int clkFrequency = 0; + HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i)); + + clkFrequency =(unsigned int)clkFrequency/1000; + + // Maximum iteration count + // maxIter = 8388608 * (engine_clock / 1000).serial execution + maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128); + maxIter[i] = (maxIter[i] + 15) & ~15; + + // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once. 
+ width_ = 256; + + bufSize = width_ * width_ * sizeof(uint); + + // Create streams for concurrency + HIPCHECK(hipStreamCreate(&streams[i])); + + // Allocate memory on the host and device + HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault)); + setData(hPtr[i], 0xdeadbeef); + HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize)) + + // Prepare kernel launch parameters + threads = (bufSize/sizeof(uint)); + threads_per_block = 64; + blocks = (threads/threads_per_block) + (threads % threads_per_block); + + coordIdx = testCase % numCoords; + xStep = (float)(coords[coordIdx].width / (double)width_); + yStep = (float)(-coords[coordIdx].width / (double)width_); + xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); + yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); + + // Copy memory from host to device + HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice)); + + } + + // Time the kernel execution + auto all_start = std::chrono::steady_clock::now(); + + for(int i = 0; i < numGpus; i++) { + + if(testCase != 0) { + deviceId = i; + } + + HIPCHECK(hipSetDevice(deviceId)); + + hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i], + dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]); + + } + + for(int i = 0; i < numGpus; i++) { + HIPCHECK(hipStreamSynchronize(0)); + } + + + auto all_end = std::chrono::steady_clock::now(); + std::chrono::duration all_kernel_time = all_end - all_start; + + for(int i = 0; i < numGpus; i++) { + + if(testCase != 0) { + deviceId = i; + } + HIPCHECK(hipSetDevice(deviceId)); + + // Copy data back from device to the host + HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost)); + + checkData(hPtr[i]); + expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i]; + + if (testCase != 0) { + checkData(hPtr[i]); + if(totalIters != expectedIters[i]) { + std::cout << "Incorrect iteration count detected" << std::endl; + } + } + + + 
HIPCHECK(hipStreamDestroy(streams[i])); + + // Free host and device memory + HIPCHECK(hipFree(hPtr[i])); + HIPCHECK(hipFree(dPtr[i])); + } + + if (testCase != 0) { + std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): " + << all_kernel_time.count() << " (s) " << '\n' << std::endl; + } + + if(testCase == 0) { + deviceId++; + } + + +} + + +void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) { + unsigned int *ptr2 = (unsigned int *)ptr; + for (unsigned int i = 0; i < width_ * width_ ; i++) { + ptr2[i] = value; + } +} + + +void hipPerfDeviceConcurrency::checkData(uint *ptr) { + totalIters = 0; + for (unsigned int i = 0; i < width_ * width_; i++) { + totalIters += ptr[i]; + } +} + + +int main(int argc, char* argv[]) { + hipPerfDeviceConcurrency deviceConcurrency; + + deviceConcurrency.open(); + + int nGpu = deviceConcurrency.getNumGpus(); + + // testCase = 0 refers to warmup kernel run + int testCase = 0; + + for (int i = 0; i < nGpu; i++) { + // Warm-up kernel on all devices + deviceConcurrency.run(testCase, 1); + } + + // Time for kernel on 1 device + deviceConcurrency.run(++testCase, 1); + + // Time for kernel on all available devices + deviceConcurrency.run(++testCase, nGpu); + + passed(); +} diff --git a/projects/hip/tests/src/kernel/hipShflTests.cpp b/projects/hip/tests/src/kernel/hipShflTests.cpp index 9b1cc73248..06b6a90b83 100644 --- a/projects/hip/tests/src/kernel/hipShflTests.cpp +++ b/projects/hip/tests/src/kernel/hipShflTests.cpp @@ -57,6 +57,15 @@ void matrixTransposeCPUReference(T* output, T* input, const unsigned int width) } } +void getFactor(int& fact) { fact = 101; } +void getFactor(unsigned int& fact) { fact = static_cast(INT32_MAX)+1; } +void getFactor(float& fact) { fact = 2.5; } +void getFactor(double& fact) { fact = 2.5; } +void getFactor(long& fact) { fact = 202; } +void getFactor(unsigned long& fact) { fact = static_cast(__LONG_MAX__)+1; } +void getFactor(long long& fact) { fact = 303; } 
+void getFactor(unsigned long long& fact) { fact = static_cast(__LONG_LONG_MAX__)+1; } + template void runTest() { T* Matrix; @@ -77,8 +86,10 @@ void runTest() { cpuTransposeMatrix = (T*)malloc(NUM * sizeof(T)); // initialize the input data + T factor; + getFactor(factor); for (i = 0; i < NUM; i++) { - Matrix[i] = (T)i * 10l; + Matrix[i] = (T)i + factor; } // allocate the memory on the device side @@ -124,7 +135,11 @@ void runTest() { int main() { runTest(); runTest(); + runTest(); runTest(); runTest(); + runTest(); + runTest(); + runTest(); passed(); } diff --git a/projects/hip/tests/src/kernel/hipShflUpDownTest.cpp b/projects/hip/tests/src/kernel/hipShflUpDownTest.cpp index 553087ce45..cd3900aee5 100644 --- a/projects/hip/tests/src/kernel/hipShflUpDownTest.cpp +++ b/projects/hip/tests/src/kernel/hipShflUpDownTest.cpp @@ -47,13 +47,31 @@ __global__ void shflUpSum(T* a, int size) { a[threadIdx.x] = val; } +template +__global__ void shflXorSum(T* a, int size) { + T val = a[threadIdx.x]; + for (int i = size/2; i > 0; i /= 2) + val += __shfl_xor(val, i, size); + a[threadIdx.x] = val; +} + +void getFactor(int& fact) { fact = 101; } +void getFactor(unsigned int& fact) { fact = static_cast(INT32_MAX)+1; } +void getFactor(float& fact) { fact = 2.5; } +void getFactor(double& fact) { fact = 2.5; } +void getFactor(long& fact) { fact = 202; } +void getFactor(unsigned long& fact) { fact = static_cast(__LONG_MAX__)+1; } +void getFactor(long long& fact) { fact = 303; } +void getFactor(unsigned long long& fact) { fact = static_cast(__LONG_LONG_MAX__)+1; } + template void runTestShflUp() { const int size = 32; T a[size]; T cpuSum = 0; + T factor; getFactor(factor); for (int i = 0; i < size; i++) { - a[i] = i; + a[i] = i + factor; cpuSum += a[i]; } T* d_a; @@ -73,8 +91,9 @@ void runTestShflDown() { const int size = 32; T a[size]; T cpuSum = 0; + T factor; getFactor(factor); for (int i = 0; i < size; i++) { - a[i] = i; + a[i] = i + factor; cpuSum += a[i]; } T* d_a; @@ -84,19 +103,58 
@@ void runTestShflDown() { hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault); if (a[0] != cpuSum) { hipFree(d_a); - failed("Shfl Up Sum did not match."); + failed("Shfl Down Sum did not match."); + } + hipFree(d_a); +} + +template +void runTestShflXor() { + const int size = 32; + T a[size]; + T cpuSum = 0; + T factor; getFactor(factor); + for (int i = 0; i < size; i++) { + a[i] = i + factor; + cpuSum += a[i]; + } + T* d_a; + hipMalloc(&d_a, sizeof(T) * size); + hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault); + hipLaunchKernelGGL(shflXorSum, 1, size, 0, 0, d_a, size); + hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault); + if (a[0] != cpuSum) { + hipFree(d_a); + failed("Shfl Xor Sum did not match."); } hipFree(d_a); } int main() { runTestShflUp(); runTestShflUp(); + runTestShflUp(); runTestShflUp(); runTestShflUp(); + runTestShflUp(); + runTestShflUp(); + runTestShflUp(); runTestShflDown(); runTestShflDown(); + runTestShflDown(); runTestShflDown(); runTestShflDown(); + runTestShflDown(); + runTestShflDown(); + runTestShflDown(); + + runTestShflXor(); + runTestShflXor(); + runTestShflXor(); + runTestShflXor(); + runTestShflXor(); + runTestShflXor(); + runTestShflXor(); + runTestShflXor(); passed(); } diff --git a/projects/hip/tests/src/p2p/hipPeerToPeer_simple.cpp b/projects/hip/tests/src/p2p/hipPeerToPeer_simple.cpp old mode 100644 new mode 100755 index 9f0982f353..13779694e2 --- a/projects/hip/tests/src/p2p/hipPeerToPeer_simple.cpp +++ b/projects/hip/tests/src/p2p/hipPeerToPeer_simple.cpp @@ -395,6 +395,9 @@ int main(int argc, char* argv[]) { if (gpuCount < 2) { printf("P2P application requires atleast 2 gpu devices\n"); + if (hip_skip_tests_enabled()) { + return hip_skip_retcode(); + } } else { if (p_tests & 0x100) { testPeerHostToDevice(false /*useAsyncCopy*/); diff --git a/projects/hip/tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp b/projects/hip/tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp new file mode 100644 index 
0000000000..f073d7f72e --- /dev/null +++ b/projects/hip/tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp @@ -0,0 +1,280 @@ +/* + Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ +// Test Description: +/*The general idea of the application is to test how Cooperative Groups kernel +launches work when launching too many warps to the target device. This test +first queries the nominal warp size of the target device. It then walks through +block sizes from 1 thread, 1 warp, 2 warps, ... `maximum_warps_in_a_block`. For +each of these, it queries the maximum number of blocks that can fit in each SM. +It then queries the number of SMs on the target device. This will yield a +calculation for the maximum number of blocks that can be co-scheduled on this +device. + +The Cooperative Groups API says that users should not launch more than this +many warps (or blocks, etc.) to the target device. 
This test first tries to +launch 2x as many blocks, to confirm that the runtime prevents such a launch +by returning a proper error value (`hipErrorCooperativeLaunchTooLarge`). + +It then ensures that trying to launch too large of a kernel invocation does +not break the GPU by launching a kernel with exactly the maximum number of +blocks. + +Finally, we run the same test for a block size that is larger than the maximum +allowed by the device, to ensure that this case is properly detected by the +runtime and that nothing breaks.*/ + + + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * TEST: %t + * HIT_END + */ + + +#include +#include +#include "test_common.h" + + +static inline void hipCheckAndFail(hipError_t errval, + const char *file, int line) { + hipError_t last_err = hipGetLastError(); + if (errval != hipSuccess) { + std::cerr << "hip error: " << hipGetErrorString(errval); + std::cerr << std::endl; + std::cerr << " Location: " << file << ":" << line << std::endl; + failed(""); + } + if (last_err != errval) { + std::cerr << "Error: the return value of a function was not the same "; + std::cerr << "as the value returned by hipGetLastError()" << std::endl; + std::cerr << " Location: " << file << ":" << line << std::endl; + std::cerr << " Function returned: " << hipGetErrorString(errval); + std::cerr << " (" << errval << ")" << std::endl; + std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err); + std::cerr << " (" << last_err << ")" << std::endl; + failed(""); + } +} +#define hipCheckErr(errval) \ + do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0) + +static inline bool hipCheckExpected(hipError_t errval, + hipError_t expected_err, const char *file, int line) { + hipError_t last_err = hipGetLastError(); + if (errval != expected_err) { + std::cerr << "hip error: " << hipGetErrorString(errval); + std::cerr << std::endl; + std::cerr << " Location: " << file << ":" << line << std::endl; + return false; + } + if (last_err != 
errval) { + std::cerr << "Error: the return value of a function was not the same "; + std::cerr << "as the value returned by hipGetLastError()" << std::endl; + std::cerr << " Location: " << file << ":" << line << std::endl; + std::cerr << " Function returned: " << hipGetErrorString(errval); + std::cerr << " (" << errval << ")" << std::endl; + std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err); + std::cerr << " (" << last_err << ")" << std::endl; + return false; + } + return true; +} + +static bool cooperative_groups_support(int device_id) { + hipError_t err; + int cooperative_attribute; + HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute, + hipDeviceAttributeCooperativeLaunch, device_id)); + if (!cooperative_attribute) { + std::cerr << "Cooperative launch support not available in "; + std::cerr << "the device attribute for device " << device_id; + std::cerr << std::endl; + return false; + } + hipDeviceProp_t device_properties; + HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); + if (device_properties.cooperativeLaunch == 0) { + std::cerr << "Cooperative group support not available in "; + std::cerr << "device properties." << std::endl; + return false; + } + return true; +} + +__global__ void test_kernel(long long *array) { + unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; + array[rank] += clock64(); +} + +int main(int argc, char** argv) { + hipError_t err; + int device_num, FailFlag = 0; + // Alocate the host input buffer, and two device-focused buffers that we + // will use for our test. 
+ unsigned int *dev_array[2]; + HIPCHECK(hipGetDeviceCount(&device_num)); + for (int dev = 0; dev < device_num; ++dev) { + /*************************************************************************/ + /* Test whether target device supports cooperative groups ****************/ + HIPCHECK(hipSetDevice(dev)); + if (!cooperative_groups_support(dev)) { + std::cout << "Skipping the test with Pass result.\n"; + passed(); + } + + /*************************************************************************/ + /* Create the streams we will use in this test. **************************/ + hipStream_t streams[2]; + for (int i = 0; i < 2; i++) { + HIPCHECK(hipStreamCreate(&streams[i])); + } + + /*************************************************************************/ + /* We will try to launch more waves than the GPU can fit. ***************/ + hipDeviceProp_t device_properties; + HIPCHECK(hipGetDeviceProperties(&device_properties, dev)); + int warp_size = device_properties.warpSize; + int num_sms = device_properties.multiProcessorCount; + int max_num_threads = device_properties.maxThreadsPerBlock; + + // Check single-thread block, all numbers of warps, then too-large block + for (int block_size = 0; block_size <= (max_num_threads + warp_size); + block_size += warp_size) { + if (block_size == 0) { + block_size = 1; + } + int max_blocks_per_sm; + // Calculate the device occupancy to know how many blocks can be run. + HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + &max_blocks_per_sm, test_kernel, block_size, 0, + hipOccupancyDefault)); + + if ((block_size > max_num_threads) && (max_blocks_per_sm != 0)) { + std::cerr << "ERROR! Occupancy API indicated that we can have >0 "; + std::cerr << "blocks in a kernel when the block size is too large "; + std::cerr << "to work on the device." << std::endl; + std::cerr << "This is incorrect, and could possibly lead users "; + std::cerr << "to try to launch kernels that will fail." 
<< std::endl; + //failed(""); + FailFlag = 1; + break; + } + + int desired_blocks = max_blocks_per_sm * num_sms; + bool expect_fail = false; + if (desired_blocks == 0) { + desired_blocks = 1; + expect_fail = true; + } + + /**********************************************************************/ + /* Set up data to pass into the kernel ********************************/ + + for (int i = 0; i < 2; i++) { + int test_size; + // Case where we expect to fail at launch. + if (i == 0) { + test_size = 2 * desired_blocks; + } else { + test_size = desired_blocks; + } + HIPCHECK(hipMalloc(reinterpret_cast(&dev_array[i]), + test_size * block_size * sizeof(long long))); + HIPCHECK(hipMemsetAsync(dev_array[i], 0, + test_size * block_size * sizeof(long long), + streams[i])); + } + + HIPCHECK(hipDeviceSynchronize()); + + /***********************************************************************/ + /* Launch the kernels **************************************************/ + void *coop_params[2][1]; + for (int i = 0; i < 2; i++) { + coop_params[i][0] = reinterpret_cast(&dev_array[i]); + } + + err = hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), + 2 * desired_blocks, block_size, + coop_params[0], 0, streams[0]); + + hipError_t expect_to_see; + if (expect_fail) { + expect_to_see = hipErrorInvalidConfiguration; + } else { + expect_to_see = hipErrorCooperativeLaunchTooLarge; + } + if (!hipCheckExpected(err, expect_to_see, __FILE__, __LINE__)) { + std::cerr << "ERROR! Tried to launch a cooperative kernel with "; + std::cerr << "too many warps." << std::endl; + std::cerr << "This SHOULD have failed with the error "; + std::cerr << hipGetErrorString(expect_to_see); + std::cerr << " (" << expect_to_see << ")." 
<< std::endl; + std::cerr << "Instead, the launch returned " << hipGetErrorName(err); + std::cerr << " (" << err << ")" << std::endl; + FailFlag = 1; + break; + } + + HIPCHECK(hipDeviceSynchronize()); + err = hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), + desired_blocks, block_size, + coop_params[1], 0, streams[1]); + + if (expect_fail) { + expect_to_see = hipErrorInvalidConfiguration; + } else { + expect_to_see = hipSuccess; + } + if (!hipCheckExpected(err, expect_to_see, __FILE__, __LINE__)) { + std::cerr << "ERROR! Tried to launch a cooperative kernel "; + std::cerr << "with a normal size, but a block size of "; + std::cerr << desired_blocks << std::endl; + std::cerr << "This SHOULD have returned "; + std::cerr << hipGetErrorString(expect_to_see); + std::cerr << " (" << expect_to_see << ")." << std::endl; + std::cerr << "Instead, the launch returned " << hipGetErrorName(err); + std::cerr << " (" << err << ")" << std::endl; + FailFlag = 1; + break; + } + + HIPCHECK(hipDeviceSynchronize()); + + if (block_size == 1) { + block_size = 0; + } + for (int m = 0; m < 2; ++m) { + HIPCHECK(hipFree(dev_array[m])); + } + } + for (int m = 0; m < 2; ++m) { + HIPCHECK(hipStreamDestroy(streams[m])); + } + if (FailFlag == 1) { + for (int m = 0; m < 2; ++m) { + HIPCHECK(hipFree(dev_array[m])); + } + failed(""); + } + } + passed(); +} diff --git a/projects/hip/tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp b/projects/hip/tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp new file mode 100644 index 0000000000..c9adc03b24 --- /dev/null +++ b/projects/hip/tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp @@ -0,0 +1,283 @@ +/* +Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +// Test Description: +/* +The general idea of the application is to test how Cooperative Groups kernel +launches to a stream interact with other kernels being launched to different +streams. + +For example: the HIP runtime will force cooperative kernel launches to run +serially, even if they are launched to different streams. However, +cooperative kernel launches can run in parallel with regular kernels that +are launched to other streams. This limitation is so that the cooperative +kernels do not conflict with one another for resources and potentially +deadlock the system. + +As such, this benchmark tests three situations: + + 1. Launching a cooperative kernel by itself to stream[0] + 2. Launching two cooperative kernels in parallel to stream[0] and stream[1] + 3. 
Launching two cooperative kernels in parallel to stream[0] and stream[1] + and launching a third non-cooperative kernel to stream[2] + +We time how long it takes to run each of these benchmarks and print it as +the output of the benchmark. The kernels themselves are just useless time- +wasting code so that the kernel takes a meaningful amount of time on the +GPU before it exits. We only launch a single wavefront for each kernel, so +any serialization should not be because of GPU occupancy concerns. + +If test #2 takes roughly twice as long as #1, that implies that cooperative +kernels are properly serialized with each other by the runtime. + +If test #3 takes the same amount of time as test #2, that implies that +regular kernels can properly run in parallel with cooperative kernels. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 + * TEST: %t + * HIT_END + */ + +#include +#include +#include +#include "test_common.h" + +static inline void hipCheckAndFail(hipError_t errval, + const char *file, int line) { + hipError_t last_err = hipGetLastError(); + if (errval != hipSuccess) { + std::cerr << "hip error: " << hipGetErrorString(errval); + std::cerr << std::endl; + std::cerr << "Location: " << file << ":" << line << std::endl; + failed(""); + } + if (last_err != errval) { + std::cerr << "Error: the return value of a function was not the same "; + std::cerr << "as the value returned by hipGetLastError()" << std::endl; + std::cerr << "Location: " << file << ":" << line << std::endl; + std::cerr << "Function returned: " << hipGetErrorString(errval); + std::cerr << " (" << errval << ")" << std::endl; + std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err); + std::cerr << " (" << last_err << ")" << std::endl; + failed(""); + } +} +#define hipCheckErr(errval) \ + do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0) + +static int cooperative_groups_support(int device_id) { + hipError_t err; + int 
cooperative_attribute; + HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute, + hipDeviceAttributeCooperativeLaunch, device_id)); + if (!cooperative_attribute) { + std::cerr << "Cooperative launch support not available in "; + std::cerr << "the device attribute for device " << device_id; + std::cerr << std::endl; + return 0; + } + + hipDeviceProp_t device_properties; + HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); + if (device_properties.cooperativeLaunch == 0) { + std::cerr << "Cooperative group support not available in "; + std::cerr << "device properties." << std::endl; + return 0; + } + return 1; +} + +__global__ void test_kernel(uint32_t loops, unsigned long long *array) { + unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; + + for (int i = 0; i < loops; i++) { + long long start_clock = clock64(); + while (clock64() < (start_clock+1000000)) {} + array[rank] += clock64(); + } +} + +int main(int argc, char** argv) { + hipError_t err; + /*************************************************************************/ + int device_num = 0, loops = 1000, FailFlag = 0; + /* Create the streams we will use in this test. **************************/ + hipStream_t streams[3]; + // Alocate the host input buffer, and two device-focused buffers that we + // will use for our test. 
+ unsigned long long *dev_array[3]; + HIPCHECK(hipGetDeviceCount(&device_num)); + for (int dev = 0; dev < device_num; ++dev) { + /*************************************************************************/ + /* Test whether target device supports cooperative groups ****************/ + HIPCHECK(hipSetDevice(dev)); + if (!cooperative_groups_support(dev)) { + std::cout << "Skipping the test with Pass result.\n"; + passed(); + } + + /*************************************************************************/ + /* We will launch enough waves to fill up all of the GPU *****************/ + hipDeviceProp_t device_properties; + HIPCHECK(hipGetDeviceProperties(&device_properties, dev)); + int warp_size = device_properties.warpSize; + int num_sms = device_properties.multiProcessorCount; + int desired_blocks = 1; + std::cout << "Device: " << dev << std::endl; + std::cout << "Device name: " << device_properties.name << std::endl; + + int max_blocks_per_sm; + // Calculate the device occupancy to know how many blocks can be run. 
+ HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, + test_kernel, + warp_size, 0)); + + if (desired_blocks > max_blocks_per_sm * num_sms) { + std::cerr << "The requested number of blocks will not fit on the GPU"; + std::cerr << std::endl; + std::cerr << "You requested " << desired_blocks << " but we can only "; + std::cerr << "fit " << (max_blocks_per_sm * num_sms) << std::endl; + failed(""); + } + + /*************************************************************************/ + for (int i = 0; i < 3; i++) { + HIPCHECK(hipStreamCreate(&streams[i])); + } + + /*************************************************************************/ + /* Set up data to pass into the kernel ***********************************/ + + for (int i = 0; i < 3; i++) { + HIPCHECK(hipMalloc(reinterpret_cast(&dev_array[i]), + warp_size * sizeof(long long))); + HIPCHECK(hipMemsetAsync(dev_array[i], 0, warp_size * sizeof(long long), + streams[i])); + } + + HIPCHECK(hipDeviceSynchronize()); + + /*************************************************************************/ + /* Launch the kernels ****************************************************/ + void *coop_params[3][2]; + for (int i = 0; i < 3; i++) { + coop_params[i][0] = reinterpret_cast(&loops); + coop_params[i][1] = reinterpret_cast(&dev_array[i]); + } + + std::cout << "Launching a single cooperative kernel..." 
<< std::endl; + auto single_start = std::chrono::system_clock::now(); + HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), + desired_blocks, warp_size, + coop_params[0], 0, streams[0])); + + HIPCHECK(hipDeviceSynchronize()); + auto single_end = std::chrono::system_clock::now(); + std::cout << "Launching 2 cooperative kernels to different streams..."; + std::cout << std::endl; + + auto double_start = std::chrono::system_clock::now(); + HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), + desired_blocks, warp_size, + coop_params[0], 0, streams[0])); + HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), + desired_blocks, warp_size, + coop_params[1], 0, streams[1])); + + HIPCHECK(hipDeviceSynchronize()); + auto double_end = std::chrono::system_clock::now(); + std::cout << "Launching 2 cooperative kernels and 1 normal kernel..."; + std::cout << std::endl; + + auto triple_start = std::chrono::system_clock::now(); + HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), + desired_blocks, warp_size, + coop_params[0], 0, streams[0])); + HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), + desired_blocks, warp_size, + coop_params[1], 0, streams[1])); + hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), + 0, streams[2], loops, dev_array[2]); + err = hipGetLastError(); + hipCheckErr(err); + + HIPCHECK(hipDeviceSynchronize()); + auto triple_end = std::chrono::system_clock::now(); + std::chrono::duration single_kernel_time = + (single_end - single_start); + std::chrono::duration double_kernel_time = + (double_end - double_start); + std::chrono::duration triple_kernel_time = + (triple_end - triple_start); + + std::cout << "A single kernel took:" << std::endl; + std::cout << " " << single_kernel_time.count(); + std::cout << " seconds" << std::endl; + std::cout << std::endl; + std::cout << "Two cooperative kernels that could run together took:"; + std::cout << std::endl; + std::cout << " 
" << double_kernel_time.count(); + std::cout << " seconds" << std::endl; + std::cout << std::endl; + std::cout << "Two coop kernels and a third regular kernel took:"; + std::cout << std::endl << " "; + std::cout << triple_kernel_time.count(); + std::cout << " seconds" << std::endl; + + std::cout << "Testing whether these times make sense.." << std::endl; + // Test that two cooperative kernels is roughly twice as long as one + if (double_kernel_time < 1.8 * single_kernel_time) { + std::cerr << "ERROR!" << std::endl; + std::cerr << "Two cooperative kernels launched at the same "; + std::cerr << "time did not take roughly twice as long as a single "; + std::cerr << "cooperative kernel." << std::endl; + std::cerr << "Were they truly serialized?" << std::endl; + FailFlag = 1; + break; + } + + // Test that the three kernels together took roughly as long as two + // cooperative kernels. + if (triple_kernel_time > 1.1 * double_kernel_time) { + std::cerr << "ERROR!" << std::endl; + std::cerr << "Launching a normal kernel in parallel with two "; + std::cerr << "back-to-back cooperative kernels still ended up taking "; + std::cerr << "more than 10% longer than the two cooperative kernels "; + std::cerr << "alone." << std::endl; + std::cerr << "Is the normal kernel being serialized with the "; + std::cerr << "cooperative kernels on different streams?" 
<< std::endl; + FailFlag = 1; + break; + } + for (int k = 0; k < 3; ++k) { + HIPCHECK(hipFree(dev_array[k])); + HIPCHECK(hipStreamDestroy(streams[k])); + } + } + if (FailFlag == 1) { + for (int k = 0; k < 3; ++k) { + HIPCHECK(hipFree(dev_array[k])); + HIPCHECK(hipStreamDestroy(streams[k])); + } + failed(""); + } + passed(); +} diff --git a/projects/hip/tests/src/runtimeApi/cooperativeGrps/grid_group_data_sharing.cpp b/projects/hip/tests/src/runtimeApi/cooperativeGrps/grid_group_data_sharing.cpp new file mode 100644 index 0000000000..46ad7ea7a4 --- /dev/null +++ b/projects/hip/tests/src/runtimeApi/cooperativeGrps/grid_group_data_sharing.cpp @@ -0,0 +1,303 @@ +/* +Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +// Test Description: +/*The general idea of the application is to create a buffer of width N. 
N is a +command line parameter, and the user will need to make sure that we can fit +two buffers of N unsigned integers onto the target GPU at the same time. + +We then launch a fixed number of warps to the GPU. This number is calculated +to fill the GPU with as many warps as can simultaneously run on the GPU. +The threads in these warps then walk over two arrays. First, values from +A[offset] are added into B[offset]. After all of A is added into all of B +in this element-wise manner, all of the waves barrier with one another. + +After the barrier, the waves start adding values from B[mirror_offset] into +A[offset]. Mirror offset means that the wave that is writing into A[7] is +reading from B[7 before the last value]. This was probably written by a +different thread before the barrier. + +After going through this loop a certain number of times, the kernel ends and +we read the arrays back out and recalculate this algorithm serially on the +CPU. We compare the serial version to the version that has inter-thread data +sharing and barriers and ensure they result in the same answer. 
+ +If they do have the same answer, then we can pretty confidently say that +writing from thread X and then hitting a barrier allows thread Y to see the +values.*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * TEST: %t + * HIT_END + */ +#include +#include +#include "test_common.h" + +static inline void hipCheckAndFail(hipError_t errval, + const char *file, int line) { + hipError_t last_err = hipGetLastError(); + if (errval != hipSuccess) { + std::cerr << "hip error: " << hipGetErrorString(errval); + std::cerr << std::endl; + std::cerr << " Location: " << file << ":" << line << std::endl; + exit(errval); + } + if (last_err != errval) { + std::cerr << "Error: the return value of a function was not the same "; + std::cerr << "as the value returned by hipGetLastError()" << std::endl; + std::cerr << " Location: " << file << ":" << line << std::endl; + std::cerr << " Function returned: " << hipGetErrorString(errval); + std::cerr << " (" << errval << ")" << std::endl; + std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err); + std::cerr << " (" << last_err << ")" << std::endl; + failed(""); + } +} +#define hipCheckErr(errval)\ + do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0) + +static int cooperative_groups_support(int device_id) { + hipError_t err; + + int cooperative_attribute; + HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute, + hipDeviceAttributeCooperativeLaunch, device_id)); + if (!cooperative_attribute) { + std::cerr << "Cooperative launch support not available in "; + std::cerr << "the device attribute for device " << device_id; + std::cerr << std::endl; + return 0; + } + + hipDeviceProp_t device_properties; + HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); + if (device_properties.cooperativeLaunch == 0) { + std::cerr << "Cooperative group support not available in "; + std::cerr << "device properties." 
<< std::endl; + return 0; + } + return 1; +} + +// Replays coop_kernel's loops on the host, updating host_input in place, and +// checks both GPU arrays against it. Returns 0 on a match, -1 on a mismatch. +static int verify_coop_arrays(unsigned int loops, unsigned int *host_input, + unsigned int *first_array, + unsigned int *second_array, + unsigned int array_len) { + unsigned int *host_first_array = host_input; + unsigned int *host_second_array = (unsigned int*)calloc(array_len, + sizeof(int)); + for (int i = 0; i < loops; i++) { + for (int offset = 0; offset < array_len; offset++) { + host_second_array[offset] += host_first_array[offset]; + } + for (int offset = 0; offset < array_len; offset++) { + unsigned int swizzle_offset = array_len - offset - 1; + host_first_array[offset] += host_second_array[swizzle_offset]; + } + } + for (int i = 0; i < array_len; i++) { + if (host_first_array[i] != first_array[i]) { + std::cerr << "Test failure!" << std::endl; + std::cerr << " host_first_array[" << i << "] contains the "; + std::cerr << "value " << host_first_array[i] << std::endl; + std::cerr << " GPU first_array[" << i << "] contains the "; + std::cerr << "value " << first_array[i] << std::endl; + free(host_second_array); // release the scratch buffer on failure too + return -1; + } + if (host_second_array[i] != second_array[i]) { + std::cerr << "Test failure!" << std::endl; + std::cerr << " host_second_array[" << i << "] contains the "; + std::cerr << "value " << host_second_array[i] << std::endl; + std::cerr << " GPU second_array[" << i << "] contains the "; + std::cerr << "value " << second_array[i] << std::endl; + free(host_second_array); // release the scratch buffer on failure too + return -1; + } + } + std::cout << "Coop test appears to work properly!" << std::endl; + free(host_second_array); + return 0; +} + +__global__ void +coop_kernel(unsigned int *first_array, unsigned int *second_array, + unsigned int loops, unsigned int array_len) { + cooperative_groups::grid_group grid = cooperative_groups::this_grid(); + unsigned int rank = grid.thread_rank(); + unsigned int grid_size = grid.size(); + + for (int i = 0; i < loops; i++) { + // The goal of this loop is to directly add in values from + // array one into array two, on a per-wave basis. 
+ for (int offset = rank; offset < array_len; offset += grid_size) { + second_array[offset] += first_array[offset]; + } + + grid.sync(); + + // The goal of this loop is to pull data the "mirror" lane in + // array two and add it back into array one. This causes inter- + // thread swizzling. + for (int offset = rank; offset < array_len; offset += grid_size) { + unsigned int swizzle_offset = array_len - offset - 1; + first_array[offset] += second_array[swizzle_offset]; + } + + grid.sync(); + } +} + +int main(int argc, char** argv) { + hipError_t err; + /*************************************************************************/ + /* Parse the command line parameters *************************************/ + // Arguments to pull out of the command line. + int device_num = 0, loops = 2, width = 4096, flag = 0; + HIPCHECK(hipGetDeviceCount(&device_num)); + for (int dev = 0; dev < device_num; ++dev) { + std::cout << "Device number: " << dev << std::endl; + std::cout << "Loops: " << loops << std::endl; + std::cout << "Width: " << width << std::endl; + + /*************************************************************************/ + /* Test whether target device supports cooperative groups ****************/ + HIPCHECK(hipSetDevice(dev)); + + if (!cooperative_groups_support(dev)) { + std::cout << "Skipping the test with Pass result.\n"; + passed(); + } + + /*************************************************************************/ + /* We will launch enough waves to fill up all of the GPU *****************/ + hipDeviceProp_t device_properties; + HIPCHECK(hipGetDeviceProperties(&device_properties, dev)); + + int warp_size = device_properties.warpSize; + int num_sms = device_properties.multiProcessorCount; + + std::cout << "Device name: " << device_properties.name << std::endl; + std::cout << std::endl; + + // Calculate the device occupancy to know how many blocks can be run. 
+ int max_blocks_per_sm; + HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, + coop_kernel, + warp_size, 0)); + + int total_blocks = max_blocks_per_sm * num_sms; + + /*************************************************************************/ + /* Create the streams we will use in this test. **************************/ + hipStream_t streams[2]; + for (int i = 0; i < 2; i++) { + HIPCHECK(hipStreamCreate(&streams[i])); + } + + /*************************************************************************/ + /* Set up data to pass into the kernel ***********************************/ + + // Alocate the host input buffer, and two device-focused buffers that we + // will use for our test. + unsigned int *input_buffer = (unsigned int*)calloc(width, + sizeof(unsigned int)); + for (int i = 0; i < width; i++) { + input_buffer[i] = i; + } + + unsigned int *first_dev_array; + HIPCHECK(hipMalloc(reinterpret_cast(&first_dev_array), + width * sizeof(unsigned int))); + + HIPCHECK(hipMemcpyAsync(first_dev_array, input_buffer, + width * sizeof(unsigned int), + hipMemcpyHostToDevice, streams[0])); + + unsigned int *second_dev_array; + HIPCHECK(hipMalloc(reinterpret_cast(&second_dev_array), + width * sizeof(unsigned int))); + HIPCHECK(hipMemsetAsync(second_dev_array, 0, width * sizeof(unsigned int), + streams[0])); + + /*************************************************************************/ + /* Launch the kernels ****************************************************/ + std::cout << "Launching a cooperative kernel with " << total_blocks; + std::cout << " thread blocks, each with " << warp_size << " threads"; + std::cout << std::endl; + + void *coop_params[4]; + coop_params[0] = reinterpret_cast(&first_dev_array); + coop_params[1] = reinterpret_cast(&second_dev_array); + coop_params[2] = reinterpret_cast(&loops); + coop_params[3] = reinterpret_cast(&width); + HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast(coop_kernel), + total_blocks, warp_size, coop_params, 
+ 0, streams[0])); + + /*************************************************************************/ + /* Read back the buffers and print out their data ************************/ + unsigned int *first_array = (unsigned int*)calloc(width, + sizeof(unsigned int)); + unsigned int *second_array = (unsigned int*)calloc(width, + sizeof(unsigned int)); + HIPCHECK(hipMemcpyAsync(first_array, first_dev_array, + width * sizeof(unsigned int), + hipMemcpyDeviceToHost, streams[0])); + + HIPCHECK(hipMemcpyAsync(second_array, second_dev_array, + width * sizeof(unsigned int), + hipMemcpyDeviceToHost, streams[0])); + + std::cout << "Waiting for cooperative work to finish..." << std::endl; + std::cout << std::flush; + + HIPCHECK(hipStreamSynchronize(streams[0])); + + + int ret_val = 0; + + std::cout << "Attemping to verify buffers." << std::endl; + std::cout << std::flush; + ret_val = verify_coop_arrays(loops, input_buffer, first_array, + second_array, width); + if (!ret_val) { + std::cout << "It appears that inter-thread data sharing at "; + std::cout << "grid_group sync points works properly!" << std::endl; + } else { + flag = 1; + } + for (int k = 0; k < 2; ++k) { + HIPCHECK(hipStreamDestroy(streams[k])); + } + HIPCHECK(hipFree(first_dev_array)); + HIPCHECK(hipFree(second_dev_array)); + free(input_buffer); + free(first_array); + free(second_array); + } + if (!flag) { + passed(); + } else { + failed(""); + } +} diff --git a/projects/hip/tests/src/cg/hipCGGridGroupType.cpp b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupType.cpp old mode 100644 new mode 100755 similarity index 97% rename from projects/hip/tests/src/cg/hipCGGridGroupType.cpp rename to projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupType.cpp index db45c10512..79f1cb1c38 --- a/projects/hip/tests/src/cg/hipCGGridGroupType.cpp +++ b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupType.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. 
/* HIT_START - * BUILD: %t %s ../test_common.cpp + * BUILD: %t %s ../../test_common.cpp * TEST: %t * HIT_END */ @@ -139,7 +139,11 @@ int main() if (!deviceProperties.cooperativeLaunch) { std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n"; - passed(); + if (hip_skip_tests_enabled()) { + return hip_skip_retcode(); + } else { + passed(); + } return 0; } diff --git a/projects/hip/tests/src/cg/hipCGGridGroupTypeViaBaseType.cpp b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupTypeViaBaseType.cpp old mode 100644 new mode 100755 similarity index 97% rename from projects/hip/tests/src/cg/hipCGGridGroupTypeViaBaseType.cpp rename to projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupTypeViaBaseType.cpp index 11562dfff6..7407f266dd --- a/projects/hip/tests/src/cg/hipCGGridGroupTypeViaBaseType.cpp +++ b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupTypeViaBaseType.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../test_common.cpp + * BUILD: %t %s ../../test_common.cpp * TEST: %t * HIT_END */ @@ -139,7 +139,11 @@ int main() if (!deviceProperties.cooperativeLaunch) { std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n"; - passed(); + if (hip_skip_tests_enabled()) { + return hip_skip_retcode(); + } else { + passed(); + } return 0; } diff --git a/projects/hip/tests/src/cg/hipCGGridGroupTypeViaPublicApi.cpp b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupTypeViaPublicApi.cpp old mode 100644 new mode 100755 similarity index 97% rename from projects/hip/tests/src/cg/hipCGGridGroupTypeViaPublicApi.cpp rename to projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupTypeViaPublicApi.cpp index 21f0348aec..cb9d8d7c53 --- a/projects/hip/tests/src/cg/hipCGGridGroupTypeViaPublicApi.cpp +++ b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupTypeViaPublicApi.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. 
/* HIT_START - * BUILD: %t %s ../test_common.cpp + * BUILD: %t %s ../../test_common.cpp * TEST: %t * HIT_END */ @@ -139,7 +139,11 @@ int main() if (!deviceProperties.cooperativeLaunch) { std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n"; - passed(); + if (hip_skip_tests_enabled()) { + return hip_skip_retcode(); + } else { + passed(); + } return 0; } diff --git a/projects/hip/tests/src/cg/hipCGMultiGridGroupType.cpp b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupType.cpp old mode 100644 new mode 100755 similarity index 92% rename from projects/hip/tests/src/cg/hipCGMultiGridGroupType.cpp rename to projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupType.cpp index 5a0529867a..02be0a521b --- a/projects/hip/tests/src/cg/hipCGMultiGridGroupType.cpp +++ b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupType.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../test_common.cpp + * BUILD: %t %s ../../test_common.cpp * TEST: %t * HIT_END */ @@ -34,6 +34,8 @@ THE SOFTWARE. 
#include #define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs) +#define ASSERT_LE(lhs, rhs) assert(lhs <= rhs) +#define ASSERT_GE(lhs, rhs) assert(lhs >= rhs) using namespace cooperative_groups; @@ -193,15 +195,27 @@ static void test_cg_multi_grid_group_type(int blockSize) } // Validate results + int gridsSeen[MaxGPUs]; for (int i = 0; i < nGpu; ++i) { for (int j = 0; j < 2 * blockSize; ++j) { - //ASSERT_EQUAL(numGridsTestH[i][j], nGpu); - //ASSERT_EQUAL(gridRankTestH[i][j], i); + ASSERT_EQUAL(numGridsTestH[i][j], nGpu); + ASSERT_GE(gridRankTestH[i][j], 0); + ASSERT_LE(gridRankTestH[i][j], nGpu-1); + ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]); ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize); - ASSERT_EQUAL(thdRankTestH[i][j], (i * 2 * blockSize) + j); + int gridRank = gridRankTestH[i][j]; + ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j); ASSERT_EQUAL(isValidTestH[i][j], 1); } ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize); + + // Validate uniqueness property of grid rank + gridsSeen[i] = gridRankTestH[i][0]; + for (int k = 0; k < i; ++k) { + if (gridsSeen[k] == gridsSeen[i]) { + assert (false && "Grid rank in multi-gpu setup should be unique"); + } + } } ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize); diff --git a/projects/hip/tests/src/cg/hipCGMultiGridGroupTypeViaBaseType.cpp b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupTypeViaBaseType.cpp similarity index 83% rename from projects/hip/tests/src/cg/hipCGMultiGridGroupTypeViaBaseType.cpp rename to projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupTypeViaBaseType.cpp index dae72f4cf8..0830e807c3 100644 --- a/projects/hip/tests/src/cg/hipCGMultiGridGroupTypeViaBaseType.cpp +++ b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupTypeViaBaseType.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../test_common.cpp + * BUILD: %t %s ../../test_common.cpp * TEST: %t * HIT_END */ @@ -34,11 +34,14 @@ THE SOFTWARE. 
#include #define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs) +#define ASSERT_LE(lhs, rhs) assert(lhs <= rhs) +#define ASSERT_GE(lhs, rhs) assert(lhs >= rhs) using namespace cooperative_groups; static __global__ void kernel_cg_multi_grid_group_type_via_base_type(int *sizeTestD, + int* gridRankTestD, int *thdRankTestD, int *isValidTestD, int *syncTestD, @@ -51,6 +54,7 @@ void kernel_cg_multi_grid_group_type_via_base_type(int *sizeTestD, sizeTestD[gIdx] = tg.size(); // Test thread_rank + gridRankTestD[gIdx] = this_multi_grid().grid_rank(); thdRankTestD[gIdx] = tg.thread_rank(); // Test is_valid @@ -110,6 +114,7 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize) // Allocate host and device memory int nBytes = sizeof(int) * 2 * blockSize; int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs]; + int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs]; int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs]; int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs]; int *syncTestD[MaxGPUs], *syncResultD; @@ -117,11 +122,13 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize) ASSERT_EQUAL(hipSetDevice(i), hipSuccess); ASSERT_EQUAL(hipMalloc(&sizeTestD[i], nBytes), hipSuccess); + ASSERT_EQUAL(hipMalloc(&gridRankTestD[i], nBytes), hipSuccess); ASSERT_EQUAL(hipMalloc(&thdRankTestD[i], nBytes), hipSuccess); ASSERT_EQUAL(hipMalloc(&isValidTestD[i], nBytes), hipSuccess); ASSERT_EQUAL(hipMalloc(&syncTestD[i], nBytes), hipSuccess); ASSERT_EQUAL(hipHostMalloc(&sizeTestH[i], nBytes), hipSuccess); + ASSERT_EQUAL(hipHostMalloc(&gridRankTestH[i], nBytes), hipSuccess); ASSERT_EQUAL(hipHostMalloc(&thdRankTestH[i], nBytes), hipSuccess); ASSERT_EQUAL(hipHostMalloc(&isValidTestH[i], nBytes), hipSuccess); @@ -135,17 +142,18 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize) } // Launch Kernel - constexpr int NumKernelArgs = 5; + constexpr int NumKernelArgs = 6; hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu]; void* args[MaxGPUs * 
NumKernelArgs]; for (int i = 0; i < nGpu; i++) { ASSERT_EQUAL(hipSetDevice(i), hipSuccess); args[i * NumKernelArgs ] = &sizeTestD[i]; - args[i * NumKernelArgs + 1] = &thdRankTestD[i]; - args[i * NumKernelArgs + 2] = &isValidTestD[i]; - args[i * NumKernelArgs + 3] = &syncTestD[i]; - args[i * NumKernelArgs + 4] = &syncResultD; + args[i * NumKernelArgs + 1] = &gridRankTestD[i]; + args[i * NumKernelArgs + 2] = &thdRankTestD[i]; + args[i * NumKernelArgs + 3] = &isValidTestD[i]; + args[i * NumKernelArgs + 4] = &syncTestD[i]; + args[i * NumKernelArgs + 5] = &syncResultD; launchParamsList[i].func = reinterpret_cast(kernel_cg_multi_grid_group_type_via_base_type); launchParamsList[i].gridDim = 2; @@ -164,6 +172,8 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize) ASSERT_EQUAL(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost), hipSuccess); + ASSERT_EQUAL(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost), + hipSuccess); ASSERT_EQUAL(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost), hipSuccess); ASSERT_EQUAL(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost), @@ -173,13 +183,26 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize) } // Validate results + int gridsSeen[MaxGPUs]; for (int i = 0; i < nGpu; ++i) { for (int j = 0; j < 2 * blockSize; ++j) { ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize); - ASSERT_EQUAL(thdRankTestH[i][j], (i * 2 * blockSize) + j); + ASSERT_GE(gridRankTestH[i][j], 0); + ASSERT_LE(gridRankTestH[i][j], nGpu-1); + ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]); + int gridRank = gridRankTestH[i][j]; + ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j); ASSERT_EQUAL(isValidTestH[i][j], 1); } ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize); + + // Validate uniqueness property of grid rank + gridsSeen[i] = gridRankTestH[i][0]; + for (int k = 0; k < i; ++k) { + if (gridsSeen[k] == gridsSeen[i]) { + 
assert (false && "Grid rank in multi-gpu setup should be unique"); + } + } } ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize); @@ -189,6 +212,7 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize) ASSERT_EQUAL(hipSetDevice(i), hipSuccess); ASSERT_EQUAL(hipFree(sizeTestD[i]), hipSuccess); + ASSERT_EQUAL(hipFree(gridRankTestD[i]), hipSuccess); ASSERT_EQUAL(hipFree(thdRankTestD[i]), hipSuccess); ASSERT_EQUAL(hipFree(isValidTestD[i]), hipSuccess); ASSERT_EQUAL(hipFree(syncTestD[i]), hipSuccess); @@ -197,6 +221,7 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize) ASSERT_EQUAL(hipFree(syncResultD), hipSuccess); ASSERT_EQUAL(hipHostFree(sizeTestH[i]), hipSuccess); + ASSERT_EQUAL(hipHostFree(gridRankTestH[i]), hipSuccess); ASSERT_EQUAL(hipHostFree(thdRankTestH[i]), hipSuccess); ASSERT_EQUAL(hipHostFree(isValidTestH[i]), hipSuccess); diff --git a/projects/hip/tests/src/cg/hipCGMultiGridGroupTypeViaPublicApi.cpp b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupTypeViaPublicApi.cpp similarity index 83% rename from projects/hip/tests/src/cg/hipCGMultiGridGroupTypeViaPublicApi.cpp rename to projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupTypeViaPublicApi.cpp index 2f2f378931..5975ffa068 100644 --- a/projects/hip/tests/src/cg/hipCGMultiGridGroupTypeViaPublicApi.cpp +++ b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupTypeViaPublicApi.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../test_common.cpp + * BUILD: %t %s ../../test_common.cpp * TEST: %t * HIT_END */ @@ -34,11 +34,14 @@ THE SOFTWARE. 
#include #define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs) +#define ASSERT_LE(lhs, rhs) assert(lhs <= rhs) +#define ASSERT_GE(lhs, rhs) assert(lhs >= rhs) using namespace cooperative_groups; static __global__ void kernel_cg_multi_grid_group_type_via_public_api(int *sizeTestD, + int* gridRankTestD, int *thdRankTestD, int *isValidTestD, int *syncTestD, @@ -51,6 +54,7 @@ void kernel_cg_multi_grid_group_type_via_public_api(int *sizeTestD, sizeTestD[gIdx] = group_size(mg); // Test thread_rank api + gridRankTestD[gIdx] = this_multi_grid().grid_rank(); thdRankTestD[gIdx] = thread_rank(mg); // Test is_valid api @@ -110,6 +114,7 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize) // Allocate host and device memory int nBytes = sizeof(int) * 2 * blockSize; int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs]; + int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs]; int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs]; int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs]; int *syncTestD[MaxGPUs], *syncResultD; @@ -117,11 +122,13 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize) ASSERT_EQUAL(hipSetDevice(i), hipSuccess); ASSERT_EQUAL(hipMalloc(&sizeTestD[i], nBytes), hipSuccess); + ASSERT_EQUAL(hipMalloc(&gridRankTestD[i], nBytes), hipSuccess); ASSERT_EQUAL(hipMalloc(&thdRankTestD[i], nBytes), hipSuccess); ASSERT_EQUAL(hipMalloc(&isValidTestD[i], nBytes), hipSuccess); ASSERT_EQUAL(hipMalloc(&syncTestD[i], nBytes), hipSuccess); ASSERT_EQUAL(hipHostMalloc(&sizeTestH[i], nBytes), hipSuccess); + ASSERT_EQUAL(hipHostMalloc(&gridRankTestH[i], nBytes), hipSuccess); ASSERT_EQUAL(hipHostMalloc(&thdRankTestH[i], nBytes), hipSuccess); ASSERT_EQUAL(hipHostMalloc(&isValidTestH[i], nBytes), hipSuccess); @@ -135,17 +142,18 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize) } // Launch Kernel - constexpr int NumKernelArgs = 5; + constexpr int NumKernelArgs = 6; hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu]; void* 
args[MaxGPUs * NumKernelArgs]; for (int i = 0; i < nGpu; i++) { ASSERT_EQUAL(hipSetDevice(i), hipSuccess); args[i * NumKernelArgs ] = &sizeTestD[i]; - args[i * NumKernelArgs + 1] = &thdRankTestD[i]; - args[i * NumKernelArgs + 2] = &isValidTestD[i]; - args[i * NumKernelArgs + 3] = &syncTestD[i]; - args[i * NumKernelArgs + 4] = &syncResultD; + args[i * NumKernelArgs + 1] = &gridRankTestD[i]; + args[i * NumKernelArgs + 2] = &thdRankTestD[i]; + args[i * NumKernelArgs + 3] = &isValidTestD[i]; + args[i * NumKernelArgs + 4] = &syncTestD[i]; + args[i * NumKernelArgs + 5] = &syncResultD; launchParamsList[i].func = reinterpret_cast(kernel_cg_multi_grid_group_type_via_public_api); launchParamsList[i].gridDim = 2; @@ -164,6 +172,8 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize) ASSERT_EQUAL(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost), hipSuccess); + ASSERT_EQUAL(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost), + hipSuccess); ASSERT_EQUAL(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost), hipSuccess); ASSERT_EQUAL(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost), @@ -173,13 +183,26 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize) } // Validate results + int gridsSeen[MaxGPUs]; for (int i = 0; i < nGpu; ++i) { for (int j = 0; j < 2 * blockSize; ++j) { ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize); - ASSERT_EQUAL(thdRankTestH[i][j], (i * 2 * blockSize) + j); + ASSERT_GE(gridRankTestH[i][j], 0); + ASSERT_LE(gridRankTestH[i][j], nGpu-1); + ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]); + int gridRank = gridRankTestH[i][j]; + ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j); ASSERT_EQUAL(isValidTestH[i][j], 1); } ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize); + + // Validate uniqueness property of grid rank + gridsSeen[i] = gridRankTestH[i][0]; + for (int k = 0; k < i; ++k) { + if (gridsSeen[k] == 
gridsSeen[i]) { + assert (false && "Grid rank in multi-gpu setup should be unique"); + } + } } ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize); @@ -189,6 +212,7 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize) ASSERT_EQUAL(hipSetDevice(i), hipSuccess); ASSERT_EQUAL(hipFree(sizeTestD[i]), hipSuccess); + ASSERT_EQUAL(hipFree(gridRankTestD[i]), hipSuccess); ASSERT_EQUAL(hipFree(thdRankTestD[i]), hipSuccess); ASSERT_EQUAL(hipFree(isValidTestD[i]), hipSuccess); ASSERT_EQUAL(hipFree(syncTestD[i]), hipSuccess); @@ -197,6 +221,7 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize) ASSERT_EQUAL(hipFree(syncResultD), hipSuccess); ASSERT_EQUAL(hipHostFree(sizeTestH[i]), hipSuccess); + ASSERT_EQUAL(hipHostFree(gridRankTestH[i]), hipSuccess); ASSERT_EQUAL(hipHostFree(thdRankTestH[i]), hipSuccess); ASSERT_EQUAL(hipHostFree(isValidTestH[i]), hipSuccess); diff --git a/projects/hip/tests/src/cg/hipCGThreadBlockType.cpp b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockType.cpp old mode 100644 new mode 100755 similarity index 95% rename from projects/hip/tests/src/cg/hipCGThreadBlockType.cpp rename to projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockType.cpp index 4e1de9e44a..dccac38bf3 --- a/projects/hip/tests/src/cg/hipCGThreadBlockType.cpp +++ b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockType.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../test_common.cpp + * BUILD: %t %s ../../test_common.cpp * TEST: %t * HIT_END */ @@ -166,6 +166,16 @@ int main() ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess); int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock; + if (!deviceProperties.cooperativeLaunch) { + std::cout << "info: Device doesn't support cooperative launch! 
skipping the test!\n"; + if (hip_skip_tests_enabled()) { + return hip_skip_retcode(); + } else { + passed(); + } + return 0; + } + // Test block sizes which are powers of 2 int i = 0; while (true) { diff --git a/projects/hip/tests/src/cg/hipCGThreadBlockTypeViaBaseType.cpp b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockTypeViaBaseType.cpp old mode 100644 new mode 100755 similarity index 94% rename from projects/hip/tests/src/cg/hipCGThreadBlockTypeViaBaseType.cpp rename to projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockTypeViaBaseType.cpp index d4c9402268..b0a42782c0 --- a/projects/hip/tests/src/cg/hipCGThreadBlockTypeViaBaseType.cpp +++ b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockTypeViaBaseType.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../test_common.cpp + * BUILD: %t %s ../../test_common.cpp * TEST: %t * HIT_END */ @@ -135,6 +135,16 @@ int main() ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess); int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock; + if (!deviceProperties.cooperativeLaunch) { + std::cout << "info: Device doesn't support cooperative launch! 
skipping the test!\n"; + if (hip_skip_tests_enabled()) { + return hip_skip_retcode(); + } else { + passed(); + } + return 0; + } + // Test block sizes which are powers of 2 int i = 0; while (true) { diff --git a/projects/hip/tests/src/cg/hipCGThreadBlockTypeViaPublicApi.cpp b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockTypeViaPublicApi.cpp old mode 100644 new mode 100755 similarity index 94% rename from projects/hip/tests/src/cg/hipCGThreadBlockTypeViaPublicApi.cpp rename to projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockTypeViaPublicApi.cpp index d13e58b059..e4a6a6e330 --- a/projects/hip/tests/src/cg/hipCGThreadBlockTypeViaPublicApi.cpp +++ b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockTypeViaPublicApi.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../test_common.cpp + * BUILD: %t %s ../../test_common.cpp * TEST: %t * HIT_END */ @@ -135,6 +135,16 @@ int main() ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess); int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock; + if (!deviceProperties.cooperativeLaunch) { + std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n"; + if (hip_skip_tests_enabled()) { + return hip_skip_retcode(); + } else { + passed(); + } + return 0; + } + // Test block sizes which are powers of 2 int i = 0; while (true) { diff --git a/projects/hip/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipLaunchCoopMultiKernel.cpp similarity index 98% rename from projects/hip/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp rename to projects/hip/tests/src/runtimeApi/cooperativeGrps/hipLaunchCoopMultiKernel.cpp index 8e67044eb0..0e523f9d2e 100644 --- a/projects/hip/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp +++ b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipLaunchCoopMultiKernel.cpp @@ -20,7 +20,7 @@ THE SOFTWARE. 
// Simple test for hipLaunchCooperativeKernelMultiDevice API. /* HIT_START - * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 * TEST: %t * HIT_END */ diff --git a/projects/hip/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipLaunchCooperativeKernel.cpp similarity index 94% rename from projects/hip/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp rename to projects/hip/tests/src/runtimeApi/cooperativeGrps/hipLaunchCooperativeKernel.cpp index e0fcd4108b..6b1ba1c27a 100644 --- a/projects/hip/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp +++ b/projects/hip/tests/src/runtimeApi/cooperativeGrps/hipLaunchCooperativeKernel.cpp @@ -22,15 +22,14 @@ THE SOFTWARE. // Simple test for hipLaunchCooperativeKernel API. /* HIT_START - * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 * TEST: %t * HIT_END */ #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" -#include "hip/hcc_detail/device_library_decls.h" -#include "hip/hcc_detail/hip_cooperative_groups.h" +#include "hip/hip_cooperative_groups.h" #include #include #include "test_common.h" @@ -129,7 +128,7 @@ int main() { params[3] = (void*)&dC; std::cout << "Testing with grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n"; - HIPCHECK(hipLaunchCooperativeKernel(test_gws, dimGrid, dimBlock, params, dimBlock.x * sizeof(long), stream)); + HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_gws), dimGrid, dimBlock, params, dimBlock.x * sizeof(long), stream)); HIPCHECK(hipMemcpy(init, dC, sizeof(long), hipMemcpyDeviceToHost)); diff --git a/projects/hip/tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp b/projects/hip/tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp new file mode 
100644 index 0000000000..b75725fed4 --- /dev/null +++ b/projects/hip/tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp @@ -0,0 +1,568 @@ +/* +Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +// Test Description: +/*The general idea of the application is to test how Cooperative Groups kernel +launches work when launching too many warps to multiple target devices. This +tests the following failure modes for hipLaunchCooperativeKernelMultiDevice: + 1) Do not launch more warps to any device than can fit on that device + 2) All device targets for the multi-device launch function must be different + 3) All streams must be explicit (non-NULL) + 4) The kernels sent in must be identical between devices + 5) The grid and block sizes must be identical between devices + 6) The block dimensions must be non-zero + 7) The dynamic shared memory size must be identical between devices. 
+ +This test ensures that the proper error conditions are returned, even if the +target kernel does not actually use any of the cooperative groups features. + +Note that tests 4, 5, and 7 only hold on Nvidia GPUs. AMD GPUs running ROCm +do not have these constraints. As such, the test checks to see whether they +should fail or succeed and compares this to what actually happens. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * TEST: %t + * HIT_END + */ + + +#include +#include +#include "test_common.h" + +static inline void hipCheckAndFail(hipError_t errval, + const char *file, int line) { + hipError_t last_err = hipGetLastError(); /* fetched first so it can be compared against errval below */ + if (errval != hipSuccess) { + std::cerr << "hip error: " << hipGetErrorString(errval); + std::cerr << std::endl; + std::cerr << " Location: " << file << ":" << line << std::endl; + failed(""); + } + if (last_err != errval) { + std::cerr << "Error: the return value of a function was not the same "; + std::cerr << "as the value returned by hipGetLastError()" << std::endl; + std::cerr << " Location: " << file << ":" << line << std::endl; + std::cerr << " Function returned: " << hipGetErrorString(errval); + std::cerr << " (" << errval << ")" << std::endl; + std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err); + std::cerr << " (" << last_err << ")" << std::endl; + failed(""); + } +} +#define hipCheckErr(errval) \ + do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0) + +static int cooperative_groups_support(int device_id) { + hipError_t err; + + int cooperative_attribute; + HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute, + hipDeviceAttributeCooperativeLaunch, device_id)); + if (!cooperative_attribute) { + std::cerr << "Cooperative launch support not available in "; + std::cerr << "the device attribute for device " << device_id; + std::cerr << std::endl; + return 0; + } + + int multi_gpu_cooperative_attribute; + HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute, 
hipDeviceAttributeCooperativeMultiDeviceLaunch, device_id)); + + if (!multi_gpu_cooperative_attribute) { + std::cerr << "Multi-GPU cooperative launch support not available in "; + std::cerr << "the device attribute for device " << device_id; + std::cerr << std::endl; + return 0; + } + + hipDeviceProp_t device_properties; + HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); + if (device_properties.cooperativeLaunch == 0) { + std::cerr << "Cooperative group support not available in "; + std::cerr << "device properties." << std::endl; + return 0; + } + if (device_properties.cooperativeMultiDeviceLaunch == 0) { + std::cerr << "Multi-GPU cooperative group support not available in "; + std::cerr << "device properties." << std::endl; + return 0; + } + return 1; +} + +static int support_for_separate_kernels(int device_id) { + hipError_t err; + + int separate_kernel_supported; + HIPCHECK(hipDeviceGetAttribute(&separate_kernel_supported, + hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc, + device_id)); + if (!separate_kernel_supported) { + return 0; + } + + hipDeviceProp_t device_properties; + HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); + if (device_properties.cooperativeMultiDeviceUnmatchedFunc == 0) { + return 0; + } + return 1; +} + +static int support_for_separate_grid_sizes(int device_id) { + hipError_t err; + int separate_sizes_supported; + HIPCHECK(hipDeviceGetAttribute(&separate_sizes_supported, + hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim, + device_id)); + if (!separate_sizes_supported) { + return 0; + } + + hipDeviceProp_t device_properties; + HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); + if (device_properties.cooperativeMultiDeviceUnmatchedGridDim == 0) { + return 0; + } + return 1; +} + +static int support_for_separate_block_dims(int device_id) { + hipError_t err; + int separate_sizes_supported; + HIPCHECK(hipDeviceGetAttribute(&separate_sizes_supported, + 
hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim, + device_id)); + if (!separate_sizes_supported) { + return 0; + } + + hipDeviceProp_t device_properties; + HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); + if (device_properties.cooperativeMultiDeviceUnmatchedBlockDim == 0) { + return 0; + } + return 1; +} + +static int support_for_separate_shared_sizes(int device_id) { + hipError_t err; + int separate_sizes_supported; + HIPCHECK(hipDeviceGetAttribute(&separate_sizes_supported, + hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem, + device_id)); + if (!separate_sizes_supported) { + return 0; + } + + hipDeviceProp_t device_properties; + HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); + if (device_properties.cooperativeMultiDeviceUnmatchedSharedMem == 0) { + return 0; + } + return 1; +} + +__global__ void test_kernel(long long *array) { + unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; + array[rank] += clock64(); +} + +__global__ void second_test_kernel(long long *array) { + unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; + array[rank] += clock64(); +} + +int main(int argc, char** argv) { + hipError_t err; + /*************************************************************************/ + /* Parse the command line parameters *************************************/ + // Arguments to pull out of the command line. 
+ int device_num, FailFlag = 0; + HIPCHECK(hipGetDeviceCount(&device_num)); + if (device_num < 2) { + std::cout << "This test requires atleast two gpus but the system has "; + std::cout << " only "<< device_num <(&good_dev_array[i]), + good_size)); + HIPCHECK(hipMemsetAsync(good_dev_array[i], 0, good_size, streams[i])); + HIPCHECK(hipMalloc(reinterpret_cast(&bad_dev_array[i]), + bad_size)); + HIPCHECK(hipMemsetAsync(bad_dev_array[i], 0, bad_size, streams[i])); + } + HIPCHECK(hipDeviceSynchronize()); + + /*************************************************************************/ + /* Launch the kernels ****************************************************/ + std::cout << "Launching a multi-GPU cooperative kernel with too many "; + std::cout << "warps..." << std::endl; + + void *dev_params[2][1]; + hipLaunchParams md_params[2]; + for (int i = 0; i < 2; i++) { + dev_params[i][0] = reinterpret_cast(&bad_dev_array[i]); + + md_params[i].func = reinterpret_cast(test_kernel); + md_params[i].gridDim = 2 * desired_blocks; + md_params[i].blockDim = warp_size; + md_params[i].sharedMem = 0; + md_params[i].stream = streams[i]; + md_params[i].args = dev_params[i]; + } + + err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0); + if (err != hipErrorCooperativeLaunchTooLarge) { + std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; + std::cerr << "with too many warps." << std::endl; + std::cerr << "This SHOULD have failed with the error "; + std::cerr << "hipErrorCooperativeLaunchTooLarge ("; + std::cerr << hipErrorCooperativeLaunchTooLarge << ")." << std::endl; + std::cerr << "Instead, the launch returned " << hipGetErrorName(err); + std::cerr << " (" << err << ")" << std::endl; + FailFlag = 1; + } else { + std::cout << "\tProperly saw this return "; + std::cout << "hipErrorCooperativeLaunchTooLarge" << std::endl; + } + HIPCHECK(hipDeviceSynchronize()); + + std::cout << "Launching a multi-GPU cooperative kernel to the same "; + std::cout << "device twice..." 
<< std::endl; + for (int i = 0; i < 2; i++) { + dev_params[i][0] = reinterpret_cast(&good_dev_array[i]); + md_params[i].gridDim = desired_blocks; + md_params[i].stream = streams[0]; + } + err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0); + if (err != hipErrorInvalidDevice) { + std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; + std::cerr << "to the same device twice." << std::endl; + std::cerr << "This SHOULD have failed with the error "; + std::cerr << "hipErrorInvalidDevice ("; + std::cerr << hipErrorInvalidDevice << ")." << std::endl; + std::cerr << "Instead, the launch returned " << hipGetErrorName(err); + std::cerr << " (" << err << ")" << std::endl; + FailFlag = 1; + } else { + std::cout << "\tProperly saw this return "; + std::cout << "hipErrorInvalidDevice" << std::endl; + } + HIPCHECK(hipDeviceSynchronize()); + + std::cout << "Launching a multi-GPU cooperative kernel to the NULL "; + std::cout << "stream" << std::endl; + for (int i = 0; i < 2; i++) { + md_params[i].stream = NULL; + } + err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0); + if (err != hipErrorInvalidResourceHandle) { + std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; + std::cerr << "to the NULL stream." << std::endl; + std::cerr << "This SHOULD have failed with the error "; + std::cerr << "hipErrorInvalidResourceHandle ("; + std::cerr << hipErrorInvalidResourceHandle << ")." << std::endl; + std::cerr << "Instead, the launch returned " << hipGetErrorName(err); + std::cerr << " (" << err << ")" << std::endl; + FailFlag = 1; + } else { + std::cout << "\tProperly saw this return "; + std::cout << "hipErrorInvalidResourceHandle" << std::endl; + } + HIPCHECK(hipDeviceSynchronize()); + + std::cout << "Launching a multi-GPU cooperative kernel with two "; + std::cout << "different kernels." 
<< std::endl; + bool supports_sep_kernels = true; + for (int i = 0; i < 2; i++) { + md_params[i].stream = streams[i]; + if (!support_for_separate_kernels((dev + i))) { + supports_sep_kernels = false; + } + } + md_params[1].func = reinterpret_cast(second_test_kernel); + err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0); + if ((supports_sep_kernels && err != hipSuccess) || + (!supports_sep_kernels && err != hipErrorInvalidValue)) { + if (supports_sep_kernels) { + std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; + std::cerr << "with two different kernels." << std::endl; + std::cerr << "This SHOULD have succeeded with hipSuccess ("; + std::cerr << hipSuccess << ")." << std::endl; + std::cerr << "Instead, the launch returned " << hipGetErrorName(err); + std::cerr << " (" << err << ")" << std::endl; + } else { + std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; + std::cerr << "with two different kernels." << std::endl; + std::cerr << "This SHOULD have failed with the error "; + std::cerr << "hipErrorInvalidValue ("; + std::cerr << hipErrorInvalidValue << ")." << std::endl; + std::cerr << "Instead, the launch returned " << hipGetErrorName(err); + std::cerr << " (" << err << ")" << std::endl; + } + FailFlag = 1; + } else { + std::cout << "\tProperly saw this return "; + if (supports_sep_kernels) { + std::cout << "hipSuccess" << std::endl; + } else { + std::cout << "hipErrorInvalidValue" << std::endl; + } + } + HIPCHECK(hipDeviceSynchronize()); + + std::cout << "Launching a multi-GPU cooperative kernel with two "; + std::cout << "different grid sizes." 
<< std::endl; + bool supports_sep_sizes = true; /* cleared below if either device reports no support for differing grid sizes */ + for (int i = 0; i < 2; i++) { + md_params[i].func = reinterpret_cast(test_kernel); + md_params[i].gridDim = i+1; + if (!support_for_separate_grid_sizes((dev + i))) { + supports_sep_sizes = false; + } + } + err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0); + if ((supports_sep_sizes && err != hipSuccess) || + (!supports_sep_sizes && err != hipErrorInvalidValue)) { /* launch result disagrees with the expected outcome */ + if (supports_sep_sizes) { + std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; + std::cerr << "with two different grid sizes." << std::endl; + std::cerr << "This SHOULD have succeeded with hipSuccess ("; + std::cerr << hipSuccess << ")." << std::endl; + std::cerr << "Instead, the launch returned " << hipGetErrorName(err); + std::cerr << " (" << err << ")" << std::endl; + } else { + std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; + std::cerr << "with two different grid sizes." << std::endl; + std::cerr << "This SHOULD have failed with the error "; + std::cerr << "hipErrorInvalidValue ("; + std::cerr << hipErrorInvalidValue << ")." << std::endl; + std::cerr << "Instead, the launch returned " << hipGetErrorName(err); + std::cerr << " (" << err << ")" << std::endl; + } + FailFlag = 1; /* either kind of mismatch fails the test */ + } else { + std::cout << "\tProperly saw this return "; + if (supports_sep_sizes) { + std::cout << "hipSuccess" << std::endl; + } else { + std::cout << "hipErrorInvalidValue" << std::endl; + } + } + HIPCHECK(hipDeviceSynchronize()); + + std::cout << "Launching a multi-GPU cooperative kernel with two "; + std::cout << "different block dimensions." 
<< std::endl; + supports_sep_sizes = true; /* cleared below if either device reports no support for differing block dimensions */ + for (int i = 0; i < 2; i++) { + md_params[i].gridDim = desired_blocks; + md_params[i].blockDim = i+1; + if (!support_for_separate_block_dims((dev + i))) { + supports_sep_sizes = false; + } + } + err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0); + if ((supports_sep_sizes && err != hipSuccess) || + (!supports_sep_sizes && err != hipErrorInvalidValue)) { /* launch result disagrees with the expected outcome */ + if (supports_sep_sizes) { + std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; + std::cerr << "with two different block dimensions." << std::endl; + std::cerr << "This SHOULD have succeeded with hipSuccess ("; + std::cerr << hipSuccess << ")." << std::endl; + std::cerr << "Instead, the launch returned " << hipGetErrorName(err); + std::cerr << " (" << err << ")" << std::endl; + } else { + std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; + std::cerr << "with two different block dimensions." << std::endl; + std::cerr << "This SHOULD have failed with the error "; + std::cerr << "hipErrorInvalidValue ("; + std::cerr << hipErrorInvalidValue << ")." << std::endl; + std::cerr << "Instead, the launch returned " << hipGetErrorName(err); + std::cerr << " (" << err << ")" << std::endl; + } + FailFlag = 1; /* either kind of mismatch fails the test */ + } else { + std::cout << "\tProperly saw this return "; + if (supports_sep_sizes) { + std::cout << "hipSuccess" << std::endl; + } else { + std::cout << "hipErrorInvalidValue" << std::endl; + } + } + HIPCHECK(hipDeviceSynchronize()); + + std::cout << "Launching a multi-GPU cooperative kernel with block "; + std::cout << "dimensions of zero." << std::endl; + for (int i = 0; i < 2; i++) { + md_params[i].blockDim = 0; + } + err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0); + if (err != hipErrorInvalidConfiguration) { + std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; + std::cerr << "with block dimensions of zero." 
<< std::endl; + std::cerr << "This SHOULD have failed with the error "; + std::cerr << "hipErrorInvalidConfiguration ("; + std::cerr << hipErrorInvalidConfiguration << ")." << std::endl; + std::cerr << "Instead, the launch returned " << hipGetErrorName(err); + std::cerr << " (" << err << ")" << std::endl; + FailFlag = 1; + } else { + std::cout << "\tProperly saw this return "; + std::cout << "hipErrorInvalidConfiguration" << std::endl; + } + HIPCHECK(hipDeviceSynchronize()); + + std::cout << "Launching a multi-GPU cooperative kernel with two "; + std::cout << "different shared memory sizes." << std::endl; + supports_sep_sizes = true; /* cleared below if either device reports no support for differing shared memory sizes */ + for (int i = 0; i < 2; i++) { + md_params[i].blockDim = warp_size; + md_params[i].sharedMem = i; + if (!support_for_separate_shared_sizes((dev + i))) { + supports_sep_sizes = false; + } + } + err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0); + if ((supports_sep_sizes && err != hipSuccess) || + (!supports_sep_sizes && err != hipErrorInvalidValue)) { /* launch result disagrees with the expected outcome */ + if (supports_sep_sizes) { + std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; + std::cerr << "with two different shared memory sizes." << std::endl; + std::cerr << "This SHOULD have succeeded with hipSuccess ("; + std::cerr << hipSuccess << ")." << std::endl; + std::cerr << "Instead, the launch returned " << hipGetErrorName(err); + std::cerr << " (" << err << ")" << std::endl; + } else { + std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; + std::cerr << "with two different shared memory sizes." << std::endl; + std::cerr << "This SHOULD have failed with the error "; + std::cerr << "hipErrorInvalidValue ("; + std::cerr << hipErrorInvalidValue << ")." 
<< std::endl; + std::cerr << "Instead, the launch returned " << hipGetErrorName(err); + std::cerr << " (" << err << ")" << std::endl; + } + FailFlag = 1; /* either kind of mismatch fails the test */ + } else { + std::cout << "\tProperly saw this return "; + if (supports_sep_sizes) { + std::cout << "hipSuccess" << std::endl; + } else { + std::cout << "hipErrorInvalidValue" << std::endl; + } + } + HIPCHECK(hipDeviceSynchronize()); + + std::cout << "Launching a multi-GPU cooperative kernel with maximum "; + std::cout << "number of warps..." << std::endl; + for (int i = 0; i < 2; i++) { + md_params[i].sharedMem = 0; + } + HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0)); + std::cout << "\tProperly launched." << std::endl; + + HIPCHECK(hipDeviceSynchronize()); + for (int m = 0; m < 2; ++m) { + HIPCHECK(hipFree(good_dev_array[m])); + HIPCHECK(hipFree(bad_dev_array[m])); + HIPCHECK(hipStreamDestroy(streams[m])); + } + if (FailFlag == 1) { + break; + } + } + if (FailFlag == 1) { + failed(""); + } else { + passed(); + } +} diff --git a/projects/hip/tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp b/projects/hip/tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp new file mode 100644 index 0000000000..a0275d7ba5 --- /dev/null +++ b/projects/hip/tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp @@ -0,0 +1,581 @@ +/* +Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +// Test Description: +/*The general idea of the application is to test how multi-GPU Cooperative +Groups kernel launches to a stream interact with other things that may be +simultaneously running in the same streams. + +The HIP specification says that a multi-GPU cooperative launch will wait +until all of the streams it's using finish their work. Only then will the +cooperative kernel be launched to all of the devices. Then no other work +can take place in any of the streams until all of the multi-GPU +cooperative work is done. + +However, there are flags that allow you to disable each of these +serialization points: hipCooperativeLaunchMultiDeviceNoPreSync and +hipCooperativeLaunchMultiDeviceNoPostSync. + +As such, this benchmark tests the following five situations launching +to two GPUs (and thus two streams): + + 1. Normal multi-GPU cooperative kernel: + This should result in the following pattern: + Stream 0: Cooperative + Stream 1: Cooperative + 2. Regular kernel launches and multi-GPU cooperative kernel launches + with the default flags, resulting in the following pattern: + Stream 0: Regular --> Cooperative + Stream 1: --> Cooperative --> Regular + + 3. Regular kernel launches and multi-GPU cooperative kernel launches + that turn off "pre-sync". This should allow a cooperative kernel + to launch even if work is already in a stream pointing to + another GPU. + This should result in the following pattern: + Stream 0: Regular --> Cooperative + Stream 1: Cooperative --> Regular + + 4. 
Regular kernel launches and multi-GPU cooperative kernel launches + that turn off "post-sync". This should allow a new kernel to enter + a GPU even if another GPU still has a cooperative kernel on it. + This should result in the following pattern: + Stream 0: Regular --> Cooperative + Stream 1: --> Cooperative--> Regular + + 5. Regular kernel launches and multi-GPU cooperative kernel launches + that turn off both pre- and post-sync. This should allow any of + the kernels to launch to their GPU regardless of the status of + other kernels in other multi-GPU stream groups. + This should result in the following pattern: + Stream 0: Regular --> Cooperative + Stream 1: Cooperative --> Regular + +We time how long it takes to run each of these benchmarks and print it as +the output of the benchmark. The kernels themselves are just useless time- +wasting code so that the kernel takes a meaningful amount of time on the +GPU before it exits. We only launch a single wavefront for each kernel, so +any serialization should not be because of GPU occupancy concerns. + +If tests 2, 3, and 4 take roughly 3x as long as #1, that implies that +cooperative kernels are serialized as expected. + +If test #5 takes roughly twice as long as #1, that implies that the +overlap-allowing flags work as expected. 
+*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 + * TEST: %t + * HIT_END + */ + +#include +#include +#include +#include "test_common.h" + +static inline void hipCheckAndFail(hipError_t errval, + const char *file, int line) { + hipError_t last_err = hipGetLastError(); + if (errval != hipSuccess) { + std::cerr << "hip error: " << hipGetErrorString(errval); + std::cerr << std::endl; + std::cerr << " Location: " << file << ":" << line << std::endl; + failed(""); + } + if (last_err != errval) { + std::cerr << "Error: the return value of a function was not the same "; + std::cerr << "as the value returned by hipGetLastError()" << std::endl; + std::cerr << " Location: " << file << ":" << line << std::endl; + std::cerr << " Function returned: " << hipGetErrorString(errval); + std::cerr << " (" << errval << ")" << std::endl; + std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err); + std::cerr << " (" << last_err << ")" << std::endl; + failed(""); + } +} +#define hipCheckErr(errval) \ + do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0) + +static int cooperative_groups_support(int device_id) { + hipError_t err; + int cooperative_attribute; + HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute, + hipDeviceAttributeCooperativeLaunch, device_id)); + if (!cooperative_attribute) { + std::cerr << "Cooperative launch support not available in "; + std::cerr << "the device attribute for device " << device_id; + std::cerr << std::endl; + return 0; + } + + int multi_gpu_cooperative_attribute; + HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute, + hipDeviceAttributeCooperativeMultiDeviceLaunch, device_id)); + if (!multi_gpu_cooperative_attribute) { + std::cerr << "Multi-GPU cooperative launch support not available in "; + std::cerr << "the device attribute for device " << device_id; + std::cerr << std::endl; + return 0; + } + + hipDeviceProp_t 
device_properties; + HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); + if (device_properties.cooperativeLaunch == 0) { + std::cerr << "Cooperative group support not available in "; + std::cerr << "device properties." << std::endl; + return 0; + } + if (device_properties.cooperativeMultiDeviceLaunch == 0) { + std::cerr << "Multi-GPU cooperative group support not available in "; + std::cerr << "device properties." << std::endl; + return 0; + } + return 1; +} + +__global__ void test_coop_kernel(unsigned int loops, long long *array, + int fast_gpu) { + cooperative_groups::multi_grid_group mgrid = + cooperative_groups::this_multi_grid(); + unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; + + if (mgrid.grid_rank() == fast_gpu) { + return; + } + + for (int i = 0; i < loops; i++) { + long long start_clock = clock64(); + while (clock64() < (start_clock+1000000)) {} + array[rank] += clock64(); + } +} + +__global__ void test_kernel(uint32_t loops, unsigned long long *array) { + unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; + + for (int i = 0; i < loops; i++) { + long long start_clock = clock64(); + while (clock64() < (start_clock+1000000)) {} + array[rank] += clock64(); + } +} + +int main(int argc, char** argv) { + hipError_t err; + int device_num, FailFlag = 0; + uint32_t loops = 2000; + uint32_t fast_loops = 1; + int32_t fast_gpu = -1; + HIPCHECK(hipGetDeviceCount(&device_num)); + if (device_num < 2) { + std::cout << "This test requires atleast two gpus but the system has "; + std::cout << " only "<< device_num < max_blocks_per_sm * num_sm) { + std::cerr << "The requested number of blocks will not fit on the GPU"; + std::cerr << std::endl; + std::cerr << "You requested " << desired_blocks << " but we can only "; + std::cerr << "fit " << (max_blocks_per_sm * num_sm) << std::endl; + failed(""); + } + + /*************************************************************************/ + /* Create the streams we will use in this test. 
**************************/ + hipStream_t streams[2]; + for (int i = 0; i < 2; i++) { + HIPCHECK(hipSetDevice(dev + i)); + HIPCHECK(hipStreamCreate(&streams[i])); + } + + /*************************************************************************/ + /* Set up data to pass into the kernelx **********************************/ + + // Alocate the host input buffer, and two device-focused buffers that we + // will use for our test. + unsigned long long *dev_array[2]; + for (int i = 0; i < 2; i++) { + int good_size = desired_blocks * warp_size * sizeof(long long); + HIPCHECK(hipSetDevice(dev + i)); + HIPCHECK(hipMalloc(reinterpret_cast(&dev_array[i]), good_size)); + HIPCHECK(hipMemsetAsync(dev_array[i], 0, good_size, streams[i])); + } + for (int i = 0; i < 2; i++) { + HIPCHECK(hipSetDevice(dev + i)); + HIPCHECK(hipDeviceSynchronize()); + } + + /*************************************************************************/ + /* Launch the kernels ****************************************************/ + void *dev_params[2][3]; + hipLaunchParams md_params[2]; + std::chrono::time_point start_time[6]; + std::chrono::time_point end_time[6]; + + std::cout << "Test 0: Launching a multi-GPU cooperative kernel...\n"; + std::cout << "This should result in the following pattern:" << std::endl; + std::cout << "GPU " << dev << ": Long Coop Kernel" << std::endl; + std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel" << std::endl; + + for (int i = 0; i < 2; i++) { + dev_params[i][0] = reinterpret_cast(&loops); + dev_params[i][1] = reinterpret_cast(&dev_array[i]); + dev_params[i][2] = reinterpret_cast(&fast_gpu); + md_params[i].func = reinterpret_cast(test_coop_kernel); + md_params[i].gridDim = desired_blocks; + md_params[i].blockDim = warp_size; + md_params[i].sharedMem = 0; + md_params[i].stream = streams[i]; + md_params[i].args = dev_params[i]; + } + + start_time[0] = std::chrono::system_clock::now(); + HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0)); + for (int i = 0; 
i < 2; i++) { + HIPCHECK(hipSetDevice(dev + i)); + HIPCHECK(hipDeviceSynchronize()); + } + end_time[0] = std::chrono::system_clock::now(); + + std::cout << std::endl; + std::cout << "Test 1: Launching a multi-GPU cooperative kernel with the "; + std::cout << "following pattern:" << std::endl; + std::cout << "GPU " << dev << ": Standard Kernel --> Long Coop Kernel\n"; + std::cout << "GPU " << (dev + 1) << ": --> Coop "; + std::cout << "--> Standard Kernel\n"; + fast_gpu = 1; + start_time[1] = std::chrono::system_clock::now(); + HIPCHECK(hipSetDevice(dev)); + hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + streams[0], loops, dev_array[0]); + HIPCHECK(hipGetLastError()); + HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0)); + HIPCHECK(hipSetDevice(dev + 1)); + hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + streams[1], loops, dev_array[1]); + HIPCHECK(hipGetLastError()); + for (int i = 0; i < 2; i++) { + HIPCHECK(hipSetDevice(dev + i)); + HIPCHECK(hipDeviceSynchronize()); + } + end_time[1] = std::chrono::system_clock::now(); + fast_gpu = -1; + + std::cout << std::endl; + std::cout << "Test 2: Launching a multi-GPU cooperative kernel with the "; + std::cout << "following pattern:" << std::endl; + std::cout << "GPU " << dev << ": Standard Kernel --> Coop" << std::endl; + std::cout << "GPU " << (dev + 1) << ": --> Long Coop"; + std::cout << " Kernel --> "; + std::cout << "Standard Kernel\n"; + fast_gpu = 0; + start_time[2] = std::chrono::system_clock::now(); + HIPCHECK(hipSetDevice(dev)); + hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + streams[0], loops, dev_array[0]); + HIPCHECK(hipGetLastError()); + HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0)); + HIPCHECK(hipSetDevice(dev + 1)); + hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + streams[1], loops, dev_array[1]); + HIPCHECK(hipGetLastError()); + for (int i = 0; i < 2; i++) { 
+ HIPCHECK(hipSetDevice(dev + i)); + HIPCHECK(hipDeviceSynchronize()); + } + end_time[2] = std::chrono::system_clock::now(); + fast_gpu = -1; + + std::cout << std::endl; + std::cout << "Test 3: Launching a multi-GPU cooperative kernel with the "; + std::cout << "ability to overlap regular and cooperative kernels "; + std::cout << "only at the beginning." << std::endl; + std::cout << "This should result in the following pattern:" << std::endl; + std::cout << "GPU " << dev << ": Standard Kernel --> Coop" << std::endl; + std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel --> Standard"; + std::cout<< " Kernel\n"; + fast_gpu = 0; + start_time[3] = std::chrono::system_clock::now(); + HIPCHECK(hipSetDevice(dev)); + hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + streams[0], loops, dev_array[0]); + HIPCHECK(hipGetLastError()); + HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, + hipCooperativeLaunchMultiDeviceNoPreSync)); + HIPCHECK(hipSetDevice(dev + 1)); + hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + streams[1], loops, dev_array[1]); + HIPCHECK(hipGetLastError()); + for (int i = 0; i < 2; i++) { + HIPCHECK(hipSetDevice(dev + i)); + HIPCHECK(hipDeviceSynchronize()); + } + end_time[3] = std::chrono::system_clock::now(); + fast_gpu = -1; + + std::cout << std::endl; + std::cout << "Test 4: Launching a multi-GPU cooperative kernel with the "; + std::cout << "ability to overlap regular and cooperative kernels "; + std::cout << "only at the end." 
<< std::endl; + std::cout << "This should result in the following pattern:" << std::endl; + std::cout << "GPU " << dev << ": Standard Kernel --> Long Coop Kernel\n"; + std::cout << "GPU " << (dev + 1) << ": --> Coop --> "; + std::cout << "Standard Kernel\n"; + fast_gpu = 1; + start_time[4] = std::chrono::system_clock::now(); + HIPCHECK(hipSetDevice(dev)); + hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + streams[0], loops, dev_array[0]); + HIPCHECK(hipGetLastError()); + HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, + hipCooperativeLaunchMultiDeviceNoPostSync)); + HIPCHECK(hipSetDevice(dev + 1)); + hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + streams[1], loops, dev_array[1]); + for (int i = 0; i < 2; i++) { + HIPCHECK(hipSetDevice(dev + i)); + HIPCHECK(hipDeviceSynchronize()); + } + end_time[4] = std::chrono::system_clock::now(); + fast_gpu = -1; + + std::cout << std::endl; + std::cout << "Test 5: Launching a multi-GPU cooperative kernel with the "; + std::cout << "ability to overlap regular and cooperative kernels"; + std::cout << std::endl; + std::cout << "This should result in the following pattern:" << std::endl; + std::cout << "GPU " << dev << ": Standard Kernel --> Long Coop Kernel\n"; + std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel --> Standard"; + std::cout << " Kernel\n"; + start_time[5] = std::chrono::system_clock::now(); + HIPCHECK(hipSetDevice(dev)); + hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + streams[0], loops, dev_array[0]); + HIPCHECK(hipGetLastError()); + HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, + hipCooperativeLaunchMultiDeviceNoPreSync | + hipCooperativeLaunchMultiDeviceNoPostSync)); + HIPCHECK(hipSetDevice(dev + 1)); + hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + streams[1], loops, dev_array[1]); + HIPCHECK(hipGetLastError()); + for (int i = 0; i < 2; i++) { + 
HIPCHECK(hipSetDevice(dev + i)); + HIPCHECK(hipDeviceSynchronize()); + } + end_time[5] = std::chrono::system_clock::now(); + + std::chrono::duration single_kernel_time = + (end_time[0] - start_time[0]); + std::chrono::duration serialized_gpu0_time = + (end_time[1] - start_time[1]); + std::chrono::duration serialized_gpu1_time = + (end_time[2] - start_time[2]); + std::chrono::duration pre_overlapped_time = + (end_time[3] - start_time[3]); + std::chrono::duration post_overlapped_time = + (end_time[4] - start_time[4]); + std::chrono::duration overlapped_time = + (end_time[5] - start_time[5]); + + std::cout << "Test 0: A single kernel on both GPUs took:" << std::endl; + std::cout << " " << single_kernel_time.count(); + std::cout << " seconds" << std::endl; + std::cout << std::endl; + std::cout << "Test 1: Serialized set of three kernels with GPU0"; + std::cout << " being long took:"; + std::cout << " " << serialized_gpu0_time.count(); + std::cout << " seconds" << std::endl; + std::cerr << "Expect between " << (2.7 * single_kernel_time.count()); + std::cerr << " and "; + std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n"; + std::cout << std::endl; + std::cout << "Test 2: Serialized set of three kernels with GPU1"; + std::cout << " being long took:" << std::endl; + std::cout << " " << serialized_gpu1_time.count(); + std::cout << " seconds" << std::endl; + std::cerr << "Expect between " << (2.7 * single_kernel_time.count()); + std::cerr << " and "; + std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n"; + std::cout << std::endl; + std::cout << "Test 3: Multiple kernels with pre-overlap allowed took:\n"; + std::cout << " " << pre_overlapped_time.count(); + std::cout << " seconds" << std::endl; + std::cerr << "Expect between " << (1.7 * single_kernel_time.count()); + std::cerr << " and "; + std::cerr << (2.3 * single_kernel_time.count()) << " seconds.\n"; + std::cout << std::endl; + std::cout << "Test 4: Multiple kernels with post-overlap allowed 
took:\n"; + std::cout << " " << post_overlapped_time.count(); + std::cout << " seconds" << std::endl; + std::cerr << "Expect between " << (1.7 * single_kernel_time.count()); + std::cerr << " and "; + std::cerr << (2.3 * single_kernel_time.count()) << " seconds."; + std::cout << std::endl; + std::cout << "Test 5: Multiple kernels with overlap allowed took:\n"; + std::cout << " " << overlapped_time.count(); + std::cout << " seconds" << std::endl; + std::cerr << "Expect between " << (1.8 * single_kernel_time.count()); + std::cerr << " and "; + std::cerr << (2.2 * single_kernel_time.count()) << " seconds.\n"; + + // Test that fully not-overlapped kernels take roughly 3x as long as one + // cooperative kernel. + if (serialized_gpu0_time > 3.3 * single_kernel_time || + serialized_gpu0_time < 2.7 * single_kernel_time) { + std::cerr << "ERROR!" << std::endl; + std::cerr << "Test 1, the first case where all kernels should be "; + std::cerr << "serialized, had a runtime that was very different "; + std::cerr << "than what was expected." << std::endl; + std::cerr << "Was " << serialized_gpu0_time.count() << " seconds.\n"; + std::cerr << "Expected between "; + std::cerr << (2.7 * single_kernel_time.count()) << " and "; + std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n"; + std::cerr << "Were they truly serialized?" << std::endl; + FailFlag = 1; + } + + // Test that fully not-overlapped kernels take roughly 3x as long as one + // cooperative kernel. + if (serialized_gpu1_time > 3.3 * single_kernel_time || + serialized_gpu1_time < 2.7 * single_kernel_time) { + std::cerr << "ERROR!" << std::endl; + std::cerr << "Test 2, the second case where all kernels should be "; + std::cerr << "serialized, had a runtime that was very different "; + std::cerr << "than what was expected." << std::endl; + std::cerr << "Was " << serialized_gpu1_time.count(); + std::cerr << " seconds." 
<< std::endl; + std::cerr << "Expected between "; + std::cerr << (2.7 * single_kernel_time.count()) << " and "; + std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n"; + std::cerr << "Were they truly serialized?" << std::endl; + FailFlag = 1; + } + + // Test that kernels that can overlap only before the cooperative kernel + // launches kernels take roughly the same time (in this case) + if (pre_overlapped_time > 2.3 * single_kernel_time || + pre_overlapped_time < 1.7 * single_kernel_time) { + std::cerr << "ERROR!" << std::endl; + std::cerr << "Test 3, the case where the last kernel is serialized, had "; + std::cerr << "a runtime that was very different than what was "; + std::cerr << "expected." << std::endl; + std::cerr << "Was " << pre_overlapped_time.count() << " seconds.\n"; + std::cerr << "Expected between "; + std::cerr << (1.7 * single_kernel_time.count()) << " and "; + std::cerr << (2.3 * single_kernel_time.count()) << " seconds.\n"; + FailFlag = 1; + } + + // Test that kernels that can overlap only after the cooperative kernel + // launches kernels take roughly the same time (in this case) + if (post_overlapped_time > 2.3 * single_kernel_time || + post_overlapped_time < 1.7 * single_kernel_time) { + std::cerr << "ERROR!" << std::endl; + std::cerr << "Teste 4, the case where the first kernel is "; + std::cerr << "serialized, had a runtime that was very different "; + std::cerr << "than what was expected." << std::endl; + std::cerr << "Was " << post_overlapped_time.count() << " seconds.\n"; + std::cerr << "Expected between "; + std::cerr << (1.7 * single_kernel_time.count()) << " and "; + std::cerr << (2.3 * single_kernel_time.count()) << " seconds.\n"; + FailFlag = 1; + } + + // Test that, with the right flags on the kernel launch, that we prevent + // incomplete launches from serializing the cooperative launch streams. + if (overlapped_time > 2.2 * single_kernel_time || + overlapped_time < 1.8 * single_kernel_time) { + std::cerr << "ERROR!" 
<< std::endl; + std::cerr << "Test 5, the case where normal and cooperative kernel "; + std::cerr << "launches should overlap, does not appear to have done so."; + std::cerr << std::endl; + std::cerr << "Was " << overlapped_time.count() << " seconds.\n"; + std::cerr << "Expected between "; + std::cerr << (1.8 * single_kernel_time.count()) << " and "; + std::cerr << (2.2 * single_kernel_time.count()) << " seconds.\n"; + std::cerr << "Is the normal kernel being serialized with the "; + std::cerr << "cooperative kernels on different streams?" << std::endl; + FailFlag = 1; + } + for (int k = 0; k < 2; ++k) { + HIPCHECK(hipFree(dev_array[k])); + HIPCHECK(hipStreamDestroy(streams[k])); + } + if (FailFlag == 1) { + break; + } + } + if (FailFlag == 1) { + failed(""); + } else { + passed(); + } +} diff --git a/projects/hip/tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp b/projects/hip/tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp new file mode 100644 index 0000000000..f2f9814dba --- /dev/null +++ b/projects/hip/tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp @@ -0,0 +1,374 @@ +/* +Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +// Test Description: +/*The general idea of the application is to launch N warps to all GPUs detected +in the HIP system. N is a command-line parameter, but the user should set N +small enough that all warps can be on each of the GPUs at the same time. + +All of the warps do a "work loop". Within the work loop, every warp +atomically increments a global variable that is shared between both of the +target GPUs. The value returned from this atomic increment entirely depends +on the order the warps from the GPUs arrive at the atomic instruction. Each +warp then stores the result into a global array based on its warp ID. + +We also add a sleep/wait loop into the code so that the last warp runs much +slower than everyone else. As such, it should store much larger values than +all the other warps. + +If there is no barrier within the loop, then the last warp will likely get to +the global variable the first time after all the other warps have each +incremented it many times. If the barrier properly works, then each warp +will increment the variable once per time through the loop, and all threads +will sleep on the barrier waiting for the last warp to finally catch up.
+*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -rdc=true -gencode arch=compute_60,code=sm_60 + * TEST: %t + * HIT_END + */ + +#include +#include +#include "test_common.h" + +static int cooperative_groups_support(int device_id) { + hipError_t err; + int cooperative_attribute; + HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute, + hipDeviceAttributeCooperativeLaunch, device_id)); + if (!cooperative_attribute) { + std::cerr << "Cooperative launch support not available in "; + std::cerr << "the device attribute for device " << device_id; + std::cerr << std::endl; + return 0; + } + + int multi_gpu_cooperative_attribute; + HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute, + hipDeviceAttributeCooperativeMultiDeviceLaunch, device_id)); + if (!multi_gpu_cooperative_attribute) { + std::cerr << "Multi-GPU cooperative launch support not available in "; + std::cerr << "the device attribute for device " << device_id; + std::cerr << std::endl; + return 0; + } + + hipDeviceProp_t device_properties; + HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); + if (device_properties.cooperativeLaunch == 0) { + std::cerr << "Cooperative group support not available in "; + std::cerr << "device properties." << std::endl; + return 0; + } + if (device_properties.cooperativeMultiDeviceLaunch == 0) { + std::cerr << "Multi-GPU cooperative group support not available in "; + std::cerr << "device properties." << std::endl; + return 0; + } + return 1; +} + +static int verify_barrier_buffer(unsigned int loops, unsigned int warps, + unsigned int *host_buffer, + unsigned int num_devs) { + unsigned int max_in_this_loop = 0; + for (unsigned int i = 0; i < loops; i++) { + max_in_this_loop += (warps * num_devs); + for (unsigned int j = 0; j < warps; j++) { + if (host_buffer[i*warps+j] > max_in_this_loop) { + std::cerr << "Barrier failure!" 
<< std::endl; + std::cerr << " Buffer entry " << i*warps+j; + std::cerr << " contains the value " << host_buffer[i*warps+j]; + std::cerr << " but it should not be more than "; + std::cerr << max_in_this_loop << std::endl; + return -1; + } + } + } + std::cout << "\tBarriers work properly!" << std::endl; + return 0; +} + +static int verify_multi_gpu_buffer(unsigned int loops, unsigned int array_val) { + unsigned int desired_val = 0; + for (int i = 0; i < loops; i++) { + if (i % 2 == 0) { + desired_val += 2; + } else { + desired_val *= 2; + } + } + std::cout << "Desired value is " << desired_val << std::endl; + if (array_val != desired_val) { + std::cerr << "ERROR! Multi-grid barrier does not appear to work."; + std::cerr << std::endl; + std::cerr << "Expected the multi-GPUs to work together to produce "; + std::cerr << "the value " << desired_val << std::endl; + std::cerr << "However, the entry returned from the multi-GPU "; + std::cerr << "kernel was " << array_val << std::endl; + return -1; + } + std::cout << "\tMulti-GPU barriers appear to work here." << std::endl; + return 0; +} + +__global__ void +test_kernel(unsigned int *atomic_val, unsigned int *global_array, + unsigned int *array, uint32_t loops) { + cooperative_groups::grid_group grid = cooperative_groups::this_grid(); + cooperative_groups::multi_grid_group mgrid = + cooperative_groups::this_multi_grid(); + unsigned rank = grid.thread_rank(); + unsigned global_rank = mgrid.thread_rank(); + + int offset = blockIdx.x; + for (int i = 0; i < loops; i++) { + // Make the last thread run way behind everyone else. + // If the grid barrier below fails, then the other threads may hit the + // atomicInc instruction many times before the last thread ever gets + // to it. + // As such, without the barrier, the last array entry will eventually + // contain a very large value, defined by however many times the other + // wavefronts make it through this loop. 
+ // If the barrier works, then it will likely contain some number + // near "total number of blocks". It will be the last wavefront to + // reach the atomicInc, but everyone will have only hit the atomic once. + if (rank == (grid.size() - 1)) { + long long start_clock = clock64(); + while (clock64() < (start_clock+1000000)) {} + } + if (threadIdx.x == 0) { + array[offset] = atomicInc(atomic_val, UINT_MAX); + } + grid.sync(); + + // Make the last thread in the entire multi-grid run way behind + // everyone else. + // If the mgrid barrier below fails, then the two global_array entries + // will end up being out of sync, because the intermingling of adds + // and multiplies will not be aligned between to the two GPUs. + if (global_rank == (mgrid.size() - 1)) { + long long start_clock = clock64(); + while (clock64() < (start_clock+100000000)) {} + } + // During even iterations, add into your own array entry + // During odd iterations, add into your partner's array entry + unsigned grid_rank = mgrid.grid_rank(); + unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids(); + if (rank == (grid.size() - 1)) { + if (i % mgrid.num_grids() == 0) { + global_array[grid_rank] += 2; + } else { + global_array[inter_gpu_offset] *= 2; + } + } + mgrid.sync(); + offset += gridDim.x; + } +} + +int main(int argc, char** argv) { + hipError_t err; + int num_devices = 0; + uint32_t loops = 2; + uint32_t warps = 10; + uint32_t block_size = 1; + + std::cout << "Loops: " << loops << std::endl; + std::cout << "Warps: " << warps << std::endl; + std::cout << "Block size: " << block_size << std::endl; + + HIPCHECK(hipGetDeviceCount(&num_devices)); + if (num_devices < 2) { + std::cout << "Not enough GPUs to run test." 
<< std::endl; + std::cout << "We require at least 2 GPUs, but only found "; + std::cout << num_devices << std::endl; + std::cout << "Skipping the test with PASSED result\n"; + passed(); + } + + uint32_t device_num[num_devices]; + + /*************************************************************************/ + /* Test whether target device supports cooperative groups ****************/ + for (int i = 0; i < num_devices; i++) { + device_num[i] = i; + if (!cooperative_groups_support(device_num[i])) { + std::cout << "Skipping the test with Pass result.\n"; + passed(); + } + } + + /*************************************************************************/ + /* Test whether the requested size will fit on the GPU *******************/ + int warp_sizes[num_devices]; + int num_sms[num_devices]; + hipDeviceProp_t device_properties[num_devices]; + int warp_size = INT_MAX; + int num_sm = INT_MAX; + for (int i = 0; i < num_devices; i++) { + HIPCHECK(hipGetDeviceProperties(&device_properties[i], device_num[i])); + warp_sizes[i] = device_properties[i].warpSize; + if (warp_sizes[i] < warp_size) { + warp_size = warp_sizes[i]; + } + num_sms[i] = device_properties[i].multiProcessorCount; + if (num_sms[i] < num_sm) { + num_sm = num_sms[i]; + } + std::cout << "Device " << (i + 1); + std::cout << " name: " << device_properties[i].name << std::endl; + } + std::cout << std::endl; + + int num_threads_in_block = block_size * warp_size; + + // Calculate the device occupancy to know how many blocks can be run. 
+ int max_blocks_per_sm_arr[num_devices]; + int max_blocks_per_sm = INT_MAX; + for (int i = 0; i < num_devices; i++) { + HIPCHECK(hipSetDevice(device_num[i])); + HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block, 0)); + if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) { + max_blocks_per_sm = max_blocks_per_sm_arr[i]; + } + } + + int requested_blocks = warps / block_size; + if (requested_blocks > max_blocks_per_sm * num_sm) { + std::cerr << "Requesting to run " << requested_blocks << " blocks, "; + std::cerr << "but we can only guarantee to simultaneously run "; + std::cerr << (max_blocks_per_sm * num_sm) << std::endl; + failed(""); + } + + /*************************************************************************/ + /* Set up data to pass into the kernel ***********************************/ + // Each block will output a single value per loop. + uint32_t total_buffer_len = requested_blocks*loops; + + // Alocate the buffer that will hold the kernel's output, and which will + // also be used to globally synchronize during GWS initialization + unsigned int *host_buffer[num_devices]; + unsigned int *kernel_buffer[num_devices]; + unsigned int *kernel_atomic[num_devices]; + hipStream_t streams[num_devices]; + for (int i = 0; i < num_devices; i++) { + host_buffer[i] = (unsigned int*)calloc(total_buffer_len, + sizeof(unsigned int)); + HIPCHECK(hipSetDevice(device_num[i])); + HIPCHECK(hipMalloc(reinterpret_cast(&kernel_buffer[i]), + total_buffer_len * sizeof(unsigned int))); + HIPCHECK(hipMemcpy(kernel_buffer[i], host_buffer[i], + total_buffer_len * sizeof(unsigned int), + hipMemcpyHostToDevice)); + HIPCHECK(hipMalloc(reinterpret_cast(&kernel_atomic[i]), + sizeof(unsigned int))); + HIPCHECK(hipMemset(kernel_atomic[i], 0, sizeof(unsigned int))); + HIPCHECK(hipStreamCreate(&streams[i])); + } + + // Single kernel atomic shared between both devices; put it on the host + unsigned int* global_array; + 
HIPCHECK(hipHostMalloc(reinterpret_cast(&global_array), + num_devices * sizeof(unsigned int), 0)); + HIPCHECK(hipMemset(global_array, 0, num_devices * sizeof(unsigned int))); + + /*************************************************************************/ + /* Launch the kernels ****************************************************/ + std::cout << "Launching a kernel with " << warps << " warps "; + std::cout << "in " << requested_blocks << " thread blocks."; + std::cout << std::endl; + + void *dev_params[num_devices][4]; + hipLaunchParams md_params[num_devices]; + for (int i = 0; i < num_devices; i++) { + dev_params[i][0] = reinterpret_cast(&kernel_atomic[i]); + dev_params[i][1] = reinterpret_cast(&global_array); + dev_params[i][2] = reinterpret_cast(&kernel_buffer[i]); + dev_params[i][3] = reinterpret_cast(&loops); + md_params[i].func = reinterpret_cast(test_kernel); + md_params[i].gridDim = requested_blocks; + md_params[i].blockDim = num_threads_in_block; + md_params[i].sharedMem = 0; + md_params[i].stream = streams[i]; + md_params[i].args = dev_params[i]; + } + + HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, num_devices, 0)); + HIPCHECK(hipDeviceSynchronize()); + + /*************************************************************************/ + /* Read back the buffers and print out its data **************************/ + for (int dev = 0; dev < num_devices; dev++) { + HIPCHECK(hipMemcpy(host_buffer[dev], kernel_buffer[dev], + total_buffer_len * sizeof(unsigned int), + hipMemcpyDeviceToHost)); + } + + for (unsigned int i = 0; i < loops; i++) { + for (int dev = 0; dev < num_devices; dev++) { + std::cout << "+++++++++++++++++ Device " << dev; + std::cout << "+++++++++++++++++" << std::endl; + for (unsigned int j = 0; j < requested_blocks; j++) { + std::cout << "Buffer entry " << (i*warps+j); + std::cout << " (written by warp " << j << ")"; + std::cout << " is " << host_buffer[dev][i*requested_blocks+j]; + std::cout << std::endl; + } + } + std::cout << 
"==========================\n"; + } + for (unsigned int dev = 0; dev < num_devices; dev++) { + std::cout << "Testing output from device " << dev << std::endl; + int local_ret_val = verify_barrier_buffer(loops, requested_blocks, + host_buffer[dev], num_devices); + if (local_ret_val) { + failed(""); + } + } + + std::cout << std::endl << "The multi-GPU shared updates contain:\n"; + for (int i = 0; i < num_devices; i++) { + std::cout << "Entry " << i << ": "; + std::cout << global_array[i] << std::endl; + } + int flag = 0; + for (int dev = 0; dev < num_devices; dev++) { + std::cout << "Testing multi-GPU output for entry " << dev << std::endl; + int local_ret_val = verify_multi_gpu_buffer(loops, global_array[dev]); + if (local_ret_val) { + flag = 1; + } + } + for (int k = 0; k < num_devices; ++k) { + HIPCHECK(hipFree(kernel_buffer[k])); + HIPCHECK(hipFree(kernel_atomic[k])); + HIPCHECK(hipStreamDestroy(streams[k])); + free(host_buffer[k]); + } + if (flag == 1) { + failed(""); + } else { + passed(); + } +} diff --git a/projects/hip/tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp b/projects/hip/tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp new file mode 100644 index 0000000000..77aa63d3c6 --- /dev/null +++ b/projects/hip/tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp @@ -0,0 +1,233 @@ +/* +Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +// Test Description: +/*The general idea of the application is to launch N warps. N is a command-line +parameter, but the user should set N small enough that all warps can be on +the GPU at the same time. + +All of the warps do a "work loop". Within the work loop, every warp +atomically increments a global variable. The value returned from this atomic +increment entirely depends on the order the threads arrive at the atomic +instruction. Each warp then stores the result into a global array based on its +warp ID. + +We also add a sleep/wait loop into the code so that the last warp runs much +slower than everyone else. As such, it should store much larger values than +all the other warps. + +If there is no barrier within the loop, then the last warp will likely get to +the global variable the first time after all the other warps have each +incremented it many times.
If the barrier properly works, then each warp +will increment the variable once per time through the loop, and all threads +will sleep on the barrier waiting for the last warp to finally catch up. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * TEST: %t + * HIT_END + */ + +#include +#include +#include "test_common.h" + +static int cooperative_groups_support(int device_id) { + hipError_t err; + int cooperative_attribute; + HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute, + hipDeviceAttributeCooperativeLaunch, device_id)); + if (!cooperative_attribute) { + std::cerr << "Cooperative launch support not available in "; + std::cerr << "the device attribute for device " << device_id; + std::cerr << std::endl; + return 0; + } + + hipDeviceProp_t device_properties; + HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); + if (device_properties.cooperativeLaunch == 0) { + std::cerr << "Cooperative group support not available in "; + std::cerr << "device properties." << std::endl; + return 0; + } + return 1; +} + +static int verify_barrier_buffer(unsigned int loops, unsigned int warps, + unsigned int *host_buffer) { + unsigned int max_in_this_loop = 0; + for (unsigned int i = 0; i < loops; i++) { + max_in_this_loop += warps; + for (unsigned int j = 0; j < warps; j++) { + if (host_buffer[i*warps+j] > max_in_this_loop) { + std::cerr << "Barrier failure!" << std::endl; + std::cerr << " Buffer entry " << i*warps+j; + std::cerr << " contains the value " << host_buffer[i*warps+j]; + std::cerr << " but it should not be more than "; + std::cerr << max_in_this_loop << std::endl; + return -1; + } + } + } + std::cout << "Barriers work properly!" 
<< std::endl; + return 0; +} + +__global__ void +test_kernel(unsigned int *atomic_val, unsigned int *array, + unsigned int loops) { + cooperative_groups::grid_group grid = cooperative_groups::this_grid(); + unsigned rank = grid.thread_rank(); + + int offset = blockIdx.x; + for (int i = 0; i < loops; i++) { + // Make the last thread run way behind everyone else. + // If the barrier below fails, then the other threads may hit the + // atomicInc instruction many times before the last thread ever gets + // to it. + // As such, without the barrier, the last array entry will eventually + // contain a very large value, defined by however many times the other + // wavefronts make it through this loop. + // If the barrier works, then it will likely contain some number + // near "total number of blocks". It will be the last wavefront to + // reach the atomicInc, but everyone will have only hit the atomic once. + if (rank == (grid.size() - 1)) { + long long start_clock = clock64(); + while (clock64() < (start_clock+1000000)) {} + } + + if (threadIdx.x == 0) { + array[offset] = atomicInc(&atomic_val[0], UINT_MAX); + } + grid.sync(); + offset += gridDim.x; + } +} + +int main(int argc, char** argv) { + hipError_t err; + int device_num; + uint32_t loops = 2; + uint32_t warps = 10; + uint32_t block_size = 1; + HIPCHECK(hipGetDeviceCount(&device_num)); + for (int dev = 0; dev < device_num; ++dev) { + std::cout << "Device number: " << dev << std::endl; + std::cout << "Loops: " << loops << std::endl; + std::cout << "Warps: " << warps << std::endl; + std::cout << "Block size: " << block_size << std::endl; + + /*************************************************************************/ + /* Test whether target device supports cooperative groups ****************/ + HIPCHECK(hipSetDevice(dev)); + if (!cooperative_groups_support(dev)) { + std::cout << "Skipping the test with Pass result.\n"; + passed(); + } + + /*************************************************************************/ + 
/* Test whether the requested size will fit on the GPU *******************/ + int warp_size; + int num_sms; + int max_blocks_per_sm; + hipDeviceProp_t device_properties; + HIPCHECK(hipGetDeviceProperties(&device_properties, dev)); + warp_size = device_properties.warpSize; + num_sms = device_properties.multiProcessorCount; + + std::cout << "Device name: " << device_properties.name << std::endl; + std::cout << std::endl; + + int num_threads_in_block = block_size * warp_size; + + // Calculate the device occupancy to know how many blocks can be run. + HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, + test_kernel, num_threads_in_block, 0)); + + int requested_blocks = warps / block_size; + if (requested_blocks > max_blocks_per_sm * num_sms) { + std::cerr << "Requesting to run " << requested_blocks << " blocks, "; + std::cerr << "but we can only guarantee to simultaneously run "; + std::cerr << (max_blocks_per_sm * num_sms) << std::endl; + failed(""); + } + + /*************************************************************************/ + /* Set up data to pass into the kernel ***********************************/ + // Each block will output a single value per loop. 
+ uint32_t total_buffer_len = requested_blocks*loops; + + // Alocate the buffer that will hold the kernel's output, and which will + // also be used to globally synchronize during GWS initialization + unsigned int *host_buffer = (unsigned int*)calloc(total_buffer_len, + sizeof(unsigned int)); + + unsigned int *kernel_buffer; + HIPCHECK(hipMalloc(reinterpret_cast(&kernel_buffer), + total_buffer_len * sizeof(unsigned int))); + HIPCHECK(hipMemcpy(kernel_buffer, host_buffer, + total_buffer_len * sizeof(unsigned int), + hipMemcpyHostToDevice)); + + unsigned int *kernel_atomic; + HIPCHECK(hipMalloc(reinterpret_cast(&kernel_atomic), + sizeof(unsigned int))); + HIPCHECK(hipMemset(kernel_atomic, 0, sizeof(unsigned int))); + + /*************************************************************************/ + /* Launch the kernel *****************************************************/ + std::cout << "Launching a kernel with " << warps << " warps "; + std::cout << "in " << requested_blocks << " thread blocks."; + std::cout << std::endl; + + void *params[3]; + params[0] = reinterpret_cast(&kernel_atomic); + params[1] = reinterpret_cast(&kernel_buffer); + params[2] = reinterpret_cast(&loops); + HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), + requested_blocks, + num_threads_in_block, params, 0, NULL)); + + /*************************************************************************/ + /* Read back the buffer and print out its data****************************/ + HIPCHECK(hipMemcpy(host_buffer, kernel_buffer, + total_buffer_len * sizeof(unsigned int), + hipMemcpyDeviceToHost)); + + for (unsigned int i = 0; i < loops; i++) { + for (unsigned int j = 0; j < requested_blocks; j++) { + std::cout << "Buffer entry " << (i*warps+j); + std::cout << " (written by warp " << j << ")"; + std::cout << " is " << host_buffer[i * requested_blocks + j]; + std::cout << std::endl; + } + std::cout << "==========================\n"; + } + int ret_val = verify_barrier_buffer(loops, 
requested_blocks, host_buffer); + HIPCHECK(hipFree(kernel_buffer)); + HIPCHECK(hipFree(kernel_atomic)); + if (ret_val == -1) { + failed(""); + } else { + passed(); + } + } +} diff --git a/projects/hip/tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp b/projects/hip/tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp new file mode 100644 index 0000000000..ae793cf6a1 --- /dev/null +++ b/projects/hip/tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp @@ -0,0 +1,374 @@ +/* +Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +// Test Description: +/*The general idea of the application is to launch N warps to each of two GPUs. +N is a command-line parameter, but the user should set N small enough that all +warps can be on each of the GPUs at the same time. + +All of the warps do a "work loop". 
Within the work loop, every warp +atomically increments a global variable that is shared between both fo the +target GPUs. The value returned from this atomic increment entriely depends +on the order the warps from the GPUs arrive at the atomic instruction. Each +warp then stores the result into a global array based on its warp ID. + +We also add a sleep/wait loop into the code so that the last warp runs much +slower than everyone else. As such, it should store much larger values than +all the other warps. + +If there are no barrier within the loop, then warp 0 will likely ge to the +global variable the first time while all the other warps have each +incremented it many times. If the barrier properly works, then each warp +will increment the variable once per time through the loop, and all threads +will sleep on the barrier waiting for the last warp to finally catch up. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -rdc=true -gencode arch=compute_60,code=sm_60 + * TEST: %t + * HIT_END + */ + +#include +#include +#include "test_common.h" + +static int cooperative_groups_support(int device_id) { + hipError_t err; + int cooperative_attribute; + HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute, + hipDeviceAttributeCooperativeLaunch, device_id)); + if (!cooperative_attribute) { + std::cerr << "Cooperative launch support not available in "; + std::cerr << "the device attribute for device " << device_id; + std::cerr << std::endl; + return 0; + } + + int multi_gpu_cooperative_attribute; + HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute, + hipDeviceAttributeCooperativeMultiDeviceLaunch, device_id)); + if (!multi_gpu_cooperative_attribute) { + std::cerr << "Multi-GPU cooperative launch support not available in "; + std::cerr << "the device attribute for device " << device_id; + std::cerr << std::endl; + return 0; + } + + hipDeviceProp_t device_properties; + HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); + if 
(device_properties.cooperativeLaunch == 0) { + std::cerr << "Cooperative group support not available in "; + std::cerr << "device properties." << std::endl; + return 0; + } + if (device_properties.cooperativeMultiDeviceLaunch == 0) { + std::cerr << "Multi-GPU cooperative group support not available in "; + std::cerr << "device properties." << std::endl; + return 0; + } + return 1; +} + +static int verify_barrier_buffer(unsigned int loops, unsigned int warps, + unsigned int *host_buffer, + unsigned int num_devs) { + unsigned int max_in_this_loop = 0; + for (unsigned int i = 0; i < loops; i++) { + max_in_this_loop += (warps * num_devs); + for (unsigned int j = 0; j < warps; j++) { + if (host_buffer[i*warps+j] > max_in_this_loop) { + std::cerr << "Barrier failure!" << std::endl; + std::cerr << " Buffer entry " << i*warps+j; + std::cerr << " contains the value " << host_buffer[i*warps+j]; + std::cerr << " but it should not be more than "; + std::cerr << max_in_this_loop << std::endl; + return -1; + } + } + } + std::cout << "\tBarriers work properly!" << std::endl; + return 0; +} + +static int verify_multi_gpu_buffer(unsigned int loops, unsigned int array_val) { + unsigned int desired_val = 0; + for (int i = 0; i < loops; i++) { + if (i % 2 == 0) { + desired_val += 2; + } else { + desired_val *= 2; + } + } + std::cout << "Desired value is " << desired_val << std::endl; + if (array_val != desired_val) { + std::cerr << "ERROR! Multi-grid barrier does not appear to work."; + std::cerr << std::endl; + std::cerr << "Expected the multi-GPUs to work together to produce "; + std::cerr << "the value " << desired_val << std::endl; + std::cerr << "However, the entry returned from the multi-GPU "; + std::cerr << "kernel was " << array_val << std::endl; + return -1; + } + std::cout << "\tMulti-GPU barriers appear to work here." 
<< std::endl; + return 0; +} + +__global__ void +test_kernel(unsigned int *atomic_val, unsigned int *global_array, + unsigned int *array, uint32_t loops) { + cooperative_groups::grid_group grid = cooperative_groups::this_grid(); + cooperative_groups::multi_grid_group mgrid = + cooperative_groups::this_multi_grid(); + unsigned rank = grid.thread_rank(); + unsigned global_rank = mgrid.thread_rank(); + + int offset = blockIdx.x; + for (int i = 0; i < loops; i++) { + // Make the last thread run way behind everyone else. + // If the grid barrier below fails, then the other threads may hit the + // atomicInc instruction many times before the last thread ever gets + // to it. + // As such, without the barrier, the last array entry will eventually + // contain a very large value, defined by however many times the other + // wavefronts make it through this loop. + // If the barrier works, then it will likely contain some number + // near "total number of blocks". It will be the last wavefront to + // reach the atomicInc, but everyone will have only hit the atomic once. + if (rank == (grid.size() - 1)) { + long long start_clock = clock64(); + while (clock64() < (start_clock + 1000000)) {} + } + if (threadIdx.x == 0) { + array[offset] = atomicInc(atomic_val, UINT_MAX); + } + grid.sync(); + + // Make the last thread in the entire multi-grid run way behind + // everyone else. + // If the mgrid barrier below fails, then the two global_array entries + // will end up being out of sync, because the intermingling of adds + // and multiplies will not be aligned between to the two GPUs. 
+ if (global_rank == (mgrid.size() - 1)) { + long long start_clock = clock64(); + while (clock64() < (start_clock + 100000000)) {} + } + // During even iterations, add into your own array entry + // During odd iterations, add into your partner's array entry + unsigned grid_rank = mgrid.grid_rank(); + unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids(); + if (rank == (grid.size() - 1)) { + if (i % mgrid.num_grids() == 0) { + global_array[grid_rank] += 2; + } else { + global_array[inter_gpu_offset] *= 2; + } + } + mgrid.sync(); + offset += gridDim.x; + } +} + +int main(int argc, char** argv) { + hipError_t err; + int device_num = 0, flag = 0; + uint32_t loops = 2; + uint32_t warps = 10; + uint32_t block_size = 1; + HIPCHECK(hipGetDeviceCount(&device_num)); + if (device_num < 2) { + std::cout << "This test needs atleast two gpus but found only"; + std::cout << device_num << std::endl; + std::cout << "Hence skipping the test with pass result\n"; + passed(); + } + + for (int d = 0; d < (device_num - 1); ++d) { + std::cout << "First device number: " << d << std::endl; + std::cout << "Second device number: " << (d + 1) << std::endl; + std::cout << "Loops: " << loops << std::endl; + std::cout << "Warps: " << warps << std::endl; + std::cout << "Block size: " << block_size << std::endl; + + /*************************************************************************/ + /* Test whether target device supports cooperative groups ****************/ + for (int i = 0; i < 2; i++) { + if (!cooperative_groups_support((d + i))) { + std::cout << "Skipping the test with Pass result.\n"; + passed(); + } + } + + /*************************************************************************/ + /* Test whether the requested size will fit on the GPU *******************/ + int warp_sizes[2]; + int num_sms[2]; + hipDeviceProp_t device_properties[2]; + int warp_size = INT_MAX; + int num_sm = INT_MAX; + for (int i = 0; i < 2; i++) { + HIPCHECK(hipGetDeviceProperties(&device_properties[i], 
(d + i))); + warp_sizes[i] = device_properties[i].warpSize; + if (warp_sizes[i] < warp_size) { + warp_size = warp_sizes[i]; + } + num_sms[i] = device_properties[i].multiProcessorCount; + if (num_sms[i] < num_sm) { + num_sm = num_sms[i]; + } + std::cout << "Device " << (d + i); + std::cout << " name: " << device_properties[i].name << std::endl; + } + std::cout << std::endl; + + int num_threads_in_block = block_size * warp_size; + + // Calculate the device occupancy to know how many blocks can be run. + int max_blocks_per_sm_arr[2]; + int max_blocks_per_sm = INT_MAX; + for (int i = 0; i < 2; i++) { + HIPCHECK(hipSetDevice((d + i))); + HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block, + 0)); + if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) { + max_blocks_per_sm = max_blocks_per_sm_arr[i]; + } + } + + int requested_blocks = warps / block_size; + if (requested_blocks > max_blocks_per_sm * num_sm) { + std::cerr << "Requesting to run " << requested_blocks << " blocks, "; + std::cerr << "but we can only guarantee to simultaneously run "; + std::cerr << (max_blocks_per_sm * num_sm) << std::endl; + failed(""); + } + + /*************************************************************************/ + /* Set up data to pass into the kernel ***********************************/ + // Each block will output a single value per loop. 
+ uint32_t total_buffer_len = requested_blocks*loops; + + // Alocate the buffer that will hold the kernel's output, and which will + // also be used to globally synchronize during GWS initialization + unsigned int *host_buffer[2]; + unsigned int *kernel_buffer[2]; + unsigned int *kernel_atomic[2]; + hipStream_t streams[2]; + for (int i = 0; i < 2; i++) { + host_buffer[i] = (unsigned int*)calloc(total_buffer_len, + sizeof(unsigned int)); + HIPCHECK(hipSetDevice((d + i))); + HIPCHECK(hipMalloc(reinterpret_cast(&kernel_buffer[i]), + total_buffer_len * sizeof(unsigned int))); + HIPCHECK(hipMemcpy(kernel_buffer[i], host_buffer[i], + total_buffer_len * sizeof(unsigned int), hipMemcpyHostToDevice)); + HIPCHECK(hipMalloc(reinterpret_cast(&kernel_atomic[i]), + sizeof(unsigned int))); + HIPCHECK(hipMemset(kernel_atomic[i], 0, sizeof(unsigned int))); + HIPCHECK(hipStreamCreate(&streams[i])); + } + + // Single kernel atomic shared between both devices; put it on the host + unsigned int* global_array; + HIPCHECK(hipHostMalloc(reinterpret_cast(&global_array), + 2 * sizeof(unsigned int), 0)); + HIPCHECK(hipMemset(global_array, 0, 2 * sizeof(unsigned int))); + + /*************************************************************************/ + /* Launch the kernels ****************************************************/ + std::cout << "Launching a kernel with " << warps << " warps "; + std::cout << "in " << requested_blocks << " thread blocks."; + std::cout << std::endl; + + void *dev_params[2][4]; + hipLaunchParams md_params[2]; + for (int i = 0; i < 2; i++) { + dev_params[i][0] = reinterpret_cast(&kernel_atomic[i]); + dev_params[i][1] = reinterpret_cast(&global_array); + dev_params[i][2] = reinterpret_cast(&kernel_buffer[i]); + dev_params[i][3] = reinterpret_cast(&loops); + md_params[i].func = reinterpret_cast(test_kernel); + md_params[i].gridDim = requested_blocks; + md_params[i].blockDim = num_threads_in_block; + md_params[i].sharedMem = 0; + md_params[i].stream = streams[i]; + 
md_params[i].args = dev_params[i]; + } + + HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0)); + HIPCHECK(hipDeviceSynchronize()); + + /*************************************************************************/ + /* Read back the buffers and print out its data **************************/ + for (int dev = 0; dev < 2; dev++) { // per-pair arrays hold two entries: index with dev (0 or 1), never offset by d + HIPCHECK(hipMemcpy(host_buffer[dev], kernel_buffer[dev], + total_buffer_len * sizeof(unsigned int), + hipMemcpyDeviceToHost)); + } + + for (unsigned int i = 0; i < loops; i++) { + for (int dev = 0; dev < 2; dev++) { + std::cout << "+++++++++++++++++ Device " << (d + dev); + std::cout << "+++++++++++++++++" << std::endl; + for (unsigned int j = 0; j < requested_blocks; j++) { + std::cout << "Buffer entry " << (i * warps + j); + std::cout << " (written by warp " << j << ")"; + std::cout << " is " << host_buffer[dev][i * requested_blocks + j]; + std::cout << std::endl; + } + } + std::cout << "==========================\n"; + } + for (unsigned int dev = 0; dev < 2; dev++) { + std::cout << "Testing output from device " << (d + dev) << std::endl; + int local_ret_val = verify_barrier_buffer(loops, requested_blocks, + host_buffer[dev], 2); + if (local_ret_val == -1) { + flag = 1; + } + } + + std::cout << std::endl << "The multi-GPU shared updates contain:"; + std::cout << std::endl; + for (int i = 0; i < 2; i++) { + std::cout << "Entry " << i << ": "; + std::cout << global_array[i] << std::endl; + } + for (int dev = 0; dev < 2; dev++) { + std::cout << "Testing multi-GPU output for entry " << (d + dev); + std::cout << std::endl; + int local_ret_val = verify_multi_gpu_buffer(loops, global_array[dev]); + if (local_ret_val) { + flag = 1; + } + } + for (int k = 0; k < 2; ++k) { + HIPCHECK(hipFree(kernel_buffer[k])); + HIPCHECK(hipFree(kernel_atomic[k])); + HIPCHECK(hipStreamDestroy(streams[k])); + free(host_buffer[k]); + } + } + if (flag == 1) { + failed(""); + } else { + passed(); + } +} diff --git
a/projects/hip/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp b/projects/hip/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp index 874f8bc44c..f7a9dac703 100644 --- a/projects/hip/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp +++ b/projects/hip/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp @@ -1,173 +1,173 @@ -/* - * Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved. - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ - -/* - * Test to compare - * 1.pciBusID from hipDeviceGetPCIBusId and hipDeviceGetAttribute ** - * 2.{pciDomainID, pciBusID, pciDeviceID} values hipDeviceGetPCIBusId vs lspci ** - */ - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 - * TEST_NAMED: %t hipDeviceGetPCIBusId-vs-hipDeviceGetAttribute --tests 0x1 - * TEST_NAMED: %t hipDeviceGetPCIBusId-vs-lspci --tests 0x2 EXCLUDE_HIP_PLATFORM nvcc - * HIT_END - */ - -#include "test_common.h" -#define MAX_DEVICE_LENGTH 20 - -static bool getPciBusId(int deviceCount, char hipDeviceList[][MAX_DEVICE_LENGTH]) { - for (int i = 0; i < deviceCount; i++) { - HIPCHECK(hipDeviceGetPCIBusId(hipDeviceList[i], MAX_DEVICE_LENGTH, i)); - } - return true; -} - -bool comparePciBusIDWithHipDeviceGetAttribute() { - bool testResult = true; - int deviceCount = 0; - HIPCHECK(hipGetDeviceCount(&deviceCount)); - HIPASSERT(deviceCount != 0); - printf("No.of gpus in the system: %d\n", deviceCount); - char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH]; - char pciDeviceList[deviceCount][MAX_DEVICE_LENGTH]; - - getPciBusId(deviceCount, hipDeviceList); - - for (int i = 0; i < deviceCount; i++) { - int pciBusID = -1; - int pciDeviceID = -1; - int pciDomainID = -1; - int tempPciBusId = -1; - sscanf(hipDeviceList[i], "%04x:%02x:%02x", &pciDomainID, &pciBusID, - &pciDeviceID); - HIPCHECK(hipDeviceGetAttribute(&tempPciBusId, hipDeviceAttributePciBusId, i)); - if (pciBusID != tempPciBusId) { - testResult = false; - printf("pciBusID from hipDeviceGetPCIBusId mismatched to that from " - "hipDeviceGetAttribute for gpu %d\n", i); - } - } - - printf("pciBusID output of both hipDeviceGetPCIBusId and" - " hipDeviceGetAttribute matched for all gpus\n"); - return testResult; -} - -bool compareHipDeviceGetPCIBusIdWithLspci() { - FILE *fpipe; - bool testResult = false; - - { - // Check if lspci is installed, if not, don't proceed - char const *cmd = "lspci --version"; - char *lspciCheck; - char temp[20]; - fpipe = popen(cmd, "r"); 
- - if (fpipe == nullptr) { - printf("Unable to create command file\n"); - return testResult; - } - - lspciCheck = fgets(temp, 20, fpipe); - pclose(fpipe); - - if (!lspciCheck) { - printf("lspci not found. Skipping the test\n"); - return true; - } - } - - int deviceCount = 0; - HIPCHECK(hipGetDeviceCount(&deviceCount)); - HIPASSERT(deviceCount != 0); - printf("No.of gpus in the system: %d\n", deviceCount); - char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH]; - char pciDeviceList[deviceCount][MAX_DEVICE_LENGTH]; - - getPciBusId(deviceCount, hipDeviceList); - - // Get lspci device list and compare with hip device list -#if defined(__CUDA_ARCH__) - char const *command = "lspci -D | grep controller | grep NVIDIA | " - "cut -d ' ' -f 1"; -#else - char const *command = "lspci -D | grep controller | grep AMD/ATI | " - "cut -d ' ' -f 1"; -#endif - fpipe = popen(command, "r"); - - if (fpipe == nullptr) { - printf("Unable to create command file\n"); - return testResult; - } - - int index = 0; - int deviceMatchCount = 0; - - while (fgets(pciDeviceList[index], sizeof(pciDeviceList[index]), fpipe)) { - bool bMatchFound = false; - for (int deviceNo = 0; deviceNo < deviceCount; deviceNo++) { - if (!strncmp(pciDeviceList[index], hipDeviceList[deviceNo], 10)) { - deviceMatchCount++; - bMatchFound = true; - } - } - if (bMatchFound == false) { - printf("PCI device: %s is not reported by HIP\n", pciDeviceList[index]); - } - index++; - } - - pclose(fpipe); - - if (deviceMatchCount == deviceCount) { - printf("hip and lspci output for {pciDomainID, pciBusID, pciDeviceID} " - "matched for all gpus\n"); - testResult = true; - } else { - printf("Mismatch in number GPUs reported by HIP with lscpi\n"); - } - return testResult; -} - -int main(int argc, char* argv[]) { - bool testResult = true; - HipTest::parseStandardArguments(argc, argv, true); - - if (p_tests & 0x1) { - testResult &= comparePciBusIDWithHipDeviceGetAttribute(); - } - - if (p_tests & 0x2) { -#ifdef __unix__ - testResult &= 
compareHipDeviceGetPCIBusIdWithLspci(); -#else - printf("Detected non-linux OS. Skipping the test\n"); -#endif - } - - if (testResult) { - passed(); - } else { - failed("one or more tests failed\n"); - } -} +/* + * Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved. + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */
+
+/*
+ * Test to compare
+ * 1.pciBusID from hipDeviceGetPCIBusId and hipDeviceGetAttribute **
+ * 2.{pciDomainID, pciBusID, pciDeviceID} values hipDeviceGetPCIBusId vs lspci **
+ */
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
+ * TEST_NAMED: %t hipDeviceGetPCIBusId-vs-hipDeviceGetAttribute --tests 0x1
+ * TEST_NAMED: %t hipDeviceGetPCIBusId-vs-lspci --tests 0x2 EXCLUDE_HIP_PLATFORM nvcc
+ * HIT_END
+ */
+
+#include "test_common.h"
+#define MAX_DEVICE_LENGTH 20
+
+// Fills hipDeviceList[i] with the string hipDeviceGetPCIBusId reports for
+// device i, for every i in [0, deviceCount). Each row holds at most
+// MAX_DEVICE_LENGTH bytes. Always returns true; API errors are left to HIPCHECK.
+static bool getPciBusId(int deviceCount, char hipDeviceList[][MAX_DEVICE_LENGTH]) {
+  for (int i = 0; i < deviceCount; i++) {
+    HIPCHECK(hipDeviceGetPCIBusId(hipDeviceList[i], MAX_DEVICE_LENGTH, i));
+  }
+  return true;
+}
+
+// Test 0x1: parses the "domain:bus:device" string returned by
+// hipDeviceGetPCIBusId for every device and checks that its bus field equals
+// the hipDeviceAttributePciBusId attribute of the same device.
+// Returns true only when every device agrees.
+bool comparePciBusIDWithHipDeviceGetAttribute() {
+  bool testResult = true;
+  int deviceCount = 0;
+  HIPCHECK(hipGetDeviceCount(&deviceCount));
+  HIPASSERT(deviceCount != 0);
+  printf("No.of gpus in the system: %d\n", deviceCount);
+  char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
+
+  getPciBusId(deviceCount, hipDeviceList);
+
+  for (int i = 0; i < deviceCount; i++) {
+    // %x stores through an unsigned int*, so the fields are parsed as unsigned.
+    unsigned int pciDomainID = 0;
+    unsigned int pciBusID = 0;
+    unsigned int pciDeviceID = 0;
+    int tempPciBusId = -1;
+    const int parsed = sscanf(hipDeviceList[i], "%04x:%02x:%02x", &pciDomainID,
+                              &pciBusID, &pciDeviceID);
+    HIPCHECK(hipDeviceGetAttribute(&tempPciBusId, hipDeviceAttributePciBusId, i));
+    // A string that does not parse into three fields counts as a mismatch.
+    if (parsed != 3 || static_cast<int>(pciBusID) != tempPciBusId) {
+      testResult = false;
+      printf("pciBusID from hipDeviceGetPCIBusId mismatched to that from "
+             "hipDeviceGetAttribute for gpu %d\n", i);
+    }
+  }
+
+  // Report success only when no device mismatched.
+  if (testResult) {
+    printf("pciBusID output of both hipDeviceGetPCIBusId and"
+           " hipDeviceGetAttribute matched for all gpus\n");
+  }
+  return testResult;
+}
+
+// Test 0x2: compares the first 10 characters ("dddd:bb:dd") of every GPU line
+// printed by lspci with the strings returned by hipDeviceGetPCIBusId.
+// Returns true when lspci is unavailable (test skipped) or when the number of
+// matches equals the HIP device count; false otherwise.
+bool compareHipDeviceGetPCIBusIdWithLspci() {
+  FILE *fpipe;
+  bool testResult = false;
+
+  {
+    // Check if lspci is installed, if not, don't proceed
+    char const *cmd = "lspci --version";
+    char temp[20];
+    fpipe = popen(cmd, "r");
+
+    if (fpipe == nullptr) {
+      printf("Unable to create command file\n");
+      return testResult;
+    }
+
+    char *lspciCheck = fgets(temp, sizeof(temp), fpipe);
+    pclose(fpipe);
+
+    if (!lspciCheck) {
+      printf("lspci not found. Skipping the test\n");
+      return true;
+    }
+  }
+
+  int deviceCount = 0;
+  HIPCHECK(hipGetDeviceCount(&deviceCount));
+  HIPASSERT(deviceCount != 0);
+  printf("No.of gpus in the system: %d\n", deviceCount);
+  char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
+
+  getPciBusId(deviceCount, hipDeviceList);
+
+  // Get lspci device list and compare with hip device list.
+  // __CUDA_ARCH__ is only defined while compiling device code, so the vendor
+  // is selected with the HIP platform macro instead.
+#if defined(__HIP_PLATFORM_NVCC__)
+  char const *command = "lspci -D | grep controller | grep NVIDIA | "
+                        "cut -d ' ' -f 1";
+#else
+  char const *command = "lspci -D | grep controller | grep AMD/ATI | "
+                        "cut -d ' ' -f 1";
+#endif
+  fpipe = popen(command, "r");
+
+  if (fpipe == nullptr) {
+    printf("Unable to create command file\n");
+    return testResult;
+  }
+
+  int deviceMatchCount = 0;
+  // One reusable line buffer: lspci may print more matching lines than HIP
+  // reports devices, so the lines must not be stored in a deviceCount-sized
+  // array.
+  char pciDevice[MAX_DEVICE_LENGTH];
+
+  while (fgets(pciDevice, sizeof(pciDevice), fpipe)) {
+    bool bMatchFound = false;
+    for (int deviceNo = 0; deviceNo < deviceCount; deviceNo++) {
+      if (!strncmp(pciDevice, hipDeviceList[deviceNo], 10)) {
+        deviceMatchCount++;
+        bMatchFound = true;
+      }
+    }
+    if (bMatchFound == false) {
+      printf("PCI device: %s is not reported by HIP\n", pciDevice);
+    }
+  }
+
+  pclose(fpipe);
+
+  if (deviceMatchCount == deviceCount) {
+    printf("hip and lspci output for {pciDomainID, pciBusID, pciDeviceID} "
+           "matched for all gpus\n");
+    testResult = true;
+  } else {
+    printf("Mismatch in number GPUs reported by HIP with lscpi\n");
+  }
+  return testResult;
+}
+
+// Runs the sub-tests selected by the p_tests bit mask (0x1 and/or 0x2) and
+// reports the combined result through passed()/failed().
+int main(int argc, char* argv[]) {
+  bool testResult = true;
+  HipTest::parseStandardArguments(argc, argv, true);
+
+  if (p_tests & 0x1) {
+    testResult &= comparePciBusIDWithHipDeviceGetAttribute();
+  }
+
+  if (p_tests & 0x2) {
+#ifdef __unix__
+    testResult &= compareHipDeviceGetPCIBusIdWithLspci();
+#else
+    printf("Detected non-linux OS. Skipping the test\n");
+#endif
+  }
+
+  if (testResult) {
+    passed();
+  } else {
+    failed("one or more tests failed\n");
+  }
+}
diff --git a/projects/hip/tests/src/runtimeApi/device/hipSetGetDevice.cpp b/projects/hip/tests/src/runtimeApi/device/hipSetGetDevice.cpp
index 4224c974b3..6c703de867 100644
--- a/projects/hip/tests/src/runtimeApi/device/hipSetGetDevice.cpp
+++ b/projects/hip/tests/src/runtimeApi/device/hipSetGetDevice.cpp
@@ -25,7 +25,7 @@
  */
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
  * TEST_NAMED: %t hipSetGetDevice-invalidDevice
  * TEST_NAMED: %t hipSetGetDevice-allValidDevice
  * TEST_NAMED: %t hipSetGetDevice-validDev1 --computeDevCnt 1
diff --git a/projects/hip/tests/src/runtimeApi/memory/hipIpcMemAccessTest.cpp b/projects/hip/tests/src/runtimeApi/memory/hipIpcMemAccessTest.cpp
new file mode 100644
index 0000000000..00c01ab1cc
--- /dev/null
+++ b/projects/hip/tests/src/runtimeApi/memory/hipIpcMemAccessTest.cpp
@@ -0,0 +1,227 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+// NOTE(review): the six header names were missing from this copy of the file;
+// the list below is reconstructed from what the code uses - confirm against
+// the upstream source.
+#include <fcntl.h>
+#include <semaphore.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "test_common.h"
+
+#ifdef __linux__
+// Named semaphores used as a hand-shake between the two processes:
+// sem_ob1 is posted by the parent and waited on by the child,
+// sem_ob2 is posted by the child and waited on by the parent.
+sem_t *sem_ob1 = NULL, *sem_ob2 = NULL;
+
+// Record placed in the MAP_SHARED mapping so both processes can see it.
+typedef struct mem_handle {
+  int device;                   // device on which the parent allocated A_d
+  hipIpcMemHandle_t memHandle;  // IPC handle of A_d, written by the parent
+  bool IfTestPassed;            // cleared by either process on failure
+} hip_ipc_t;
+
+class IpcMemHandleTest {
+ public:
+  bool InitFlag = true;         // false when the constructor could not set up
+  hip_ipc_t *shrd_mem = NULL;
+  pid_t pid;
+  size_t N = 1024;
+  size_t Nbytes = N * sizeof(int);
+  int *A_d = NULL, out = 0;
+  int *A_h = NULL, *C_h = NULL;
+  int Num_devices = 0, Data_mismatch, CanAccessPeer = 0;
+  int *Ad1 = NULL, *Ad2 = NULL;
+  IpcMemHandleTest();
+  bool Test();
+  ~IpcMemHandleTest();
+};
+
+
+// Forks a child process. For every device the parent allocates A_d, publishes
+// its IPC handle through shrd_mem and releases the child; the child opens the
+// handle from every device, copies the data and compares it with 123.
+// Returns the final value of shrd_mem->IfTestPassed (false on init failure or
+// when fork() fails).
+bool IpcMemHandleTest::Test() {
+  if (InitFlag == false) {
+    // Abort the test if the initialization fails
+    printf("Resource initialization failed. Hence test skipped!");
+    return false;
+  }
+  pid = fork();
+  if (pid < 0) {
+    // No child exists, so the hand-shake below would block forever.
+    printf("fork() call failed!");
+    sem_unlink("/my-sem-object1");
+    sem_unlink("/my-sem-object2");
+    return false;
+  }
+  if (pid != 0) {
+    // Parent process
+    HIPCHECK(hipGetDeviceCount(&Num_devices));
+    for (int i = 0; i < Num_devices; ++i) {
+      // After a failure the child leaves its loop, so the parent must not
+      // start another round.
+      if (shrd_mem->IfTestPassed == true) {
+        HIPCHECK(hipSetDevice(i));
+        HIPCHECK(hipMalloc(&A_d, Nbytes));
+        HIPCHECK(hipIpcGetMemHandle((hipIpcMemHandle_t *) &shrd_mem->memHandle,
+                                    A_d));
+        HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
+        shrd_mem->device = i;
+        if ((out=sem_post(sem_ob1)) == -1) {
+          // Need to use inline function to release resources.
+          shrd_mem->IfTestPassed = false;
+          failed("sem_post() call failed in parent process.");
+        }
+        if ((out=sem_wait(sem_ob2)) == -1) {
+          shrd_mem->IfTestPassed = false;
+          failed("sem_wait() call failed in parent process.");
+        }
+        HIPCHECK(hipFree(A_d));
+        A_d = NULL;  // keeps the destructor from freeing it a second time
+      }
+    }
+  } else {
+    // Child process
+    HIPCHECK(hipGetDeviceCount(&Num_devices));
+    for (int j = 0; j < Num_devices; ++j) {
+      if ((out=sem_wait(sem_ob1)) == -1) {
+        shrd_mem->IfTestPassed = false;
+        printf("sem_wait() call failed in child process.");
+        if ((out=sem_post(sem_ob2)) == -1) {
+          printf("sem_post() call on sem_ob2 failed");
+        }
+        // The handle in shrd_mem is not known to be valid; do not use it.
+        exit(1);
+      }
+      for (int i = 0; i < Num_devices; ++i) {
+        Data_mismatch = 0;
+        HIPCHECK(hipSetDevice(i));
+        HIPCHECK(hipMalloc(&Ad2, Nbytes));
+        HIPCHECK(hipIpcOpenMemHandle((void **) &Ad1, shrd_mem->memHandle,
+                                     hipIpcMemLazyEnablePeerAccess));
+        HIPCHECK(hipDeviceCanAccessPeer(&CanAccessPeer, i, shrd_mem->device));
+        if (CanAccessPeer == 1) {
+          HIPCHECK(hipMemcpy(Ad2, Ad1, Nbytes, hipMemcpyDeviceToDevice));
+          // C_h is host memory, so this leg is a device-to-host copy.
+          HIPCHECK(hipMemcpy(C_h, Ad2, Nbytes, hipMemcpyDeviceToHost));
+          for (size_t k = 0; k < N; ++k) {
+            if (C_h[k] != 123)
+              Data_mismatch++;
+          }
+          if (Data_mismatch != 0) {
+            printf("Data mismatch found when data copied from Ipc memhandle");
+            printf(" to Device: %d\n", i);
+            shrd_mem->IfTestPassed = false;
+          }
+          memset(reinterpret_cast<void *>(C_h), 0, Nbytes);
+          // Count the second check separately from the first one.
+          Data_mismatch = 0;
+          // Checking if the data obtained from Ipc shared memory is consistent
+          HIPCHECK(hipMemcpy(C_h, Ad1, Nbytes, hipMemcpyDeviceToHost));
+          for (size_t k = 0; k < N; ++k) {
+            if (C_h[k] != 123)
+              Data_mismatch++;
+          }
+          if (Data_mismatch != 0) {
+            printf("Data mismatch found when data copied from Ipc memhandle");
+            printf(" Host.\n");
+            shrd_mem->IfTestPassed = false;
+          }
+        }
+        HIPCHECK(hipIpcCloseMemHandle(reinterpret_cast<void *>(Ad1)));
+        Ad1 = NULL;
+        // Ad2 is allocated once per device, so it is released once per device.
+        HIPCHECK(hipFree(Ad2));
+        Ad2 = NULL;
+      }
+      if ((out=sem_post(sem_ob2)) == -1) {
+        shrd_mem->IfTestPassed = false;
+        printf("sem_post() call on sem_ob2 failed");
+        exit(1);
+      }
+      if (shrd_mem->IfTestPassed == false) {
+        // The parent skips all further rounds once the flag is cleared;
+        // waiting on sem_ob1 again would never return.
+        break;
+      }
+    }
+    exit(0);
+  }
+
+  if ((out = sem_unlink("/my-sem-object1")) == -1) {
+    printf("sem_unlink() call on /my-sem-object1 failed");
+  }
+  if ((out = sem_unlink("/my-sem-object2")) == -1) {
+    printf("sem_unlink() call on /my-sem-object2 failed");
+  }
+  int status;
+  waitpid(pid, &status, 0);
+  if (shrd_mem->IfTestPassed == false) {
+    return false;
+  } else {
+    return true;
+  }
+}
+
+// Creates the two named semaphores, the shared mapping and the host buffers.
+// Any failure clears InitFlag so that Test() refuses to run.
+IpcMemHandleTest::IpcMemHandleTest() {
+  // Drop semaphores left behind by an earlier run. A failure here (normally
+  // "does not exist") is ignored: sem_open with O_EXCL below reports anything
+  // that is still in the way.
+  sem_unlink("/my-sem-object1");
+  sem_unlink("/my-sem-object2");
+  if ((sem_ob1 = sem_open ("/my-sem-object1", O_CREAT|O_EXCL, 0660, 0)) ==
+      SEM_FAILED) {
+    InitFlag = false;
+    printf("Initialization of 1st semaphore object failed");
+  }
+  if ((sem_ob2 = sem_open ("/my-sem-object2", O_CREAT|O_EXCL, 0660, 0)) ==
+      SEM_FAILED) {
+    InitFlag = false;
+    printf("Initialization of 2nd semaphore object failed");
+  }
+
+  // mmap reports failure with MAP_FAILED, not NULL; anonymous mappings take
+  // fd -1.
+  void *mapping = mmap(NULL, sizeof(hip_ipc_t),
+                       PROT_READ | PROT_WRITE,
+                       MAP_SHARED | MAP_ANONYMOUS,
+                       -1, 0);
+  if (mapping == MAP_FAILED) {
+    InitFlag = false;
+    printf("mmap() call failed!");
+  } else {
+    shrd_mem = reinterpret_cast<hip_ipc_t *>(mapping);
+    shrd_mem->IfTestPassed = true;
+  }
+  A_h = reinterpret_cast<int *>(malloc(Nbytes));
+  C_h = reinterpret_cast<int *>(malloc(Nbytes));
+  if (A_h == NULL || C_h == NULL) {
+    InitFlag = false;
+    printf("malloc() call failed!");
+  } else {
+    for (size_t i = 0; i < N; i++) {
+      A_h[i] = 123;
+    }
+  }
+}
+
+// Releases the mapping and the buffers. Device pointers are reset to NULL in
+// Test() once released, so the hipFree calls below act on NULL or on a live
+// allocation.
+IpcMemHandleTest::~IpcMemHandleTest() {
+  if (shrd_mem != NULL) {
+    munmap(shrd_mem, sizeof(hip_ipc_t));
+  }
+  HIPCHECK(hipFree((A_d)));
+  free(A_h);
+  free(C_h);
+  HIPCHECK(hipFree((Ad1)));
+  HIPCHECK(hipFree((Ad2)));
+}
+#endif
+
+int main() {
+  bool IfTestPassed = true;
+  // The following program spawns a child process and does the following
+  // Parent iterate through each device, create memory -- create hipIpcMemhandle
+  // stores the mem handle in mmaped memory, release the child using sem_post()
+  // and wait for child to release itself(parent process)
+  // child process:
+  // Child process get the ipc mem handle using hipIpcOpenMemHandle
+  // Iterate through all the available gpus and do Device to Device copies
+  // and check for data consistencies and close the hipIpcCloseMemHandle
+  // release the parent and wait for parent to release itself(child)
+#ifdef __linux__
+  IpcMemHandleTest obj;
+  IfTestPassed = obj.Test();
+#else
+  printf("This is not a Linux platform. Hence Skipping the test!\n");
+  IfTestPassed = true;
+#endif
+  if (IfTestPassed == false) {
+    failed("");
+  }
+  passed();
+}
diff --git a/projects/hip/tests/src/runtimeApi/memory/hipMallocConcurrency.cpp b/projects/hip/tests/src/runtimeApi/memory/hipMallocConcurrency.cpp
new file mode 100644
index 0000000000..934c364b6b
--- /dev/null
+++ b/projects/hip/tests/src/runtimeApi/memory/hipMallocConcurrency.cpp
@@ -0,0 +1,487 @@
+/*
+Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+Testcase Scenarios :
+
+ (TestCase 1)::
+ 1) Test hipMalloc() api passing zero size and confirming *ptr returning
+ nullptr. Also pass nullptr to hipFree() api.
+ 2) Pass maximum value of size_t for hipMalloc() api and make sure appropriate
+ error is returned.
+ 3) Check for hipMalloc() error code, passing invalid/null pointer.
+
+ (TestCase 2)::
+ 4) Regress hipMalloc()/hipFree() in loop for bigger chunk of allocation
+ with adequate number of iterations and later test for kernel execution on
+ default gpu.
+ 5) Regress hipMalloc()/hipFree() in loop while allocating smaller chunks
+ keeping maximum number of iterations and then run kernel code on default
+ gpu, perform data validation.
+
+ (TestCase 3)::
+ 6) Check hipMalloc() api adaptability when app creates small chunks of memory
+ continuously, stores it for later use and then frees it at later point
+ of time.
+
+ (TestCase 4)::
+ 7) Run hipMalloc() api/kernel code on same gpu in parallel from parent and
+ child processes, validate the results.
+
+ (TestCase 5)::
+ 8) Execute hipMalloc() api simultaneously on all the gpus by spawning multiple
+ child processes. Validate buffers allocated after running kernel code.
+
+ (TestCase 6)::
+ 9) Multithread Scenario : Exercise hipMalloc() api in parallel on all gpus from
+ multiple threads and regress the api.
+
+ (TestCases 2, 3, 4, 5, 6)::
+ 10) Validate memory usage with hipMemGetInfo() while regressing hipMalloc()
+ api. Check for any possible memory leaks.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
+ * TEST_NAMED: %t hipMalloc_ArgValidation --tests 1
+ * TEST_NAMED: %t hipMalloc_LoopRegression_AllocFreeCycle --tests 2
+ * TEST_NAMED: %t hipMalloc_LoopRegression_AllocPool --tests 3
+ * TEST_NAMED: %t hipMallocChild_Concurrency_DefaultGpu --tests 4
+ * TEST_NAMED: %t hipMallocChild_Concurrency_MultiGpu --tests 5
+ * TEST_NAMED: %t hipMalloc_MultiThreaded_MultiGpu --tests 6
+ * HIT_END
+ */
+
+// NOTE(review): the header names were missing from this copy of the file; the
+// two groups below are reconstructed from what the code uses - confirm
+// against the upstream source.
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <limits>
+#include <thread>
+#include <vector>
+
+#include "test_common.h"
+
+/* Max alloc/free iterations for bigger chunks */
+#define MAX_ALLOCFREE_BC (10000)
+
+/* Buffer size for alloc/free cycles */
+#define BUFF_SIZE_AF (5*1024*1024)
+
+/* Max alloc/free iterations for smaller chunks */
+#define MAX_ALLOCFREE_SC (5000000)
+
+/* Max alloc and pool iterations (TBD) */
+#define MAX_ALLOCPOOL_ITER (2000000)
+
+/**
+ * Validates data consistency on supplied gpu.
+ *
+ * Runs HipTest::vectorADD on device `gpu`, checks the result on the host and
+ * compares hipMemGetInfo() before and after the run.
+ * Returns false when the vector check fails or when the available/total
+ * memory differs after the buffers are released.
+ */
+bool validateMemoryOnGPU(int gpu) {
+  size_t Nbytes = N * sizeof(int);
+  int *A_d, *B_d, *C_d;
+  int *A_h, *B_h, *C_h;
+  size_t prevAvl, prevTot, curAvl, curTot;
+  bool TestPassed = true;
+
+  HIPCHECK(hipSetDevice(gpu));
+  HIPCHECK(hipMemGetInfo(&prevAvl, &prevTot));
+  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
+
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+
+  HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
+  HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
+
+  // NOTE(review): the cast target types were missing from this copy;
+  // const int* is assumed here - confirm against HipTest::vectorADD.
+  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
+                     0, 0, static_cast<const int*>(A_d),
+                     static_cast<const int*>(B_d), C_d, N);
+
+  HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
+
+  // A zero result from checkVectorADD is treated as success here.
+  if (!HipTest::checkVectorADD(A_h, B_h, C_h, N)) {
+    printf("Validation PASSED for gpu %d from pid %d\n", gpu, getpid());
+  } else {
+    printf("%s : Validation FAILED for gpu %d from pid %d\n",
+           __func__, gpu, getpid());
+    TestPassed &= false;
+  }
+
+  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
+  HIPCHECK(hipMemGetInfo(&curAvl, &curTot));
+
+  if ((prevAvl != curAvl) || (prevTot != curTot)) {
+    printf("%s : Memory allocation mismatch observed."
+           "Possible memory leak.", __func__);
+    TestPassed &= false;
+  }
+
+  return TestPassed;
+}
+
+/**
+ * Fetches Gpu device count.
+ *
+ * On Linux the HIP call is made in a forked child and the value is sent back
+ * through a pipe (presumably so that the calling process does not touch the
+ * HIP runtime before it forks its own workers - TODO confirm).
+ * *pdevCnt is set to 1 whenever the count cannot be obtained.
+ */
+void getDeviceCount(int *pdevCnt) {
+#ifdef __linux__
+  int fd[2], val = 0;
+  pid_t childpid;
+
+  // create pipe descriptors; without them the count cannot be transferred
+  if (pipe(fd) != 0) {
+    *pdevCnt = 1;
+    return;
+  }
+
+  // disable visible_devices env from shell
+  unsetenv("ROCR_VISIBLE_DEVICES");
+  unsetenv("HIP_VISIBLE_DEVICES");
+
+  childpid = fork();
+
+  if (childpid > 0) {  // Parent
+    close(fd[1]);
+    // parent will wait to read the device cnt
+    const ssize_t got = read(fd[0], &val, sizeof(val));
+
+    // close the read-descriptor
+    close(fd[0]);
+
+    // wait for child exit
+    wait(NULL);
+
+    // A short or failed read must not be reported as "0 devices".
+    *pdevCnt = (got == static_cast<ssize_t>(sizeof(val))) ? val : 1;
+  } else if (!childpid) {  // Child
+    int devCnt = 1;
+    // writing only, no need for read-descriptor
+    close(fd[0]);
+
+    HIPCHECK(hipGetDeviceCount(&devCnt));
+    // send the value on the write-descriptor:
+    const ssize_t sent = write(fd[1], &devCnt, sizeof(devCnt));
+
+    // close the write descriptor:
+    close(fd[1]);
+    exit(sent == static_cast<ssize_t>(sizeof(devCnt)) ? 0 : 1);
+  } else {  // failure
+    close(fd[0]);
+    close(fd[1]);
+    *pdevCnt = 1;
+    return;
+  }
+
+#else
+  HIPCHECK(hipGetDeviceCount(pdevCnt));
+#endif
+}
+
+/**
+ * Regress memory allocation and free in loop.
+ *
+ * Phase 1: MAX_ALLOCFREE_BC cycles of BUFF_SIZE_AF bytes, each checked against
+ * hipMemGetInfo(). Phase 2: MAX_ALLOCFREE_SC cycles of 16 bytes followed by
+ * one before/after comparison. Returns false on any mismatch.
+ */
+bool regressAllocInLoop(int gpu) {
+  bool TestPassed = true;
+  size_t tot, avail, ptot, pavail;
+  int i = 0;
+  int *ptr;
+
+  HIPCHECK(hipSetDevice(gpu));
+
+  // Exercise allocation in loop with bigger chunks
+  for (i = 0; i < MAX_ALLOCFREE_BC; i++) {
+    size_t numBytes = BUFF_SIZE_AF;
+
+    HIPCHECK(hipMemGetInfo(&pavail, &ptot));
+    HIPCHECK(hipMalloc(&ptr, numBytes));
+    HIPCHECK(hipMemGetInfo(&avail, &tot));
+
+    if (pavail-avail != numBytes) {
+      printf("LoopAllocation : Memory allocation of %6.2fMB"
+             "not matching with hipMemGetInfo - FAIL\n",
+             numBytes/(1024.0*1024.0));
+      TestPassed &= false;
+      HIPCHECK(hipFree(ptr));
+      break;
+    }
+
+    HIPCHECK(hipFree(ptr));
+  }
+
+  // Exercise allocation in loop with smaller chunks and max iters
+  HIPCHECK(hipMemGetInfo(&pavail, &ptot));
+
+  for (i = 0; i < MAX_ALLOCFREE_SC; i++) {
+    size_t numBytes = 16;
+
+    HIPCHECK(hipMalloc(&ptr, numBytes));
+
+    HIPCHECK(hipFree(ptr));
+  }
+
+  HIPCHECK(hipMemGetInfo(&avail, &tot));
+
+  if ((pavail != avail) || (ptot != tot)) {
+    printf("LoopAllocation : Memory allocation mismatch observed."
+           "Possible memory leak.");
+    TestPassed &= false;
+  }
+
+  return TestPassed;
+}
+
+/*
+ * Thread func to regress alloc and check data consistency
+ */
+
+std::atomic<bool> g_thTestPassed(true);
+
+void threadFunc(int gpu) {
+  // Both checks always run. The shared flag is only ever written with
+  // `false`: a separate load followed by a store could overwrite a failure
+  // recorded by another thread in between.
+  const bool allocOk = regressAllocInLoop(gpu);
+  const bool dataOk = validateMemoryOnGPU(gpu);
+  if (!(allocOk && dataOk)) {
+    g_thTestPassed = false;
+  }
+
+  printf("thread execution status on gpu(%d) : %d\n", gpu, g_thTestPassed.load());
+}
+
+#ifdef __linux__
+// True only when wait() returned a child (`waited` >= 0) that terminated
+// through exit() with status 0. WEXITSTATUS is meaningful only when
+// WIFEXITED holds, so a child ended by a signal is reported as a failure.
+static bool childSucceeded(pid_t waited, int status) {
+  return (waited >= 0) && WIFEXITED(status) && (WEXITSTATUS(status) == 0);
+}
+#endif
+
+// Runs the scenario selected with --tests (1 to 6, see the list at the top of
+// the file) and reports through passed()/failed().
+int main(int argc, char* argv[]) {
+  HipTest::parseStandardArguments(argc, argv, true);
+
+  if (p_tests == 1) {  // Arg validation
+    // Test hipMalloc for zero size
+    bool TestPassed = true;
+    int *ptr;
+
+    HIPCHECK(hipMalloc(&ptr, 0));
+
+    // ptr expected to be reset to null ptr
+    if (ptr) {
+      printf("ArgValidation : Failed in zero size test\n");
+      TestPassed &= false;
+    }
+
+    // Free null ptr
+    HIPCHECK(hipFree(ptr));
+
+    // Test hipMalloc for invalid arguments
+    hipError_t ret;
+
+    if ((ret = hipMalloc(NULL, 100)) != hipErrorInvalidValue) {
+      printf("ArgValidation : Inappropritate error value returned"
+             " for invalid argument. Error: '%s'(%d)\n",
+             hipGetErrorString(ret), ret);
+      TestPassed &= false;
+    }
+
+    // Test hipMalloc for Maximum value of size_t
+    if ((ret = hipMalloc(&ptr, std::numeric_limits<std::size_t>::max()))
+        != hipErrorMemoryAllocation) {
+      printf("ArgValidation : Invalid error returned for max size_t."
+             " Error: '%s'(%d)\n", hipGetErrorString(ret), ret);
+      TestPassed &= false;
+    }
+
+    if (TestPassed) {
+      passed();
+    } else {
+      failed("hipMalloc ArgumentValidation Failure!");
+    }
+
+  } else if (p_tests == 2) {  // Loop Regression Alloc/Free Cycle
+    bool TestPassed = true;
+
+    TestPassed &= regressAllocInLoop(0);
+    TestPassed &= validateMemoryOnGPU(0);
+
+    if (TestPassed) {
+      passed();
+    } else {
+      failed("hipMalloc_LoopRegression_AllocFreeCycle Failure!");
+    }
+
+  } else if (p_tests == 3) {  // Loop Regression Alloc and Pool
+    size_t avail, tot, pavail, ptot;
+    bool TestPassed = true;
+    hipError_t err;
+    int *ptr;
+
+    std::vector<int*> ptrlist;
+
+    HIPCHECK(hipMemGetInfo(&pavail, &ptot));
+
+    // Allocate small chunks of memory million times
+    for (int i = 0; i < MAX_ALLOCPOOL_ITER; i++) {  // Iterations TBD
+      if ((err = hipMalloc(&ptr, 10)) != hipSuccess) {
+        HIPCHECK(hipMemGetInfo(&avail, &tot));
+
+        printf("Loop regression pool allocation failure. "
+               "Total gpu memory : %6.2fMB, Free memory %6.2fMB iter %d error '%s'\n",
+               tot/(1024.0*1024.0), avail/(1024.0*1024.0), i, hipGetErrorString(err));
+
+        TestPassed &= false;
+        break;
+      }
+
+      // Store pointers allocated to emulate memory pool of app
+      ptrlist.push_back(ptr);
+    }
+
+    // Free ptrs at later point of time
+    for ( auto &t : ptrlist ) {
+      HIPCHECK(hipFree(t));
+    }
+
+    HIPCHECK(hipMemGetInfo(&avail, &tot));
+
+    TestPassed &= validateMemoryOnGPU(0);
+
+    if ((pavail != avail) || (ptot != tot)) {
+      printf("%s : Memory allocation mismatch observed. Possible memory leak.",
+             __func__);
+      TestPassed &= false;
+    }
+
+    if (TestPassed) {
+      passed();
+    } else {
+      failed("hipMalloc_LoopRegression_AllocPool failure!");
+    }
+
+  } else if (p_tests == 4) {
+    bool TestPassed = true;
+
+#ifdef __linux__
+    // Parallel execution of parent and child on gpu0
+    int pid;
+
+    if ((pid = fork()) < 0) {
+      printf("Child_Concurrency_Gpu0 : fork() returned error %d.", pid);
+      TestPassed &= false;
+
+    } else if (!pid) {  // Child process
+      bool TestPassedChild = true;
+
+      TestPassedChild = validateMemoryOnGPU(0);
+
+      if (TestPassedChild) {
+        exit(0);  // child exit with success status
+      } else {
+        printf("Child_Concurrency_Gpu0 : childpid %d failed\n", getpid());
+        exit(1);  // child exit with failure status
+      }
+
+    } else {  // Parent process
+      int exitStatus = 0;
+      TestPassed = validateMemoryOnGPU(0);
+
+      pid = wait(&exitStatus);
+      if (!childSucceeded(pid, exitStatus))
+        TestPassed &= false;
+    }
+#else
+    printf("Test hipMallocChild_Concurrency_DefaultGpu skipped on non-linux\n");
+#endif
+
+    // TC scenarios specific to linux
+    // are treated as pass in windows.
+    if (TestPassed) {
+      passed();
+    } else {
+      failed("hipMallocChild_Concurrency_DefaultGpu Failed!");
+    }
+
+  } else if (p_tests == 5) {
+    bool TestPassed = true;
+#ifdef __linux__
+    // Parallel execution on multiple gpus from different child processes
+    int devCnt = 1, pid = 0, cumStatus = 0;
+
+    // Get GPU count
+    getDeviceCount(&devCnt);
+
+    // Spawn child for each GPU
+    for (int gpu = 0; gpu < devCnt; gpu++) {
+      if ((pid = fork()) < 0) {
+        printf("Child_Concurrency_MultiGpu : fork() returned error %d\n", pid);
+        failed("Test Failed!");
+
+      } else if (!pid) {  // Child process
+        bool TestPassedChild = true;
+        TestPassedChild = validateMemoryOnGPU(gpu);
+
+        if (TestPassedChild) {
+          exit(0);  // child exit with success status
+        } else {
+          printf("Child_Concurrency_MultiGpu : childpid %d failed\n",
+                 getpid());
+          exit(1);  // child exit with failure status
+        }
+      }
+    }
+
+    // Parent shall wait for child to complete
+    for (int i = 0; i < devCnt; i++) {
+      int pidwait = 0, exitStatus = 0;
+      pidwait = wait(&exitStatus);
+
+      if (pidwait < 0) {
+        TestPassed &= false;
+        break;
+      }
+
+      // A child that did not exit(0) - including one ended by a signal -
+      // marks the cumulative status as failed.
+      if (!childSucceeded(pidwait, exitStatus)) {
+        cumStatus |= 1;
+      }
+    }
+
+    // Cumulative status of all child
+    if (cumStatus) {
+      TestPassed &= false;
+    }
+
+#else
+    printf("Test hipMallocChild_Concurrency_MultiGpu skipped on non-linux\n");
+#endif
+
+
+    // TC scenarios specific to linux
+    // are treated as pass in windows.
+    if (TestPassed) {
+      passed();
+    } else {
+      failed("hipMallocChild_Concurrency_MultiGpu Failed!");
+    }
+
+  } else if (p_tests == 6) {  // Multithreaded multiple gpu execution
+    std::vector<std::thread> threadlist;
+    int devCnt = 1;
+
+    // Get GPU count
+    getDeviceCount(&devCnt);
+
+
+    for (int i = 0; i < devCnt; i++) {
+      threadlist.push_back(std::thread(threadFunc, i));
+    }
+
+    for (auto &t : threadlist) {
+      t.join();
+    }
+
+    if (g_thTestPassed) {
+      passed();
+    } else {
+      failed("hipMalloc_MultiThreaded_MultiGpu Failed!");
+    }
+  } else {
+    failed("Didnt receive any valid option. Try options 1 to 6\n");
+  }
+}
+
diff --git a/projects/hip/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp b/projects/hip/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp
new file mode 100644
index 0000000000..25820e2305
--- /dev/null
+++ b/projects/hip/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp
@@ -0,0 +1,423 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* Test 6 is disabled */
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
+ * TEST_NAMED: %t hipMallocManaged1 --tests 1
+ * TEST_NAMED: %t hipMallocManaged2 --tests 2
+ * TEST_NAMED: %t hipMallocManagedNegativeTests --tests 3
+ * TEST_NAMED: %t hipMallocManagedMultiChunkSingleDevice --tests 4
+ * TEST_NAMED: %t hipMallocManagedMultiChunkMultiDevice --tests 5 EXCLUDE_HIP_PLATFORM nvcc
+ * TEST_NAMED: %t hipMallocManagedOversubscription --tests 6 EXCLUDE_HIP_PLATFORM rocclr nvcc
+ * HIT_END
+ */
+
+// NOTE(review): the first header name was missing from this copy of the
+// file; <atomic> is reconstructed from the std::atomic use below. <vector>
+// is new and backs the per-device arrays.
+#include <atomic>
+#include <vector>
+#include "test_common.h"
+#define N 1048576  // equals to (1024*1024)
+#define INIT_VAL 123
+
+/*
+ * Kernel function to perform addition operation.
+ * Writes Ad2[i] = Ad1[i] + Ad1[i] for every i below NUM_ELMTS; each thread
+ * starts at its global index and advances by the total thread count.
+ */
+template <typename T>
+__global__ void
+vector_sum(T *Ad1, T *Ad2, size_t NUM_ELMTS) {
+  size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+  size_t stride = blockDim.x * gridDim.x;
+
+  for (size_t i = offset; i < NUM_ELMTS; i += stride) {
+    Ad2[i] = Ad1[i] + Ad1[i];
+  }
+}
+
+// The following Test case tests the following scenario:
+// A large chunk of hipMallocManaged() memory(Hmm) is created
+// Equal parts of Hmm is accessed on available gpus and
+// kernel is launched on accessed chunk of hmm memory
+// and checks if there are any inconsistencies or access issues
+bool MultiChunkMultiDevice(int NumDevices) {
+  std::atomic<int> DataMismatch{0};
+  bool IfTestPassed = true;
+  unsigned int Counter = 0;
+  unsigned int NUM_ELMS = (1024 * 1024);
+  float *Hmm = NULL;
+  std::vector<float> Ah(NUM_ELMS);            // host staging buffer
+  std::vector<float *> Ad(NumDevices, NULL);  // one output buffer per device
+  std::vector<hipStream_t> stream(NumDevices);
+  for (int Oloop = 0; Oloop < NumDevices; ++Oloop) {
+    HIPCHECK(hipSetDevice(Oloop));
+    HIPCHECK(hipMalloc(&Ad[Oloop], NUM_ELMS * sizeof(float)));
+    HIPCHECK(hipMemset(Ad[Oloop], 0, NUM_ELMS * sizeof(float)));
+    HIPCHECK(hipStreamCreate(&stream[Oloop]));
+  }
+  HIPCHECK(hipMallocManaged(&Hmm, (NumDevices * NUM_ELMS * sizeof(float))));
+  // Chunk i of Hmm is filled with INIT_VAL + i.
+  for (int i = 0; i < NumDevices; ++i) {
+    for (; Counter < ((i + 1) * NUM_ELMS); ++Counter) {
+      Hmm[Counter] = INIT_VAL + i;
+    }
+  }
+  const unsigned threadsPerBlock = 256;
+  const unsigned blocks = (NUM_ELMS + 255)/256;
+  for (int Klaunch = 0; Klaunch < NumDevices; ++Klaunch) {
+    // Make the device that owns Ad[Klaunch] and stream[Klaunch] current
+    // before launching on it.
+    // NOTE(review): the launch configuration was missing from this copy; the
+    // grid/block values and the per-device stream are reconstructed from the
+    // otherwise unused locals - confirm against the upstream source.
+    HIPCHECK(hipSetDevice(Klaunch));
+    vector_sum<float><<<blocks, threadsPerBlock, 0, stream[Klaunch]>>>
+        (&Hmm[Klaunch * NUM_ELMS], Ad[Klaunch], NUM_ELMS);
+  }
+  // hipDeviceSynchronize() waits for the current device only, so every
+  // device is selected and synchronized in turn.
+  for (int dev = 0; dev < NumDevices; ++dev) {
+    HIPCHECK(hipSetDevice(dev));
+    HIPCHECK(hipDeviceSynchronize());
+  }
+  for (int m = 0; m < NumDevices; ++m) {
+    HIPCHECK(hipMemcpy(Ah.data(), Ad[m], NUM_ELMS * sizeof(float),
+                       hipMemcpyDeviceToHost));
+    for (unsigned int n = 0; n < NUM_ELMS; ++n) {
+      if (Ah[n] != ((INIT_VAL + m) * 2)) {
+        DataMismatch++;
+      }
+    }
+    memset(reinterpret_cast<void *>(Ah.data()), 0, NUM_ELMS * sizeof(float));
+  }
+  if (DataMismatch.load() != 0) {
+    printf("MultiChunkMultiDevice: Mismatch observed!\n");
+    IfTestPassed = false;
+  }
+  for (int i = 0; i < NumDevices; ++i) {
+    HIPCHECK(hipFree(Ad[i]));
+    HIPCHECK(hipStreamDestroy(stream[i]));
+  }
+  HIPCHECK(hipFree(Hmm));
+  // Ah is released by its vector; the former new[] buffer was passed to free().
+  return IfTestPassed;
+}
+
+// The following Test case tests the following scenario:
+// A large chunk of hipMallocManaged() memory(Hmm) is created
+// Equal parts of Hmm is accessed and
+// kernel is launched on accessed chunk of hmm memory
+// and checks if there are any inconsistencies or access issues
+
+bool MultiChunkSingleDevice(int NumDevices) {
+  std::atomic<int> DataMismatch{0};
+  const int Chunks = 4;
+  unsigned int Counter = 0;
+  bool IfTestPassed = true;
+  unsigned int NUM_ELMS = (1024 * 1024);
+  float *Hmm = NULL;
+  std::vector<float> Ah(NUM_ELMS);        // host staging buffer
+  std::vector<float *> Ad(Chunks, NULL);  // one output buffer per chunk
+  std::vector<hipStream_t> stream(Chunks);
+  for (int i = 0; i < Chunks; ++i) {
+    HIPCHECK(hipMalloc(&Ad[i], NUM_ELMS * sizeof(float)));
+    HIPCHECK(hipMemset(Ad[i], 0, NUM_ELMS * sizeof(float)));
+    HIPCHECK(hipStreamCreate(&stream[i]));
+  }
+  HIPCHECK(hipMallocManaged(&Hmm, (Chunks * NUM_ELMS * sizeof(float))));
+  // Chunk i of Hmm is filled with INIT_VAL + i.
+  for (int i = 0; i < Chunks; ++i) {
+    for (; Counter < ((i + 1) * NUM_ELMS); ++Counter) {
+      Hmm[Counter] = (INIT_VAL + i);
+    }
+  }
+  const unsigned threadsPerBlock = 256;
+  const unsigned blocks = (NUM_ELMS + 255)/256;
+  for (int k = 0; k < Chunks; ++k) {
+    // NOTE(review): launch configuration reconstructed (it was missing from
+    // this copy) - confirm against the upstream source.
+    vector_sum<float><<<blocks, threadsPerBlock, 0, stream[k]>>>
+        (&Hmm[k * NUM_ELMS], Ad[k], NUM_ELMS);
+  }
+  HIPCHECK(hipDeviceSynchronize());
+  for (int m = 0; m < Chunks; ++m) {
+    HIPCHECK(hipMemcpy(Ah.data(), Ad[m], NUM_ELMS * sizeof(float),
+                       hipMemcpyDeviceToHost));
+    for (unsigned int n = 0; n < NUM_ELMS; ++n) {
+      if (Ah[n] != ((INIT_VAL + m) * 2)) {
+        DataMismatch++;
+      }
+    }
+  }
+  if (DataMismatch.load() != 0) {
+    printf("MultiChunkSingleDevice: Mismatch observed!\n");
+    IfTestPassed = false;
+  }
+  for (int i = 0; i < Chunks; ++i) {
+    HIPCHECK(hipFree(Ad[i]));
+    HIPCHECK(hipStreamDestroy(stream[i]));
+  }
+  HIPCHECK(hipFree(Hmm));
+  // Ah is released by its vector; the former new[] buffer was passed to free().
+  return IfTestPassed;
+}
+
+// The following tests oversubscription hipMallocManaged() api
+// Currently disabled.
+// Requests one byte more than hipMemGetInfo() reports as free and expects
+// hipErrorOutOfMemory.
+bool TestOversubscriptionMallocManaged(int NumDevices) {
+  bool IfTestPassed = true;
+  hipError_t err;
+  void *A = NULL;
+  size_t total = 0, free = 0;
+  HIPCHECK(hipMemGetInfo(&free, &total));
+  // ToDo: In case of HMM, memory over-subscription is allowed. Hence, relook
+  // into how out of memory can be tested.
+  // Demanding more mem size than available
+  err = hipMallocManaged(&A, (free +1), hipMemAttachGlobal);
+  if (hipErrorOutOfMemory != err) {
+    printf("hipMallocManaged: Returned %s for size value > device memory\n",
+           hipGetErrorString(err));
+    IfTestPassed = false;
+  }
+
+  return IfTestPassed;
+}
+
+// The following test does negative testing of hipMallocManaged() api
+// by passing invalid values and check if the behavior is as expected
+bool NegativeTestsMallocManaged(int NumDevices) {
+  bool IfTestPassed = true;
+  hipError_t err;
+  void *A = NULL;
+  size_t total = 0, free = 0;
+  HIPCHECK(hipMemGetInfo(&free, &total));
+
+  // Records a failure and prints the returned error whenever `actual` is not
+  // the `wanted` code; `when` is the tail of the message.
+  auto expect = [&IfTestPassed](hipError_t actual, hipError_t wanted,
+                                const char *when) {
+    if (wanted != actual) {
+      printf("hipMallocManaged: Returned %s %s\n", hipGetErrorString(actual),
+             when);
+      IfTestPassed = false;
+    }
+  };
+
+  err = hipMallocManaged(NULL, 1024, hipMemAttachGlobal);
+  expect(err, hipErrorInvalidValue, "when devPtr is null");
+
+  err = hipMallocManaged(&A, 0, hipMemAttachGlobal);
+  expect(err, hipErrorInvalidValue, "when size is 0");
+
+  err = hipMallocManaged(NULL, 0, hipMemAttachGlobal);
+  expect(err, hipErrorInvalidValue, "when devPtr & size is null & 0");
+
+#ifdef __HIP_PLATFORM_HCC__
+  // The flag hipMemAttachHost is currently not supported therefore
+  // api should return "hipErrorInvalidValue" for now
+  err = hipMallocManaged(&A, 1024, hipMemAttachHost);
+  expect(err, hipErrorInvalidValue, "for 'hipMemAttachHost' flag");
+#endif  // __HIP_PLATFORM_HCC__
+
+  err = hipMallocManaged(NULL, 0, 0);
+  expect(err, hipErrorInvalidValue, "when params are null, 0, 0");
+
+  err = hipMallocManaged(&A, 1024, 145);
+  expect(err, hipErrorInvalidValue, "when flag param is numerical 145");
+
+  // -10 is converted to the unsigned size parameter, i.e. a huge request.
+  err = hipMallocManaged(&A, -10, hipMemAttachGlobal);
+  expect(err, hipErrorOutOfMemory, "for negative size value.");
+
+  return IfTestPassed;
+}
+
+
+// Allocate two pointers using hipMallocManaged(), initialize,
+// then launch kernel using these pointers directly and
+// later validate the content without using any Memcpy.
+template +bool TestMallocManaged2(int NumDevices) { + bool IfTestPassed = true; + T *Hmm1 = NULL, *Hmm2 = NULL; + + for (int i = 0; i < NumDevices; ++i) { + HIPCHECK(hipSetDevice(i)); + std::atomic DataMismatch{0}; + HIPCHECK(hipMallocManaged(&Hmm1, N * sizeof(T))); + HIPCHECK(hipMallocManaged(&Hmm2, N * sizeof(T))); + for (int m = 0; m < N; ++m) { + Hmm1[m] = m; + Hmm2[m] = 0; + } + const unsigned threadsPerBlock = 256; + const unsigned blocks = (N + 255)/256; + // Kernel launch + vector_sum <<>> (Hmm1, Hmm2, N); + HIPCHECK(hipDeviceSynchronize()); + for (int v = 0; v < N; ++v) { + if (Hmm2[v] != (v + v)) { + DataMismatch++; + } + } + if (DataMismatch.load() != 0) { + IfTestPassed = false; + } + HIPCHECK(hipFree(Hmm1)); + HIPCHECK(hipFree(Hmm2)); + } + return IfTestPassed; +} + +// In the following test, a memory is created using hipMallocManaged() by +// setting a device and verified if it is accessible when the context is set +// to all other devices. This include verification and Device two Device +// transfers and kernel launch o discover if there any access issues. 
+ +template +bool TestMallocManaged1(int NumDevices) { + std::atomic DataMismatch; + bool TestPassed = true; + T *Ah1 = new T[N], *Ah2 = new T[N], *Ad = NULL, *Hmm = NULL; + + for (int i =0; i < N; ++i) { + Ah1[i] = INIT_VAL; + Ah2[i] = 0; + } + for (int Oloop = 0; Oloop < NumDevices; ++Oloop) { + DataMismatch = 0; + HIPCHECK(hipSetDevice(Oloop)); + HIPCHECK(hipMallocManaged(&Hmm, N * sizeof(T))); + for (int Iloop = 0; Iloop < NumDevices; ++Iloop) { + HIPCHECK(hipSetDevice(Iloop)); + HIPCHECK(hipMalloc(&Ad, N * sizeof(T))); + // Copy data from host to hipMallocMananged memory and verify + HIPCHECK(hipMemcpy(Hmm, Ah1, N * sizeof(T), hipMemcpyHostToDevice)); + for (int v = 0; v < N; ++v) { + if (Hmm[v] != INIT_VAL) { + DataMismatch++; + } + } + if (DataMismatch.load() != 0) { + printf("Mismatch is observed with host data at device %d", Iloop); + printf(" while hipMallocManaged memory set to the device %d\n", Oloop); + TestPassed = false; + DataMismatch = 0; + } + // Executing D2D transfer with hipMallocManaged memory and verify + HIPCHECK(hipMemcpy(Ad, Hmm, N * sizeof(T), hipMemcpyDeviceToDevice)); + HIPCHECK(hipMemcpy(Ah2, Ad, N * sizeof(T), hipMemcpyDeviceToHost)); + for (int k = 0; k < N; ++k) { + if (Ah2[k] != INIT_VAL) { + DataMismatch++; + } + } + if (DataMismatch.load() != 0) { + printf("Mismatch is observed with D2D transfer at device %d\n", Iloop); + printf(" while hipMallocManaged memory set to the device %d\n", Oloop); + TestPassed = false; + DataMismatch = 0; + } + HIPCHECK(hipMemset(Ad, 0, N * sizeof(T))); + const unsigned threadsPerBlock = 256; + const unsigned blocks = (N + 255)/256; + // Launching the kernel to check if there is any access issue with + // hipMallocManaged memory and local device's memory + vector_sum <<>> (Hmm, Ad, N); + hipDeviceSynchronize(); + HIPCHECK(hipMemcpy(Ah2, Ad, N * sizeof(T), hipMemcpyDeviceToHost)); + for (int m = 0; m < N; ++m) { + if (Ah2[m] != 246) { + DataMismatch++; + } + } + if (DataMismatch.load() != 0) { + 
printf("Data Mismatch observed after kernel lch device %d\n", Iloop); + TestPassed = false; + DataMismatch = 0; + } + HIPCHECK(hipFree(Ad)); + } + HIPCHECK(hipFree(Hmm)); + } + free(Ah1); + free(Ah2); + return TestPassed; +} + +int main(int argc, char* argv[]) { + HipTest::parseStandardArguments(argc, argv, true); + + if ((p_tests <= 0) || (p_tests > 5)) { + failed("Valid arguments are from 1 to 5"); + } + + int NumDevices = 0; + HIPCHECK(hipGetDeviceCount(&NumDevices)); + bool TestStatus = true, OverAllStatus = true; + if (p_tests == 1) { + TestStatus = TestMallocManaged1(NumDevices); + if (!TestStatus) { + printf("Test Failed with float datatype.\n"); + OverAllStatus = false; + } + TestStatus = TestMallocManaged1(NumDevices); + if (!TestStatus) { + printf("Test Failed with int datatype.\n"); + OverAllStatus = false; + } + TestStatus = TestMallocManaged1(NumDevices); + if (!TestStatus) { + printf("Test Failed with unsigned char datatype.\n"); + OverAllStatus = false; + } + TestStatus = TestMallocManaged1(NumDevices); + if (!TestStatus) { + printf("Test Failed with double datatype.\n"); + OverAllStatus = false; + } + if (!OverAllStatus) { + failed(""); + } + } + if (p_tests == 2) { + TestStatus = TestMallocManaged2(NumDevices); + if (!TestStatus) { + failed("Test Failed with float datatype."); + } + } + if (p_tests == 3) { + TestStatus = NegativeTestsMallocManaged(NumDevices); + if (!TestStatus) { + failed("Negative Tests with hipMallocManaged() failed!."); + } + } + if (p_tests == 4) { + TestStatus = MultiChunkSingleDevice(NumDevices); + if (!TestStatus) { + failed("hipMallocManaged: MultiChunkSingleDevice test failed!"); + } + } + if (p_tests == 5) { + TestStatus = MultiChunkMultiDevice(NumDevices); + if (!TestStatus) { + failed("hipMallocManaged: MultiChunkMultiDevice test failed!"); + } + } + if (p_tests == 6) { + TestStatus = TestOversubscriptionMallocManaged(NumDevices); + if (!TestStatus) { + failed("hipMallocManaged: TestOversubscriptionMallocManaged 
failed!"); + } + } + passed(); +} diff --git a/projects/hip/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp b/projects/hip/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp index 7e65c47244..b3dc32810b 100755 --- a/projects/hip/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp +++ b/projects/hip/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp @@ -75,6 +75,9 @@ int main() { HIPCHECK(hipFree(Z_d)); } else { std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"< +#include "test_common.h" + +int main() { + hipSharedMemConfig_t config; + HIP_PRINT_STATUS(hipFuncSetSharedMemConfig(NULL)); + HIP_PRINT_STATUS(hipFuncSetSharedMemConfig(&config)); +} diff --git a/projects/hip/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp b/projects/hip/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp index cc976ced42..6f649708b7 100644 --- a/projects/hip/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp +++ b/projects/hip/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ diff --git a/projects/hip/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/projects/hip/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp index 840e9b6975..dbf58209fa 100644 --- a/projects/hip/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp +++ b/projects/hip/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. 
*/ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM rocclr + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM rocclr nvcc * TEST: %t * HIT_END */ diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp index 07acc4a591..3ee2f4a050 100644 --- a/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp +++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_MStrm_Mgpu.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_MStrm_Mgpu.cpp index 3a25d3331c..0d65a0f50b 100644 --- a/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_MStrm_Mgpu.cpp +++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_MStrm_Mgpu.cpp @@ -21,7 +21,7 @@ // kernel. Verify that all the kernels queued are executed before the callback. 
/* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp index a182c85010..a98fbb87c7 100644 --- a/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp +++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp @@ -21,7 +21,7 @@ // when hipStreamAddCallback() is called back to back multiple calls /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp index d21ea5da54..fb93268176 100644 --- a/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp +++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp @@ -22,7 +22,7 @@ // by hipStreamAddCallback() api. /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_ThrdBehaviour.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_ThrdBehaviour.cpp index 2eef534ea4..5e9b75adee 100644 --- a/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_ThrdBehaviour.cpp +++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_ThrdBehaviour.cpp @@ -22,7 +22,7 @@ // finish. Ideally Host thread should not wait for callback to finish. 
/* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_order.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_order.cpp index 7b66441fa6..f7d8a866f2 100644 --- a/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_order.cpp +++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamACb_order.cpp @@ -18,7 +18,7 @@ * */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp index 8da2c2f8a5..49991eec20 100644 --- a/projects/hip/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp +++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp @@ -19,7 +19,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ diff --git a/projects/hip/tests/src/test_common.h b/projects/hip/tests/src/test_common.h old mode 100644 new mode 100755 index 8897dc938e..21a4c45ac8 --- a/projects/hip/tests/src/test_common.h +++ b/projects/hip/tests/src/test_common.h @@ -41,7 +41,6 @@ THE SOFTWARE. #define HC __attribute__((hc)) - #define KNRM "\x1B[0m" #define KRED "\x1B[31m" #define KGRN "\x1B[32m" @@ -51,6 +50,19 @@ THE SOFTWARE. 
#define KCYN "\x1B[36m" #define KWHT "\x1B[37m" + // HIP Skip Return code set at cmake +#define HIP_SKIP_RETURN_CODE 127 +#define HIP_ENABLE_SKIP_TESTS 0 + +inline bool hip_skip_tests_enabled() { + return HIP_ENABLE_SKIP_TESTS; +} + +inline int hip_skip_retcode() { + // HIP Skip Return code set at cmake + return HIP_SKIP_RETURN_CODE; +} + #define passed() \ printf("%sPASSED!%s\n", KGRN, KNRM); \ exit(0); diff --git a/projects/hip/tests/unit/test_common.h b/projects/hip/tests/unit/test_common.h old mode 100644 new mode 100755 index 4b55c70164..ae6f1cba04 --- a/projects/hip/tests/unit/test_common.h +++ b/projects/hip/tests/unit/test_common.h @@ -41,7 +41,6 @@ THE SOFTWARE. #define HC __attribute__((hc)) - #define KNRM "\x1B[0m" #define KRED "\x1B[31m" #define KGRN "\x1B[32m"