From 374ead1d1934636fb0b0aecc322b5bf8c3b86c47 Mon Sep 17 00:00:00 2001 From: Vladislav Sytchenko Date: Mon, 5 Oct 2020 12:26:18 -0400 Subject: [PATCH] Revert "Merge branch 'amd-master-next' into amd-npi-next" This reverts commit 73558e33632379e2e6eb157560aaff60e0405f31. Reason for revert: Change-Id: I53322718dadde2c98f96140b8e260ec7ee9ef721 --- CMakeLists.txt | 52 +- CONTRIBUTING.md | 104 +-- bin/hip_embed_pch.sh | 9 +- bin/hip_gen_pch.sh | 36 + bin/hipcc | 3 +- docs/markdown/hip_faq.md | 2 +- hip-config.cmake.in | 28 +- include/hip/hcc_detail/device_functions.h | 138 +--- include/hip/hcc_detail/hip_runtime.h | 16 - include/hip/hcc_detail/hip_runtime_api.h | 13 +- include/hip/hip_cooperative_groups.h | 9 +- include/hip/hip_runtime_api.h | 1 - .../hip/nvcc_detail/hip_cooperative_groups.h | 12 - include/hip/nvcc_detail/hip_runtime.h | 4 +- include/hip/nvcc_detail/hip_runtime_api.h | 1 - lpl_ca/CMakeLists.txt | 2 - packaging/hip-base.txt | 11 +- packaging/hip-doc.txt | 15 +- packaging/hip-hcc.txt | 13 +- packaging/hip-nvcc.txt | 13 +- packaging/hip-rocclr.txt | 13 +- packaging/hip-samples.txt | 13 +- rocclr/CMakeLists.txt | 30 +- rocclr/hip_code_object.cpp | 25 + rocclr/hip_code_object.hpp | 2 + rocclr/hip_device.cpp | 2 +- rocclr/hip_fatbin.cpp | 7 +- rocclr/hip_global.cpp | 4 +- rocclr/hip_global.hpp | 5 + rocclr/hip_internal.hpp | 2 + rocclr/hip_memory.cpp | 67 +- rocclr/hip_module.cpp | 2 +- rocclr/hip_peer.cpp | 4 - rocclr/hip_platform.cpp | 41 +- rocclr/hip_platform.hpp | 5 + samples/0_Intro/bit_extract/CMakeLists.txt | 20 - samples/0_Intro/bit_extract/Makefile | 10 +- samples/0_Intro/module_api/CMakeLists.txt | 36 - .../0_Intro/module_api_global/CMakeLists.txt | 30 - samples/0_Intro/square/CMakeLists.txt | 21 - samples/0_Intro/square/Makefile | 7 +- samples/0_Intro/square/README.md | 42 +- .../1_Utils/hipBusBandwidth/CMakeLists.txt | 20 - .../hipBusBandwidth/hipBusBandwidth.cpp | 446 ++++++----- samples/1_Utils/hipCommander/CMakeLists.txt | 31 - .../1_Utils/hipDispatchLatency/CMakeLists.txt | 35 - samples/1_Utils/hipInfo/CMakeLists.txt | 20 - .../0_MatrixTranspose/CMakeLists.txt | 20 - .../2_Cookbook/10_inline_asm/CMakeLists.txt | 20 - .../11_texture_driver/CMakeLists.txt | 30 - .../2_Cookbook/13_occupancy/CMakeLists.txt | 20 - samples/2_Cookbook/1_hipEvent/CMakeLists.txt | 20 - .../2_Cookbook/3_shared_memory/CMakeLists.txt | 20 - samples/2_Cookbook/4_shfl/CMakeLists.txt | 20 - samples/2_Cookbook/5_2dshfl/CMakeLists.txt | 19 - .../6_dynamic_shared/CMakeLists.txt | 19 - samples/2_Cookbook/7_streams/CMakeLists.txt | 19 - samples/2_Cookbook/8_peer2peer/CMakeLists.txt | 19 - samples/2_Cookbook/9_unroll/CMakeLists.txt | 19 - samples/README.md | 27 - tests/hit/HIT.cmake | 1 - .../performance/compute/hipPerfMandelbrot.cpp | 747 ------------------ .../stream/hipPerfDeviceConcurrency.cpp | 289 ------- .../hipCGGridGroupType.cpp | 8 +- .../hipCGGridGroupTypeViaBaseType.cpp | 8 +- .../hipCGGridGroupTypeViaPublicApi.cpp | 8 +- .../hipCGMultiGridGroupType.cpp | 22 +- .../hipCGMultiGridGroupTypeViaBaseType.cpp | 39 +- .../hipCGMultiGridGroupTypeViaPublicApi.cpp | 39 +- .../hipCGThreadBlockType.cpp | 12 +- .../hipCGThreadBlockTypeViaBaseType.cpp | 12 +- .../hipCGThreadBlockTypeViaPublicApi.cpp | 12 +- tests/src/kernel/hipShflTests.cpp | 17 +- tests/src/kernel/hipShflUpDownTest.cpp | 64 +- tests/src/p2p/hipPeerToPeer_simple.cpp | 3 - .../cooperativeGrps/api_failure_tests.cpp | 280 ------- .../cooperativeGrps/cooperative_streams.cpp | 283 ------- .../grid_group_data_sharing.cpp | 303 ------- .../multi_gpu_api_failure_tests.cpp | 568 ------------- .../cooperativeGrps/multi_gpu_streams.cpp | 581 -------------- .../multi_grid_group_all_gpus.cpp | 374 --------- .../simple_grid_group_barrier.cpp | 233 ------ .../simple_multi_grid_group_barrier.cpp | 374 --------- .../device/hipDeviceGetPCIBusId.cpp | 346 ++++---- .../src/runtimeApi/device/hipSetGetDevice.cpp | 2 +- .../runtimeApi/memory/hipIpcMemAccessTest.cpp | 227 ------ .../memory/hipMallocConcurrency.cpp | 487 ------------ .../memory/hipMallocManaged_MultiScenario.cpp | 423 ---------- tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp | 3 - .../runtimeApi/memory/hipMemcpyDtoDAsync.cpp | 3 - tests/src/runtimeApi/memory/hipMemcpyPeer.cpp | 3 - .../runtimeApi/memory/hipMemcpyPeerAsync.cpp | 3 - .../runtimeApi/memory/hipMemcpyWithStream.cpp | 2 +- .../memory/hipMemcpyWithStreamMultiThread.cpp | 2 +- tests/src/runtimeApi/memory/hipMemset2D.cpp | 2 +- .../hipMultiMemcpyMultiThrdMultiStrm.cpp | 2 +- .../memory/hipMultiMemcpyMultiThread.cpp | 2 +- .../module/hipFuncSetSharedMemConfig.cpp | 27 - .../hipLaunchCoopMultiKernel.cpp | 2 +- .../hipLaunchCooperativeKernel.cpp | 7 +- .../hipModuleLoadDataMultThreadOnMultGPU.cpp | 2 +- .../module/hipModuleLoadDataMultThreaded.cpp | 2 +- .../stream/hipStreamACb_AltEnqueue.cpp | 2 +- .../stream/hipStreamACb_MStrm_Mgpu.cpp | 2 +- .../stream/hipStreamACb_MultiCalls.cpp | 2 +- .../stream/hipStreamACb_StrmSyncTiming.cpp | 2 +- .../stream/hipStreamACb_ThrdBehaviour.cpp | 2 +- .../runtimeApi/stream/hipStreamACb_order.cpp | 2 +- .../stream/hipStreamGetPriority.cpp | 2 +- tests/src/test_common.h | 14 +- tests/unit/test_common.h | 1 + 111 files changed, 750 insertions(+), 6797 deletions(-) create mode 100755 bin/hip_gen_pch.sh delete mode 100644 include/hip/nvcc_detail/hip_cooperative_groups.h delete mode 100644 samples/0_Intro/bit_extract/CMakeLists.txt delete mode 100644 samples/0_Intro/module_api/CMakeLists.txt delete mode 100644 samples/0_Intro/module_api_global/CMakeLists.txt delete mode 100644 samples/0_Intro/square/CMakeLists.txt delete mode 100644 samples/1_Utils/hipBusBandwidth/CMakeLists.txt delete mode 100644 samples/1_Utils/hipCommander/CMakeLists.txt delete mode 100644 samples/1_Utils/hipDispatchLatency/CMakeLists.txt delete mode 100644 samples/1_Utils/hipInfo/CMakeLists.txt delete mode 100644 samples/2_Cookbook/0_MatrixTranspose/CMakeLists.txt delete mode 100644 samples/2_Cookbook/10_inline_asm/CMakeLists.txt delete mode 100644 samples/2_Cookbook/11_texture_driver/CMakeLists.txt delete mode 100644 samples/2_Cookbook/13_occupancy/CMakeLists.txt delete mode 100644 samples/2_Cookbook/1_hipEvent/CMakeLists.txt delete mode 100644 samples/2_Cookbook/3_shared_memory/CMakeLists.txt delete mode 100644 samples/2_Cookbook/4_shfl/CMakeLists.txt delete mode 100644 samples/2_Cookbook/5_2dshfl/CMakeLists.txt delete mode 100644 samples/2_Cookbook/6_dynamic_shared/CMakeLists.txt delete mode 100644 samples/2_Cookbook/7_streams/CMakeLists.txt delete mode 100644 samples/2_Cookbook/8_peer2peer/CMakeLists.txt delete mode 100644 samples/2_Cookbook/9_unroll/CMakeLists.txt delete mode 100644 samples/README.md mode change 100755 => 100644 tests/hit/HIT.cmake delete mode 100644 tests/performance/compute/hipPerfMandelbrot.cpp delete mode 100644 tests/performance/stream/hipPerfDeviceConcurrency.cpp rename tests/src/{runtimeApi/cooperativeGrps => cg}/hipCGGridGroupType.cpp (97%) mode change 100755 => 100644 rename tests/src/{runtimeApi/cooperativeGrps => cg}/hipCGGridGroupTypeViaBaseType.cpp (97%) mode change 100755 => 100644 rename tests/src/{runtimeApi/cooperativeGrps => cg}/hipCGGridGroupTypeViaPublicApi.cpp (97%) mode change 100755 => 100644 rename tests/src/{runtimeApi/cooperativeGrps => cg}/hipCGMultiGridGroupType.cpp (92%) mode change 100755 => 100644 rename tests/src/{runtimeApi/cooperativeGrps => cg}/hipCGMultiGridGroupTypeViaBaseType.cpp (83%) rename tests/src/{runtimeApi/cooperativeGrps => cg}/hipCGMultiGridGroupTypeViaPublicApi.cpp (83%) rename tests/src/{runtimeApi/cooperativeGrps => cg}/hipCGThreadBlockType.cpp (95%) mode change 100755 => 100644 rename tests/src/{runtimeApi/cooperativeGrps => cg}/hipCGThreadBlockTypeViaBaseType.cpp (94%) mode change 100755 => 100644 rename tests/src/{runtimeApi/cooperativeGrps => cg}/hipCGThreadBlockTypeViaPublicApi.cpp (94%) mode change 100755 => 100644 mode change 100755 => 100644 tests/src/p2p/hipPeerToPeer_simple.cpp delete mode 100644 tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp delete mode 100644 tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp delete mode 100644 tests/src/runtimeApi/cooperativeGrps/grid_group_data_sharing.cpp delete mode 100644 tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp delete mode 100644 tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp delete mode 100644 tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp delete mode 100644 tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp delete mode 100644 tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp delete mode 100644 tests/src/runtimeApi/memory/hipIpcMemAccessTest.cpp delete mode 100644 tests/src/runtimeApi/memory/hipMallocConcurrency.cpp delete mode 100644 tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp delete mode 100644 tests/src/runtimeApi/module/hipFuncSetSharedMemConfig.cpp rename tests/src/runtimeApi/{cooperativeGrps => module}/hipLaunchCoopMultiKernel.cpp (98%) rename tests/src/runtimeApi/{cooperativeGrps => module}/hipLaunchCooperativeKernel.cpp (94%) mode change 100755 => 100644 tests/src/test_common.h mode change 100755 => 100644 tests/unit/test_common.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 0c7156c478..c5a49feaa3 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,15 +8,10 @@ set(BUILD_SHARED_LIBS ON CACHE BOOL "Build shared library (.so) or static lib ( set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") -if(NOT DEFINED __HIP_ENABLE_PCH) - set(__HIP_ENABLE_PCH ON CACHE BOOL "enable/disable pre-compiled hip headers") +if(NOT ${BUILD_SHARED_LIBS} AND NOT DEFINED ENABLE_HIP_PCH) + set(ENABLE_HIP_PCH ON CACHE BOOL "enable/disable pre-compiled hip headers") endif() -if(${__HIP_ENABLE_PCH}) - set(_pchStatus 1) -else() - set(_pchStatus 0) -endif() ############################# # Options ############################# @@ -85,8 +80,8 @@ if(GIT_FOUND) set(HIP_VERSION_PATCH ${HIP_VERSION_GITDATE}-${HIP_VERSION_GITHASH}) - if(DEFINED ENV{ROCM_LIBPATCH_VERSION}) - set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}.$ENV{ROCM_LIBPATCH_VERSION}) + if(DEFINED ENV{ROCM_BUILD_ID}) + set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}-$ENV{ROCM_BUILD_ID}-${HIP_VERSION_GITHASH}) else() set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}-${HIP_VERSION_GITHASH}) endif() @@ -95,36 +90,6 @@ else() set(HIP_PACKAGING_VERSION_PATCH "0") endif() -## Debian package specific variables -if ( DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE} ) - set ( CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE} ) -else() - set ( CPACK_DEBIAN_PACKAGE_RELEASE "local" ) -endif() -message ( "Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}" ) - -## RPM package specific variables -if ( DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE} ) - set ( CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE} ) -else() - set ( CPACK_RPM_PACKAGE_RELEASE "local" ) -endif() - -## 'dist' breaks manual builds on debian systems due to empty Provides -execute_process( COMMAND rpm --eval %{?dist} - RESULT_VARIABLE PROC_RESULT - OUTPUT_VARIABLE EVAL_RESULT - OUTPUT_STRIP_TRAILING_WHITESPACE ) - -if ( PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "" ) - string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" ) -endif() -message("CPACK_RPM_PACKAGE_RELEASE: ${CPACK_RPM_PACKAGE_RELEASE}") - -add_to_config(_versionInfo HIP_PACKAGING_VERSION_PATCH) -add_to_config(_versionInfo CPACK_DEBIAN_PACKAGE_RELEASE) -add_to_config(_versionInfo CPACK_RPM_PACKAGE_RELEASE) - add_to_config(_versionInfo HIP_VERSION_MAJOR) add_to_config(_versionInfo HIP_VERSION_MINOR) add_to_config(_versionInfo HIP_VERSION_PATCH) @@ -137,6 +102,7 @@ else () set (HIP_LIB_VERSION_PATCH ${HIP_VERSION_PATCH}) endif () set (HIP_LIB_VERSION_STRING "${HIP_LIB_VERSION_MAJOR}.${HIP_LIB_VERSION_MINOR}.${HIP_LIB_VERSION_PATCH}") + if (DEFINED ENV{ROCM_RPATH}) set (CMAKE_INSTALL_RPATH "$ENV{ROCM_RPATH}") set (CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) @@ -490,7 +456,6 @@ set(_versionInfoHeader #define HIP_VERSION_MINOR ${HIP_VERSION_MINOR} #define HIP_VERSION_PATCH ${HIP_VERSION_GITDATE} #define HIP_VERSION (HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR)\n -#define __HIP_HAS_GET_PCH ${_pchStatus}\n #endif\n ") file(WRITE "${PROJECT_BINARY_DIR}/include/hip/hip_version.h" ${_versionInfoHeader}) @@ -704,11 +669,8 @@ endif() # Testing steps ############################# # Target: test -set(HIP_ROOT_DIR ${CMAKE_CURRENT_BINARY_DIR}) +set(HIP_ROOT_DIR ${CMAKE_INSTALL_PREFIX}) set(HIP_SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR}) -if(HIP_PLATFORM STREQUAL "nvcc") - execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/include" "${CMAKE_CURRENT_BINARY_DIR}/include" RESULT_VARIABLE RUN_HIT ERROR_QUIET) -endif() execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/cmake" "${HIP_ROOT_DIR}/cmake" RESULT_VARIABLE RUN_HIT ERROR_QUIET) if(${RUN_HIT} EQUAL 0) execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/bin" "${HIP_ROOT_DIR}/bin" RESULT_VARIABLE RUN_HIT ERROR_QUIET) @@ -751,7 +713,7 @@ endif() ############################# # Target: clang if(HIP_HIPCC_EXECUTABLE) - add_custom_target(analyze + add_custom_target(analyze COMMAND ${HIP_HIPCC_EXECUTABLE} -fvisibility=hidden -fvisibility-inlines-hidden --analyze --analyzer-outputtext -isystem /opt/rocm/include ${HIP_HCC_BUILD_FLAGS} -Wno-unused-command-line-argument -I/opt/rocm/include -c src/*.cpp -Iinclude/ -I./ WORKING_DIRECTORY ${HIP_SRC_PATH}) if(CPPCHECK_EXE) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 750e6759c2..d9d353681d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,15 +1,15 @@ -# Contributor Guidelines +# Contributor Guidelines ## Make Tips -When building HIP, you will likely want to build and install to a local user-accessible directory (rather than /opt/rocm). -This can be easily be done by setting the -DCMAKE_INSTALL_PREFIX variable when running cmake. Typical use case is to +When building HIP, you will likely want to build and install to a local user-accessible directory (rather than /opt/rocm). +This can be easily be done by setting the -DCMAKE_INSTALL_PREFIX variable when running cmake. Typical use case is to set CMAKE_INSTALL_PREFIX to your HIP git root, and then ensure HIP_PATH points to this directory. For example ``` cmake .. -DCMAKE_INSTALL_PREFIX=.. make install -export HIP_PATH= +export HIP_PATH= ``` After making HIP, don't forget the "make install" step ! @@ -21,110 +21,118 @@ After making HIP, don't forget the "make install" step ! - Add a translation to the hipify-clang tool ; many examples abound. - For stat tracking purposes, place the API into an appropriate stat category ("dev", "mem", "stream", etc). - Add a inlined NVCC implementation for the function in include/hip/nvcc_detail/hip_runtime_api.h. - - These are typically headers - - Add an HIP_ROCclr definition and Doxygen comments for the function in include/hcc_detail/hip_runtime_api.h - - Source implementation typically go in hip/rocclr/hip_*.cpp. The implementation involve calls to HIP runtime (ie for hipStream_t). + - These are typically headers + - Add an HCC definition and Doxygen comments for the function in include/hcc_detail/hip_runtime_api.h + - Source implementation typically go in src/hcc_detail/hip_hcc.cpp. The implementation may involve + calls to HCC runtime or HSA runtime, or interact with other pieces of the HIP runtime (ie for + hipStream_t). -## Check HIP-Clang version -In some cases new HIP-Clang features are tied to specified releases, and it can be useful to check the current version is sufficiently new enough to support the desired feature. - -HIP runtime version +#### Testing HCC version +In some cases new HIP features are tied to specified releases of HCC, and it can be useful to determine at compile-time +if the current HCC compiler is sufficiently new enough to support the desired feature. The `__hcc_workweek__` compiler +define is a monotonically increasing integer value that combines the year + workweek + day-of-week (0-6, Sunday is 0) +(ie 15403, 16014, etc). +The granularity is one day, so __hcc_workweek__ can only be used to distinguish compiler builds that are at least one day apart. ``` -> cat /opt/rocm/hip/bin/.hipVersion -# Auto-generated by cmake -HIP_VERSION_MAJOR=3 -HIP_VERSION_MINOR=9 -HIP_VERSION_PATCH=20345-519ef3f2 +#ifdef __hcc_workweek_ > 16014 +// use cool new HCC feature here +#endif ``` -HIP-Clang compiler version - +Additionally, hcc binary can print the work-week to stdout: ("16014" in the version info below.)4 ``` -$ /opt/rocm/llvm/bin/clang -v -clang version 11.0.0 (/src/external/llvm-project/clang 075fedd3fd2f4d9d8cca79d0cd51f64c5ef21432) +> /opt/rocm/hcc/bin/hcc -v +HCC clang version 3.5.0 (based on HCC 0.8.16014-81f8a3f-f155163-5a1009a LLVM 3.5.0svn) Target: x86_64-unknown-linux-gnu Thread model: posix -InstalledDir: /opt/rocm/llvm/bin -Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7 -Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7.5.0 -Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/8 -Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/9 -Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/9 +Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8 +Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8.4 +Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.9 +Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.9.1 +Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8 Candidate multilib: .;@m64 Candidate multilib: 32;@m32 Candidate multilib: x32;@mx32 Selected multilib: .;@m64 ``` +The unix `date` command can print the HCC-format work-week for a specific date , ie: +``` +> date --utc +%y%U%w -d 2015-11-09 +15451 +``` + ## Unit Testing Environment -HIP includes unit tests in the tests/src directory. +HIP includes unit tests in the tests/src directory. When adding a new HIP feature, add a new unit test as well. See [tests/README.md](README.md) for more information. ## Development Flow - -Directed tests provide a great place to develop new features alongside the associated test. +It is recommended that developers set the flag HIP_BUILD_LOCAL=1 so that the unit testing environment automatically rebuilds libhip_hcc.a and the tests when a change it made to the HIP source. +Directed tests provide a great place to develop new features alongside the associated test. For applications and benchmarks outside the directed test environment, developments should use a two-step development flow: -- #1. Compile, link, and install HIP/ROCclr. See [Installation](README.md#Installation) notes. -- #2. Relink the target application to include changes in HIP runtime file. +- #1. Compile, link, and install HCC. See [Installation](README.md#Installation) notes. +- #2. Relink the target application to include changes in the libhip_hcc.a file. ## Environment Variables -- **HIP_PATH** : Location of HIP include, src, bin, lib directories. -- **HCC_ROCCLR_HOME** : Path to HIP/ROCclr directory, used on AMD platforms. Default /opt/rocm/rocclr. +- **HIP_PATH** : Location of HIP include, src, bin, lib directories. +- **HCC_HOME** : Path to HCC compiler. Default /opt/rocm/hcc. - **HSA_PATH** : Path to HSA include, lib. Default /opt/rocm/hsa. - **CUDA_PATH* : On nvcc system, this points to root of CUDA installation. -## Contribution guidelines ## +### Contribution guidelines ### Features (ie functions, classes, types) defined in hip*.h should resemble CUDA APIs. The HIP interface is designed to be very familiar for CUDA programmers. -Differences or limitations of HIP APIs as compared to CUDA APIs should be clearly documented and described. +Differences or limitations of HIP APIs as compared to CUDA APIs should be clearly documented and described. -### Coding Guidelines (in brief) +## Coding Guidelines (in brief) - Code Indentation: - Tabs should be expanded to spaces. - Use 4 spaces indentation. - Capitalization and Naming - - Prefer camelCase for HIP interfaces and internal symbols. Note HCC uses _ for separator. + - Prefer camelCase for HIP interfaces and internal symbols. Note HCC uses _ for separator. This guideline is not yet consistently followed in HIP code - eventual compliance is aspirational. - Member variables should begin with a leading "_". This allows them to be easily distinguished from other variables or functions. + - {} placement - For functions, the opening { should be placed on a new line. - For if/else blocks, the opening { is placed on same line as the if/else. Use a space to separate {/" from if/else. Example ''' if (foo) { - doFoo() - } else { + doFoo() + } else { doFooElse(); } ''' - namespace should be on same line as { and separated by a space. - Single-line if statement should still use {/} pair (even though C++ does not require). - Miscellaneous - - All references in function parameter lists should be const. + - All references in function parameter lists should be const. - "ihip" = internal hip structures. These should not be exposed through the HIP API. - Keyword TODO refers to a note that should be addressed in long-term. Could be style issue, software architecture, or known bugs. - FIXME refers to a short-term bug that needs to be addressed. - HIP_INIT_API() should be placed at the start of each top-level HIP API. This function will make sure the HIP runtime is initialized, and also constructs an appropriate API string for tracing and CodeXL marker tracing. The arguments to HIP_INIT_API should match - those of the parent function. -- ihipLogStatus should only be called from top-level HIP APIs,and should be called to log and return the error code. The error code + those of the parent function. +- ihipLogStatus should only be called from top-level HIP APIs,and should be called to log and return the error code. The error code is used by the GetLastError and PeekLastError functions - if a HIP API simply returns, then the error will not be logged correctly. - All HIP environment variables should begin with the keyword HIP_ Environment variables should be long enough to describe their purpose but short enough so they can be remembered - perhaps 10-20 characters, with 3-4 parts separated by underscores. To see the list of current environment variables, along with their values, set HIP_PRINT_ENV and run any hip applications on ROCm platform . - HIPCC or other tools may support additional environment variables which should follow the above convention. + HIPCC or other tools may support additional environment variables which should follow the above convention. -### Presubmit Testing: -Before checking in or submitting a pull request, run all directed tests (see tests/README.md) and all Rodinia tests. + +#### Presubmit Testing: +Before checking in or submitting a pull request, run all directed tests (see tests/README.md) and all Rodinia tests. Ensure pass results match starting point: ```shell @@ -133,13 +141,13 @@ Ensure pass results match starting point: ``` -### Checkin messages +#### Checkin messages Follow existing best practice for writing a good Git commit message. Some tips: http://chris.beams.io/posts/git-commit/ https://robots.thoughtbot.com/5-useful-tips-for-a-better-commit-message -In particular : - - Use imperative voice, ie "Fix this bug", "Refactor the XYZ routine", "Update the doc". +In particular : + - Use imperative voice, ie "Fix this bug", "Refactor the XYZ routine", "Update the doc". Not : "Fixing the bug", "Fixed the bug", "Bug fix", etc. - Subject should summarize the commit. Do not end subject with a period. Use a blank line after the subject. diff --git a/bin/hip_embed_pch.sh b/bin/hip_embed_pch.sh index 0f2cbabd84..8fe3c20f98 100755 --- a/bin/hip_embed_pch.sh +++ b/bin/hip_embed_pch.sh @@ -1,7 +1,8 @@ #!/bin/bash #set -x -LLVM_DIR="$1/../../../" + +ROCM_PATH=${ROCM_PATH:-/opt/rocm} tmp=/tmp/hip_pch.$$ mkdir -p $tmp @@ -46,12 +47,12 @@ __hip_pch_size: .long __hip_pch_size - __hip_pch EOF -$LLVM_DIR/bin/clang -O3 -c -std=c++17 -isystem $LLVM_DIR/lib/clang/11.0.0/include/.. -isystem /opt/rocm/include -nogpulib --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch.cui +$ROCM_PATH/llvm/bin/clang -O3 -c -std=c++17 -isystem /opt/rocm/llvm/lib/clang/11.0.0/include/.. -isystem /opt/rocm/include -nogpulib --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch.cui cat $tmp/hip_macros.h >> $tmp/pch.cui -$LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip.pch -x hip-cpp-output - <$tmp/pch.cui +$ROCM_PATH/llvm/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip.pch -x hip-cpp-output - <$tmp/pch.cui -$LLVM_DIR/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj +$ROCM_PATH/llvm/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj rm -rf $tmp diff --git a/bin/hip_gen_pch.sh b/bin/hip_gen_pch.sh new file mode 100755 index 0000000000..b212177119 --- /dev/null +++ b/bin/hip_gen_pch.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +#set -x + +cat >/tmp/hip_macros.h </tmp/hip_pch.h </tmp/pch.cui + +cat /tmp/hip_macros.h >> /tmp/pch.cui + +/opt/rocm/llvm/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o /tmp/hip.pch -x hip-cpp-output - (tmp[1]) << 32ull) | static_cast(tmp[0]); - unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); - return tmp1; - #else - static_assert(sizeof(unsigned long) == sizeof(unsigned int), ""); - return static_cast(__shfl(static_cast(var), src_lane, width)); - #endif -} -__device__ -inline long long __shfl(long long var, int src_lane, int width = warpSize) { static_assert(sizeof(long long) == 2 * sizeof(int), ""); @@ -397,22 +378,8 @@ long long __shfl(long long var, int src_lane, int width = warpSize) long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } -__device__ -inline -unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) { - static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), ""); - static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); - unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); - tmp[0] = __shfl(tmp[0], src_lane, width); - tmp[1] = __shfl(tmp[1], src_lane, width); - - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); - unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); - return tmp1; -} - -__device__ + __device__ inline int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) { int self = __lane_id(); @@ -468,28 +435,6 @@ long __shfl_up(long var, unsigned int lane_delta, int width = warpSize) return static_cast(__shfl_up(static_cast(var), lane_delta, width)); #endif } - -__device__ -inline -unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize) -{ - #ifndef _MSC_VER - static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), ""); - static_assert(sizeof(unsigned long) == sizeof(uint64_t), ""); - - unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); - tmp[0] = __shfl_up(tmp[0], lane_delta, width); - tmp[1] = __shfl_up(tmp[1], lane_delta, width); - - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); - unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); - return tmp1; - #else - static_assert(sizeof(unsigned long) == sizeof(unsigned int), ""); - return static_cast(__shfl_up(static_cast(var), lane_delta, width)); - #endif -} - __device__ inline long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize) @@ -504,20 +449,6 @@ long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize return tmp1; } -__device__ -inline -unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize) -{ - static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), ""); - static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); - unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); - tmp[0] = __shfl_up(tmp[0], lane_delta, width); - tmp[1] = __shfl_up(tmp[1], lane_delta, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); - unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); - return tmp1; -} - __device__ inline int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) { @@ -576,26 +507,6 @@ long __shfl_down(long var, unsigned int lane_delta, int width = warpSize) } __device__ inline -unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize) -{ - #ifndef _MSC_VER - static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), ""); - static_assert(sizeof(unsigned long) == sizeof(uint64_t), ""); - - unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); - tmp[0] = __shfl_down(tmp[0], lane_delta, width); - tmp[1] = __shfl_down(tmp[1], lane_delta, width); - - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); - unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); - return tmp1; - #else - static_assert(sizeof(unsigned long) == sizeof(unsigned int), ""); - return static_cast(__shfl_down(static_cast(var), lane_delta, width)); - #endif -} -__device__ -inline long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize) { static_assert(sizeof(long long) == 2 * sizeof(int), ""); @@ -607,19 +518,6 @@ long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSi long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } -__device__ -inline -unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize) -{ - static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), ""); - static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); - unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); - tmp[0] = __shfl_down(tmp[0], lane_delta, width); - tmp[1] = __shfl_down(tmp[1], lane_delta, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); - unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); - return tmp1; -} __device__ inline @@ -679,26 +577,6 @@ long __shfl_xor(long var, int lane_mask, int width = warpSize) } __device__ inline -unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize) -{ - #ifndef _MSC_VER - static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), ""); - static_assert(sizeof(unsigned long) == sizeof(uint64_t), ""); - - unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); - tmp[0] = __shfl_xor(tmp[0], lane_mask, width); - tmp[1] = __shfl_xor(tmp[1], lane_mask, width); - - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); - unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); - return tmp1; - #else - static_assert(sizeof(unsigned long) == sizeof(unsigned int), ""); - return static_cast(__shfl_xor(static_cast(var), lane_mask, width)); - #endif -} -__device__ -inline long long __shfl_xor(long long var, int lane_mask, int width = warpSize) { static_assert(sizeof(long long) == 2 * sizeof(int), ""); @@ -710,19 +588,7 @@ long long __shfl_xor(long long var, int lane_mask, int width = warpSize) long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } -__device__ -inline -unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize) -{ - static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), ""); - static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); - unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); - tmp[0] = __shfl_xor(tmp[0], lane_mask, width); - tmp[1] = __shfl_xor(tmp[1], lane_mask, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); - unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); - return tmp1; -} + #define MASK1 0x00ff00ff #define MASK2 0xff00ff00 diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index 0a173bb466..0e5820a016 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -487,22 +487,6 @@ struct __HIP_Coordinates { #endif }; -template -#if !defined(_MSC_VER) -__attribute__((weak)) -#endif -constexpr typename __HIP_Coordinates::X __HIP_Coordinates::x; -template -#if !defined(_MSC_VER) -__attribute__((weak)) -#endif -constexpr typename __HIP_Coordinates::Y __HIP_Coordinates::y; -template -#if !defined(_MSC_VER) -__attribute__((weak)) -#endif -constexpr typename __HIP_Coordinates::Z __HIP_Coordinates::z; - extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_size(uint); inline __device__ diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index 74c0fb5f69..37fcccf192 100755 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -345,16 +345,13 @@ typedef struct hipLaunchParams_t { hipStream_t stream; ///< Stream identifier } hipLaunchParams; -#if __HIP_HAS_GET_PCH -/** - * Internal use only. This API may change in the future - * Pre-Compiled header for online compilation - * - */ - void __hipGetPCH(const char** pch, unsigned int*size); +// Pre-Compiled header for online compilation +#ifdef ENABLE_HIP_PCH +extern const char* __hip_pch; +extern unsigned __hip_pch_size; +void __hipGetPCH(const char** pch, unsigned int*size); #endif - // Doxygen end group GlobalDefs /** @} */ diff --git a/include/hip/hip_cooperative_groups.h b/include/hip/hip_cooperative_groups.h index 41f36378bb..d919e83c7f 100644 --- a/include/hip/hip_cooperative_groups.h +++ b/include/hip/hip_cooperative_groups.h @@ -28,17 +28,14 @@ THE SOFTWARE. */ #ifndef HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H -#define HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H - -#include -#include +#define HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H #if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__) -#if __cplusplus && defined(__clang__) && defined(__HIP__) +#if __cplusplus #include #endif #elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__) -#include +#include #else #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__"); #endif diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h index 3a26fb74f4..4412bbd7da 100644 --- a/include/hip/hip_runtime_api.h +++ b/include/hip/hip_runtime_api.h @@ -32,7 +32,6 @@ THE SOFTWARE. #include // for getDeviceProp -#include #include enum { diff --git a/include/hip/nvcc_detail/hip_cooperative_groups.h b/include/hip/nvcc_detail/hip_cooperative_groups.h deleted file mode 100644 index 113e600eec..0000000000 --- a/include/hip/nvcc_detail/hip_cooperative_groups.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H -#define HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H - -// Include CUDA headers -#include -#include - -// Include HIP wrapper headers around CUDA -#include -#include - -#endif // HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H diff --git a/include/hip/nvcc_detail/hip_runtime.h b/include/hip/nvcc_detail/hip_runtime.h index e7c3eaf32a..c13540df54 100644 --- a/include/hip/nvcc_detail/hip_runtime.h +++ b/include/hip/nvcc_detail/hip_runtime.h @@ -104,13 +104,13 @@ typedef int hipLaunchParm; #define HIP_DYNAMIC_SHARED_ATTRIBUTE #ifdef __HIP_DEVICE_COMPILE__ -#define abort_() \ +#define abort() \ { asm("trap;"); } #undef assert #define assert(COND) \ { \ if (!COND) { \ - abort_(); \ + abort(); \ } \ } #endif diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index ce1469804e..faa0bf7d7b 100755 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -26,7 +26,6 @@ THE SOFTWARE. #include #include #include -#include #ifdef __cplusplus extern "C" { diff --git a/lpl_ca/CMakeLists.txt b/lpl_ca/CMakeLists.txt index 2473fbc254..c272273c09 100644 --- a/lpl_ca/CMakeLists.txt +++ b/lpl_ca/CMakeLists.txt @@ -20,7 +20,6 @@ target_include_directories(lpl target_compile_options(lpl PUBLIC -Wall) target_link_libraries(lpl PUBLIC pthread) -add_custom_command(TARGET lpl POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/lpl ${PROJECT_BINARY_DIR}/bin/lpl) install(TARGETS lpl RUNTIME DESTINATION bin) #-------------------------------------LPL--------------------------------------# @@ -44,7 +43,6 @@ find_package(hsa-runtime64 REQUIRED CONFIG target_link_libraries(ca PUBLIC hsa-runtime64::hsa-runtime64 ) target_compile_options(ca PUBLIC -DDISABLE_REDUCED_GPU_BLOB_COPY -Wall) -add_custom_command(TARGET ca POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/ca ${PROJECT_BINARY_DIR}/bin/ca) install(TARGETS ca RUNTIME DESTINATION bin) #-------------------------------------CA---------------------------------------# diff --git a/packaging/hip-base.txt b/packaging/hip-base.txt index 9b10ec2c3f..7ba7d3b93a 100644 --- a/packaging/hip-base.txt +++ b/packaging/hip-base.txt @@ -21,23 +21,22 @@ set(CPACK_PACKAGE_NAME "hip-base") set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [BASE]") set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.") set(CPACK_PACKAGE_CONTACT "Maneesh Gupta ") +set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@) set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@) set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@) -set(CPACK_PACKAGE_VERSION_PATCH @HIP_PACKAGING_VERSION_PATCH@) -set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@) +set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@) +set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}) set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") -set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@) -set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT") +set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb) set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm") set(CPACK_DEBIAN_PACKAGE_DEPENDS "perl (>= 5.0),libfile-which-perl") set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-base") set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_base") set(CPACK_BINARY_RPM "ON") -set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@) -set(CPACK_RPM_FILE_NAME "RPM-DEFAULT") +set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm) set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") diff --git a/packaging/hip-doc.txt b/packaging/hip-doc.txt index 30f05cb6e6..911f2486fd 100644 --- a/packaging/hip-doc.txt +++ b/packaging/hip-doc.txt @@ -24,26 +24,25 @@ set(CPACK_PACKAGE_NAME "hip-doc") set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [DOCUMENTATION]") set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.") set(CPACK_PACKAGE_CONTACT "Maneesh Gupta ") +set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@) set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@) set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@) -set(CPACK_PACKAGE_VERSION_PATCH @HIP_PACKAGING_VERSION_PATCH@) -set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@) +set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@) +set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}) set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") -set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@) -set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT") -set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE})") +set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb) +set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})") set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-doc") set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_doc") set(CPACK_BINARY_RPM "ON") -set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@) -set(CPACK_RPM_FILE_NAME "RPM-DEFAULT") +set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm) set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION}) -set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}") +set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}") set(CPACK_RPM_PACKAGE_OBSOLETES "hip_doc") set(CPACK_RPM_PACKAGE_CONFLICTS "hip_doc") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") diff --git a/packaging/hip-hcc.txt b/packaging/hip-hcc.txt index a17bd8ca86..d084e8d966 100644 --- a/packaging/hip-hcc.txt +++ b/packaging/hip-hcc.txt @@ -28,29 +28,24 @@ endif() set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [HCC]") set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.") set(CPACK_PACKAGE_CONTACT "Maneesh Gupta ") +set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@) set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@) set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@) set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@) -set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@) +set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}) set(CPACK_GENERATOR "TGZ;DEB;RPM") - set(CPACK_BINARY_DEB "ON") -set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@) -set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT") set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm") -set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)") set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc") set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_hcc") - set(CPACK_BINARY_RPM "ON") -set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@) -set(CPACK_RPM_FILE_NAME "RPM-DEFAULT") set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION}) -set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, comgr >= 1.1") +set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, comgr >= 1.1") set(CPACK_RPM_PACKAGE_OBSOLETES "hip_hcc") set(CPACK_RPM_PACKAGE_CONFLICTS "hip_hcc") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") diff --git a/packaging/hip-nvcc.txt b/packaging/hip-nvcc.txt index f5d43533dc..5d3d91ffb6 100644 --- a/packaging/hip-nvcc.txt +++ b/packaging/hip-nvcc.txt @@ -10,29 +10,28 @@ set(CPACK_PACKAGE_NAME "hip-nvcc") set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [NVCC]") set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.") set(CPACK_PACKAGE_CONTACT "Maneesh Gupta ") +set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@) set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@) set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@) set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@) -set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@) +set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}) set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") -set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@) -set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT") +set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb) set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm") -set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), cuda (>= 7.5)") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), cuda (>= 7.5)") set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-nvcc") set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_nvcc") set(CPACK_BINARY_RPM "ON") -set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@) -set(CPACK_RPM_FILE_NAME "RPM-DEFAULT") +set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm) set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION}) -set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, cuda >= 7.5") +set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, cuda >= 7.5") set(CPACK_RPM_PACKAGE_OBSOLETES "hip_nvcc") set(CPACK_RPM_PACKAGE_CONFLICTS "hip_nvcc") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") diff --git a/packaging/hip-rocclr.txt b/packaging/hip-rocclr.txt index ee5ec0c3db..6f5c16bb96 100644 --- a/packaging/hip-rocclr.txt +++ b/packaging/hip-rocclr.txt @@ -33,28 +33,27 @@ set(HCC_PACKAGE_NAME "rocclr") set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [ROCClr]") set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.") set(CPACK_PACKAGE_CONTACT "Maneesh Gupta ") +set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@) set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@) set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@) set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@) -set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@) +set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}) set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") -set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@) -set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT") +set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb) set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm") -set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocminfo, hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), comgr (>= 1.1), llvm-amdgpu") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocminfo, hip-base (= ${CPACK_PACKAGE_VERSION}), comgr (>= 1.1), llvm-amdgpu") set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc (= ${CPACK_PACKAGE_VERSION})") set(CPACK_BINARY_RPM "ON") -set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@) -set(CPACK_RPM_FILE_NAME "RPM-DEFAULT") +set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm) set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION}) -set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocminfo, hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, comgr >= 1.1, llvm-amdgpu") +set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocminfo, hip-base = ${HIP_BASE_VERSION}, comgr >= 1.1, llvm-amdgpu") set(CPACK_RPM_PACKAGE_PROVIDES "hip-hcc = ${HIP_BASE_VERSION}") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") diff --git a/packaging/hip-samples.txt b/packaging/hip-samples.txt index 34f0dddd2e..6481cf7bde 100644 --- a/packaging/hip-samples.txt +++ b/packaging/hip-samples.txt @@ -12,26 +12,25 @@ set(CPACK_PACKAGE_NAME "hip-samples") set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [SAMPLES]") set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.") set(CPACK_PACKAGE_CONTACT "Maneesh Gupta ") +set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@) set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@) set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@) set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@) -set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@) +set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}) set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") -set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@) -set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT") -set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE})") +set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb) +set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})") set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-samples") set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_samples") set(CPACK_BINARY_RPM "ON") -set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@) -set(CPACK_RPM_FILE_NAME "RPM-DEFAULT") +set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm) set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION}) -set(CPACK_RPM_PACKAGE_REQUIRES "hip-rocclr = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}") +set(CPACK_RPM_PACKAGE_REQUIRES "hip-rocclr = ${HIP_BASE_VERSION}") set(CPACK_RPM_PACKAGE_OBSOLETES "hip_samples") set(CPACK_RPM_PACKAGE_CONFLICTS "hip_samples") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") diff --git a/rocclr/CMakeLists.txt b/rocclr/CMakeLists.txt index ec1dc50407..187edab746 100755 --- a/rocclr/CMakeLists.txt +++ b/rocclr/CMakeLists.txt @@ -96,14 +96,6 @@ find_package(amd_comgr REQUIRED CONFIG message(STATUS "Code Object Manager found at ${amd_comgr_DIR}.") -find_package(LLVM REQUIRED CONFIG - PATHS - /opt/rocm/llvm - PATH_SUFFIXES - lib/cmake/llvm) - -message(STATUS "llvm found at ${LLVM_DIR}.") - add_library(hip64 OBJECT hip_context.cpp hip_code_object.cpp @@ -156,9 +148,10 @@ endif() # Short-Term solution for pre-compiled headers for online compilation # Enable pre compiled header -if(${__HIP_ENABLE_PCH}) - execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_embed_pch.sh ${LLVM_DIR}") - add_definitions(-D__HIP_ENABLE_PCH) +if(${ENABLE_HIP_PCH}) + execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_gen_pch.sh") + execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_embed_pch.sh") + add_definitions(-DENABLE_HIP_PCH) endif() # Enable profiling API @@ -223,7 +216,7 @@ add_library(device INTERFACE) target_link_libraries(device INTERFACE host) # Short-Term solution for pre-compiled headers for online compilation -if(${__HIP_ENABLE_PCH}) +if(${ENABLE_HIP_PCH}) target_link_libraries(amdhip64 PRIVATE ${CMAKE_BINARY_DIR}/hip_pch.o) endif() @@ -234,18 +227,6 @@ endif() # filename. if(${BUILD_SHARED_LIBS}) target_link_libraries(amdhip64 PRIVATE amdrocclr_static Threads::Threads dl hsa-runtime64::hsa-runtime64) - - add_custom_command(TARGET amdhip64 POST_BUILD COMMAND - ${CMAKE_COMMAND} -E create_symlink ${PROJECT_BINARY_DIR}/lib/libamdhip64.so.${HIP_LIB_VERSION_STRING} - ${PROJECT_BINARY_DIR}/lib/libhip_hcc.so.${HIP_LIB_VERSION_MAJOR}) - add_custom_command(TARGET amdhip64 POST_BUILD COMMAND - ${CMAKE_COMMAND} -E create_symlink ${PROJECT_BINARY_DIR}/lib/libhip_hcc.so.${HIP_LIB_VERSION_MAJOR} - ${PROJECT_BINARY_DIR}/lib/libhip_hcc.so) - add_custom_command(TARGET amdhip64 POST_BUILD COMMAND - ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/.hipInfo ${PROJECT_BINARY_DIR}/lib/.hipInfo) - add_custom_command(TARGET amdhip64 POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_directory - ${PROJECT_SOURCE_DIR}/include ${PROJECT_BINARY_DIR}/include) - INSTALL(PROGRAMS $ DESTINATION lib COMPONENT MAIN) else() target_link_libraries(amdhip64 PRIVATE Threads::Threads dl hsa-runtime64::hsa-runtime64 amd_comgr) @@ -263,7 +244,6 @@ else() INSTALL(PROGRAMS $ DESTINATION lib COMPONENT MAIN) endif() - INSTALL(TARGETS amdhip64 host device EXPORT hip-targets DESTINATION ${LIB_INSTALL_DIR}) INSTALL(EXPORT hip-targets DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} NAMESPACE hip::) diff --git a/rocclr/hip_code_object.cpp b/rocclr/hip_code_object.cpp index b0979c5246..c6a866c9c4 100755 --- a/rocclr/hip_code_object.cpp +++ b/rocclr/hip_code_object.cpp @@ -202,10 +202,19 @@ hipError_t DynCO::populateDynGlobalVars() { return hipErrorSharedObjectSymbolNotFound; } + if (!dev_program->getUndefinedVarFromCodeObj(&undef_var_names)) { + DevLogPrintfError("Could not get undefined Variables for Module: 0x%x \n", module()); + return hipErrorSharedObjectSymbolNotFound; + } + for (auto& elem : var_names) { vars_.insert(std::make_pair(elem, new Var(elem, Var::DeviceVarKind::DVK_Variable, 0, 0, 0, nullptr))); } + for (auto& elem : undef_var_names) { + vars_.insert(std::make_pair(elem, new Var(elem, Var::DeviceVarKind::DVK_Texture, 0, 0, 0, nullptr))); + } + return hipSuccess; } @@ -368,4 +377,20 @@ hipError_t StatCO::getStatGlobalVar(const void* hostVar, int deviceId, hipDevice *size_ptr = dvar->size(); return hipSuccess; } + +hipError_t StatCO::getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod, + hipDeviceptr_t* dev_ptr, size_t* size_ptr) { + amd::ScopedLock lock(sclock_); + + for (auto& elem : vars_) { + if ((elem.second->name() == hostVar) + && (elem.second->module(deviceId) == hmod)) { + *dev_ptr = elem.second->device_ptr(deviceId); + *size_ptr = elem.second->device_size(deviceId); + return hipSuccess; + } + } + + return hipErrorNotFound; +} }; //namespace: hip diff --git a/rocclr/hip_code_object.hpp b/rocclr/hip_code_object.hpp index 0cc2a7051a..f5f179570b 100755 --- a/rocclr/hip_code_object.hpp +++ b/rocclr/hip_code_object.hpp @@ -118,6 +118,8 @@ public: hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId); hipError_t getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr, size_t* size_ptr); + hipError_t getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod, + hipDeviceptr_t* dev_ptr, size_t* size_ptr); private: friend class ::PlatformState; diff --git a/rocclr/hip_device.cpp b/rocclr/hip_device.cpp index c0dbc89970..70548d5328 100644 --- a/rocclr/hip_device.cpp +++ b/rocclr/hip_device.cpp @@ -155,7 +155,7 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device ) ::strncpy(deviceProps.name, info.boardName_, 128); deviceProps.totalGlobalMem = info.globalMemSize_; deviceProps.sharedMemPerBlock = info.localMemSizePerCU_; - deviceProps.regsPerBlock = info.availableRegistersPerCU_; + deviceProps.regsPerBlock = info.availableSGPRs_; deviceProps.warpSize = info.wavefrontWidth_; deviceProps.maxThreadsPerBlock = info.maxWorkGroupSize_; deviceProps.maxThreadsDim[0] = info.maxWorkItemSizes_[0]; diff --git a/rocclr/hip_fatbin.cpp b/rocclr/hip_fatbin.cpp index 8072c18b36..95a91063a2 100755 --- a/rocclr/hip_fatbin.cpp +++ b/rocclr/hip_fatbin.cpp @@ -12,7 +12,7 @@ FatBinaryDeviceInfo::~FatBinaryDeviceInfo() { } FatBinaryInfo::FatBinaryInfo(const char* fname, const void* image) - : fdesc_(amd::Os::FDescInit()), fsize_(0), image_(image), uri_(std::string()) { + : fdesc_(-1), fsize_(0), image_(image), uri_(std::string()) { guarantee(fname || image); if (fname != nullptr) { @@ -41,7 +41,7 @@ FatBinaryInfo::~FatBinaryInfo() { } fname_ = std::string(); - fdesc_ = amd::Os::FDescInit(); + fdesc_ = -1; fsize_ = 0; image_ = nullptr; uri_ = std::string(); @@ -64,9 +64,6 @@ hipError_t FatBinaryInfo::ExtractFatBinary(const std::vector& devi if (!amd::Os::GetFileHandle(fname_.c_str(), &fdesc_, &fsize_)) { return hipErrorFileNotFound; } - if (fsize_ == 0) { - return hipErrorInvalidKernelFile; - } // Extract the code object from file hip_error = CodeObject::ExtractCodeObjectFromFile(fdesc_, fsize_, diff --git a/rocclr/hip_global.cpp b/rocclr/hip_global.cpp index 46e6efcf52..bed2dcd850 100755 --- a/rocclr/hip_global.cpp +++ b/rocclr/hip_global.cpp @@ -5,9 +5,7 @@ #include "hip_code_object.hpp" #include "platform/program.hpp" -#ifdef __HIP_ENABLE_PCH -extern const char __hip_pch[]; -extern unsigned __hip_pch_size; +#ifdef ENABLE_HIP_PCH void __hipGetPCH(const char** pch, unsigned int *size) { *pch = __hip_pch; *size = __hip_pch_size; diff --git a/rocclr/hip_global.hpp b/rocclr/hip_global.hpp index fd57ecfb50..3888daf30b 100755 --- a/rocclr/hip_global.hpp +++ b/rocclr/hip_global.hpp @@ -95,6 +95,11 @@ public: hipError_t getStatDeviceVar(DeviceVar** dvar, int deviceId); void resize_dVar(size_t size) { dVar_.resize(size); } + //Accessor for device_ptrs. + std::string name() const { return name_; } + hipModule_t module(int deviceId) const { return nullptr; } + hipDeviceptr_t device_ptr(int deviceId) const { return dVar_[deviceId]->device_ptr(); } + size_t device_size(int deviceId) const { return dVar_[deviceId]->size(); } FatBinaryInfo** moduleInfo() { return modules_; }; private: diff --git a/rocclr/hip_internal.hpp b/rocclr/hip_internal.hpp index 7e0cc8b9a2..a950961ea7 100755 --- a/rocclr/hip_internal.hpp +++ b/rocclr/hip_internal.hpp @@ -252,6 +252,8 @@ extern int ihipGetDevice(); extern hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags); extern amd::Memory* getMemoryObject(const void* ptr, size_t& offset); extern amd::Memory* getMemoryObjectWithOffset(const void* ptr, const size_t size); +extern bool CL_CALLBACK getSvarInfo(cl_program program, std::string var_name, void** var_addr, + size_t* var_size); constexpr bool kOptionChangeable = true; constexpr bool kNewDevProg = false; diff --git a/rocclr/hip_memory.cpp b/rocclr/hip_memory.cpp index 8fd9b05cdb..b0e1d6abdd 100755 --- a/rocclr/hip_memory.cpp +++ b/rocclr/hip_memory.cpp @@ -124,7 +124,7 @@ hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags) if (*ptr == nullptr) { size_t free = 0, total =0; hipMemGetInfo(&free, &total); - LogPrintfError("Allocation failed : Device memory : required :%zu | free :%zu | total :%zu \n", sizeBytes, free, total); + LogPrintfError("Allocation failed : Device memory : required :%u | free :%u | total :%u \n", sizeBytes, free, total); return hipErrorOutOfMemory; } @@ -202,14 +202,14 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin } } else { amd::HostQueue* pQueue = &queue; - if ((srcMemory->getContext().devices()[0] == dstMemory->getContext().devices()[0]) && - (queueDevice != srcMemory->getContext().devices()[0])) { + if (queueDevice != srcMemory->getContext().devices()[0]) { pQueue = hip::getNullStream(srcMemory->getContext()); amd::Command* cmd = queue.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); } } + command = new amd::CopyMemoryCommand(*pQueue, CL_COMMAND_COPY_BUFFER, waitList, *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes); } @@ -1850,27 +1850,18 @@ hipError_t ihipMemset3D(hipPitchedPtr pitchedDevPtr, hipExtent extent, hipStream_t stream, bool isAsync = false) { - size_t offset = 0; - amd::Memory* memory = getMemoryObject(pitchedDevPtr.ptr, offset); - - auto sizeBytes = extent.width * extent.height * extent.depth; - - if (memory == nullptr) { - return hipErrorInvalidValue; - } - if (sizeBytes > memory->getSize()) { - return hipErrorInvalidValue; - } - if (pitchedDevPtr.pitch == extent.width) { - return ihipMemset(pitchedDevPtr.ptr, value, sizeof(int8_t), static_cast(sizeBytes), stream, isAsync); + return ihipMemset(pitchedDevPtr.ptr, value, sizeof(int8_t), extent.width * extent.height * extent.depth, stream, isAsync); } - // Workaround for cases when pitch > row until fill kernel will be updated to support pitch. - // Fall back to filling one row at a time. + // Workaround for cases when pitch > row untill fill kernel will be updated to support pitch. + // Fallback to filling one row at a time. amd::HostQueue* queue = hip::getQueue(stream); + size_t offset = 0; + amd::Memory* memory = getMemoryObject(pitchedDevPtr.ptr, offset); + amd::Coord3D origin(offset); amd::Coord3D region(pitchedDevPtr.xsize, pitchedDevPtr.ysize, extent.depth); amd::BufferRect rect; @@ -1879,26 +1870,34 @@ hipError_t ihipMemset3D(hipPitchedPtr pitchedDevPtr, return hipErrorInvalidValue; } - std::vector commands; + if (memory != nullptr) { + std::vector commands; - for (size_t slice = 0; slice < extent.depth; slice++) { - for (size_t row = 0; row < extent.height; row++) { - const size_t rowOffset = rect.offset(0, row, slice); - amd::FillMemoryCommand *command = new amd::FillMemoryCommand(*queue, - CL_COMMAND_FILL_BUFFER, amd::Command::EventWaitList { }, - *memory->asBuffer(), &value, sizeof(int8_t), amd::Coord3D { rowOffset, - 0, 0 }, amd::Coord3D { extent.width, 1, 1 }); + for (size_t slice = 0; slice < extent.depth; slice++) { + for (size_t row = 0; row < extent.height; row++) { + const size_t rowOffset = rect.offset(0, row, slice); + amd::FillMemoryCommand* command = new amd::FillMemoryCommand(*queue, + CL_COMMAND_FILL_BUFFER, + amd::Command::EventWaitList{}, + *memory->asBuffer(), + &value, + sizeof(int8_t), + amd::Coord3D{rowOffset, 0, 0}, + amd::Coord3D{extent.width, 1, 1}); - command->enqueue(); - commands.push_back(command); + command->enqueue(); + commands.push_back(command); + } } - } - for (auto &command : commands) { - if (!isAsync) { - command->awaitCompletion(); + for (auto &command: commands) { + if (!isAsync) { + command->awaitCompletion(); + } + command->release(); } - command->release(); + } else { + return hipErrorInvalidValue; } return hipSuccess; @@ -2039,7 +2038,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void memset(attributes, 0, sizeof(hipPointerAttribute_t)); if (memObj != nullptr) { - attributes->memoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & memObj->getMemFlags())? hipMemoryTypeHost : hipMemoryTypeDevice; + attributes->memoryType = (CL_MEM_SVM_FINE_GRAIN_BUFFER & memObj->getMemFlags())? hipMemoryTypeHost : hipMemoryTypeDevice; if (attributes->memoryType == hipMemoryTypeHost) { attributes->hostPointer = static_cast(memObj->getSvmPtr()) + offset; } diff --git a/rocclr/hip_module.cpp b/rocclr/hip_module.cpp index b72ee1a5a2..4a09cc6ed0 100755 --- a/rocclr/hip_module.cpp +++ b/rocclr/hip_module.cpp @@ -537,7 +537,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL if (result != hipSuccess) { break; } - prevGridSize += globalWorkSizeX * globalWorkSizeY * globalWorkSizeZ; + prevGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z; } // Sync the execution streams on all devices diff --git a/rocclr/hip_peer.cpp b/rocclr/hip_peer.cpp index fe22803c33..ded6843957 100755 --- a/rocclr/hip_peer.cpp +++ b/rocclr/hip_peer.cpp @@ -97,10 +97,6 @@ hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype, uint32_t* hopcount) { HIP_INIT_API(hipExtGetLinkTypeAndHopCount, device1, device2, linktype, hopcount); - if (linktype == nullptr || hopcount == nullptr || - device1 == device2 || device1 < 0 || device2 < 0) { - HIP_RETURN(hipErrorInvalidValue); - } // Fill out the list of LinkAttributes std::vector link_attrs; link_attrs.push_back(std::make_pair(amd::Device::LinkAttribute::kLinkLinkType, 0)); diff --git a/rocclr/hip_platform.cpp b/rocclr/hip_platform.cpp index 6abea0df4e..6e6f08bf44 100755 --- a/rocclr/hip_platform.cpp +++ b/rocclr/hip_platform.cpp @@ -80,6 +80,27 @@ extern "C" hip::FatBinaryInfo** __hipRegisterFatBinary(const void* data) return PlatformState::instance().addFatBinary(fbwrapper->binary); } +bool PlatformState::getShadowVarInfo(std::string var_name, hipModule_t hmod, + void** var_addr, size_t* var_size) { + + amd::ScopedLock lock(lock_); + if (hipSuccess == getDynGlobalVar(var_name.c_str(), ihipGetDevice(), hmod, var_addr, var_size)) { + return true; + } + + if (hipSuccess == getStatGlobalVarByName(var_name, ihipGetDevice(), hmod, var_addr, var_size)) { + return true; + } + + return false; +} + +bool CL_CALLBACK getSvarInfo(cl_program program, std::string var_name, void** var_addr, + size_t* var_size) { + return PlatformState::instance().getShadowVarInfo(var_name, reinterpret_cast(program), + var_addr, var_size); +} + extern "C" void __hipRegisterFunction( hip::FatBinaryInfo** modules, const void* hostFunction, @@ -665,19 +686,11 @@ static inline std::uint32_t __convert_float_to_half(float a) noexcept { return s | v; } -extern "C" -#if !defined(_MSC_VER) -__attribute__((weak)) -#endif -float __gnu_h2f_ieee(unsigned short h){ +extern "C" __attribute__((weak)) float __gnu_h2f_ieee(unsigned short h){ return __convert_half_to_float((std::uint32_t) h); } -extern "C" -#if !defined(_MSC_VER) -__attribute__((weak)) -#endif -unsigned short __gnu_f2h_ieee(float f){ +extern "C" __attribute__((weak)) unsigned short __gnu_f2h_ieee(float f){ return (unsigned short)__convert_float_to_half(f); } @@ -752,9 +765,6 @@ hipError_t PlatformState::getDynFunc(hipFunction_t* hfunc, hipModule_t hmod, DevLogPrintfError("Cannot find the module: 0x%x", hmod); return hipErrorNotFound; } - if (0 == strlen(func_name)) { - return hipErrorNotFound; - } return it->second->getDynFunc(hfunc, func_name); } @@ -858,6 +868,11 @@ hipError_t PlatformState::getStatGlobalVar(const void* hostVar, int deviceId, hi return statCO_.getStatGlobalVar(hostVar, deviceId, dev_ptr, size_ptr); } +hipError_t PlatformState::getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod, + hipDeviceptr_t* dev_ptr, size_t* size_ptr) { + return statCO_.getStatGlobalVarByName(hostVar, deviceId, hmod, dev_ptr, size_ptr); +} + void PlatformState::setupArgument(const void *arg, size_t size, size_t offset) { auto& arguments = execStack_.top().arguments_; diff --git a/rocclr/hip_platform.hpp b/rocclr/hip_platform.hpp index 51fea0841e..2bcf620f6d 100755 --- a/rocclr/hip_platform.hpp +++ b/rocclr/hip_platform.hpp @@ -77,6 +77,11 @@ public: hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId); hipError_t getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr, size_t* size_ptr); + hipError_t getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod, + hipDeviceptr_t* dev_ptr, size_t* size_ptr); + + bool getShadowVarInfo(std::string var_name, hipModule_t hmod, + void** var_addr, size_t* var_size); //Exec Functions void setupArgument(const void *arg, size_t size, size_t offset); diff --git a/samples/0_Intro/bit_extract/CMakeLists.txt b/samples/0_Intro/bit_extract/CMakeLists.txt deleted file mode 100644 index c9b13be812..0000000000 --- a/samples/0_Intro/bit_extract/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -project(bit_extract) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) - -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) - -# Create the excutable -add_executable(bit_extract bit_extract.cpp) - -# Link with HIP -target_link_libraries(bit_extract hip::host) \ No newline at end of file diff --git a/samples/0_Intro/bit_extract/Makefile b/samples/0_Intro/bit_extract/Makefile index 3427815ffc..4a3a0bb4fe 100644 --- a/samples/0_Intro/bit_extract/Makefile +++ b/samples/0_Intro/bit_extract/Makefile @@ -9,15 +9,19 @@ HIPCC=$(HIP_PATH)/bin/hipcc # Show how to use PLATFORM to specify different options for each compiler: ifeq (${HIP_PLATFORM}, nvcc) - HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 + HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 endif EXE=bit_extract +EXE_STATIC=bit_extract_static $(EXE): bit_extract.cpp $(HIPCC) $(HIPCC_FLAGS) $< -o $@ -all: $(EXE) +$(EXE_STATIC): bit_extract.cpp + $(HIPCC) -use-staticlib $(HIPCC_FLAGS) $< -o $@ + +all: $(EXE) $(EXE_STATIC) clean: - rm -f *.o $(EXE) + rm -f *.o $(EXE) $(EXE_STATIC) diff --git a/samples/0_Intro/module_api/CMakeLists.txt b/samples/0_Intro/module_api/CMakeLists.txt deleted file mode 100644 index 0f5cc32f91..0000000000 --- a/samples/0_Intro/module_api/CMakeLists.txt +++ /dev/null @@ -1,36 +0,0 @@ -project(module_api) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) - -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) - -# Create the excutable -add_executable(runKernel.hip.out runKernel.cpp) -add_executable(launchKernelHcc.hip.out launchKernelHcc.cpp) -add_executable(defaultDriver.hip.out defaultDriver.cpp) - -# Generate code object -add_custom_target( - codeobj - ALL - COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../vcpy_kernel.cpp -o vcpy_kernel.code - COMMENT "codeobj generated" -) - -add_dependencies(runKernel.hip.out codeobj) -add_dependencies(launchKernelHcc.hip.out codeobj) -add_dependencies(defaultDriver.hip.out codeobj) - -# Link with HIP -target_link_libraries(runKernel.hip.out hip::host) -target_link_libraries(launchKernelHcc.hip.out hip::host) -target_link_libraries(defaultDriver.hip.out hip::host) diff --git a/samples/0_Intro/module_api_global/CMakeLists.txt b/samples/0_Intro/module_api_global/CMakeLists.txt deleted file mode 100644 index 00caa79cfa..0000000000 --- a/samples/0_Intro/module_api_global/CMakeLists.txt +++ /dev/null @@ -1,30 +0,0 @@ -project(modile_api_global) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) - -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) - -# Create the excutable -add_executable(runKernel.hip.out runKernel.cpp) - -# Generate code object -add_custom_target( - codeobj - ALL - COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../vcpy_kernel.cpp -o vcpy_kernel.code - COMMENT "codeobj generated" -) - -add_dependencies(runKernel.hip.out codeobj) - -# Link with HIP -target_link_libraries(runKernel.hip.out hip::host) \ No newline at end of file diff --git a/samples/0_Intro/square/CMakeLists.txt b/samples/0_Intro/square/CMakeLists.txt deleted file mode 100644 index 845c43fd1f..0000000000 --- a/samples/0_Intro/square/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -#Follow "README.md" to generate square.cpp if it's missing - -project(square) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) - -# Create the excutable -add_executable(square square.cpp) - -# Link with HIP -target_link_libraries(square hip::host) \ No newline at end of file diff --git a/samples/0_Intro/square/Makefile b/samples/0_Intro/square/Makefile index 9bb0dd8205..aa046eeaaa 100644 --- a/samples/0_Intro/square/Makefile +++ b/samples/0_Intro/square/Makefile @@ -11,7 +11,7 @@ else SOURCES=square.cpp endif -all: square.out +all: square.out square.out.static # Step square.cpp: square.cu @@ -20,5 +20,8 @@ square.cpp: square.cu square.out: $(SOURCES) $(HIPCC) $(CXXFLAGS) $(SOURCES) -o $@ +square.out.static: $(SOURCES) + $(HIPCC) -use-staticlib $(CXXFLAGS) $(SOURCES) -o $@ + clean: - rm -f *.o *.out square.cpp + rm -f *.o *.out *.out.static square.cpp diff --git a/samples/0_Intro/square/README.md b/samples/0_Intro/square/README.md index 0bbb2f7e39..c185903993 100644 --- a/samples/0_Intro/square/README.md +++ b/samples/0_Intro/square/README.md @@ -1,39 +1,13 @@ # Square.md -Simple test which shows how to use hipify-perl to port CUDA code to HIP. -See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example. +Simple test which shows how to use hipify-perl to port CUDA code to HIP. +See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example. Now it is even simpler and requires no manual modification to the hipified source code - just hipify and compile: -- Add hip/bin path to the PATH +1. Add hip/bin path to the PATH : + export PATH=$PATH:[MYHIP]/bin -``` -$ export PATH=$PATH:[MYHIP]/bin -``` - -- Define environment variable - -``` -$ export HIP_PATH=[MYHIP] -``` - -- Build executible file - -``` -$ cd ~/hip/samples/0_Intro/square -$ make -/home/user/hip/bin/hipify-perl square.cu > square.cpp -/home/user/hip/bin/hipcc square.cpp -o square.out -/home/user/hip/bin/hipcc -use-staticlib square.cpp -o square.out.static -``` -- Execute file -``` -$ ./square.out -info: running on device Navi 14 [Radeon Pro W5500] -info: allocate host mem ( 7.63 MB) -info: allocate device mem ( 7.63 MB) -info: copy Host2Device -info: launch 'vector_square' kernel -info: copy Device2Host -info: check result -PASSED! -``` +2. $ make + Make runs these steps. This can be performed on either CUDA or AMD platform: + hipify-perl square.cu > square.cpp # convert cuda code to hip code + hipcc square.cpp # compile into executable diff --git a/samples/1_Utils/hipBusBandwidth/CMakeLists.txt b/samples/1_Utils/hipBusBandwidth/CMakeLists.txt deleted file mode 100644 index df01c31d97..0000000000 --- a/samples/1_Utils/hipBusBandwidth/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -project(hipBusBandwidth) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_BUILD_TYPE Release) - -# Create the excutable -add_executable(hipBusBandwidth hipBusBandwidth.cpp ResultDatabase.cpp) - -# Link with HIP -target_link_libraries(hipBusBandwidth hip::host) \ No newline at end of file diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index 8032bd0a20..6181c49afe 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -12,7 +12,7 @@ enum MallocMode { MallocPinned, MallocUnpinned, MallocRegistered }; bool p_verbose = false; MallocMode p_malloc_mode = MallocPinned; int p_numa_ctl = -1; -int p_iterations = 0; +int p_iterations = 10; int p_beatsperiteration = 1; int p_device = 0; int p_detailed = 0; @@ -89,9 +89,7 @@ hipError_t memcopy(void* dst, const void* src, size_t sizeBytes, enum hipMemcpyK int sizes[] = {-64, -256, -512, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288}; int nSizes = sizeof(sizes) / sizeof(int); -// iterations to be run for the corresponding sizes, less number as the size increases -int iterations[] = {1000, 1000, 1000, 1000, 500, 500, 500, 500, 500, 200, 200, 200, - 200, 200, 100, 100, 100, 100, 50, 50, 50, 20, 20}; + // **************************************************************************** // Function: RunBenchmark_H2D @@ -176,48 +174,53 @@ void RunBenchmark_H2D(ResultDatabase& resultDB) { hipEventCreate(&stop); CHECK_HIP_ERROR(); - // store the times temporarily to estimate latency - // float times[nSizes]; - for (int i = 0; i < nSizes; i++) { - int sizeIndex, iterIndex; - sizeIndex = i; - iterIndex = i; + // Three passes, forward and backward both + for (int pass = 0; pass < p_iterations; pass++) { + // store the times temporarily to estimate latency + // float times[nSizes]; + // Step through sizes forward on even passes and backward on odd + for (int i = 0; i < nSizes; i++) { + int sizeIndex; + if ((pass % 2) == 0) + sizeIndex = i; + else + sizeIndex = (nSizes - 1) - i; - const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; - const int nbytes = sizeToBytes(thisSize); - const int niter = p_iterations ? p_iterations : iterations[iterIndex]; - for (int pass = 0; pass < niter; pass++) { + const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; + const int nbytes = sizeToBytes(thisSize); - hipEventRecord(start, 0); - for (int j = 0; j < p_beatsperiteration; j++) { - memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice); + hipEventRecord(start, 0); + for (int j = 0; j < p_beatsperiteration; j++) { + memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice); + } + hipEventRecord(stop, 0); + hipEventSynchronize(stop); + float t = 0; + hipEventElapsedTime(&t, start, stop); + // times[sizeIndex] = t; + + // Convert to GB/sec + if (p_verbose) { + std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; + } + + double speed = + (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t; + char sizeStr[256]; + if (p_beatsperiteration > 1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } + resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode), + sizeStr, "GB/sec", speed); + resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr, + "ms", t); + + if (p_onesize) { + break; + } } - hipEventRecord(stop, 0); - hipEventSynchronize(stop); - float t = 0; - hipEventElapsedTime(&t, start, stop); - // times[sizeIndex] = t; - // Convert to GB/sec - if (p_verbose) { - std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; - } - - double speed = - (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t; - char sizeStr[256]; - if (p_beatsperiteration > 1) { - sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); - } else { - sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); - } - resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode), - sizeStr, "GB/sec", speed); - resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr, "ms", t); - - } - if (p_onesize) { - break; - } } if (p_onesize) { @@ -344,50 +347,53 @@ void RunBenchmark_D2H(ResultDatabase& resultDB) { hipEventCreate(&stop); CHECK_HIP_ERROR(); - // store the times temporarily to estimate latency - // float times[nSizes]; - for (int i = 0; i < nSizes; i++) { - int sizeIndex, iterIndex; - sizeIndex = i; - iterIndex = i; + // Three passes, forward and backward both + for (int pass = 0; pass < p_iterations; pass++) { + // store the times temporarily to estimate latency + // float times[nSizes]; + // Step through sizes forward on even passes and backward on odd + for (int i = 0; i < nSizes; i++) { + int sizeIndex; + if ((pass % 2) == 0) + sizeIndex = i; + else + sizeIndex = (nSizes - 1) - i; - const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; - const int nbytes = sizeToBytes(thisSize); - const int niter = p_iterations ? p_iterations : iterations[iterIndex]; - for (int pass = 0; pass < niter; pass++) { + const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; + const int nbytes = sizeToBytes(thisSize); - hipEventRecord(start, 0); - for (int j = 0; j < p_beatsperiteration; j++) { - memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost); + hipEventRecord(start, 0); + for (int j = 0; j < p_beatsperiteration; j++) { + memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost); + } + hipEventRecord(stop, 0); + hipEventSynchronize(stop); + float t = 0; + hipEventElapsedTime(&t, start, stop); + // times[sizeIndex] = t; + + // Convert to GB/sec + if (p_verbose) { + std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; + } + + double speed = + (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t; + char sizeStr[256]; + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + if (p_beatsperiteration > 1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } + resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode), + sizeStr, "GB/sec", speed); + resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode), + sizeStr, "ms", t); + if (p_onesize) { + break; + } } - hipEventRecord(stop, 0); - hipEventSynchronize(stop); - float t = 0; - hipEventElapsedTime(&t, start, stop); - // times[sizeIndex] = t; - // Convert to GB/sec - if (p_verbose) { - std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; - } - - double speed = - (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t; - char sizeStr[256]; - sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); - if (p_beatsperiteration > 1) { - sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); - } else { - sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); - } - resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode), - sizeStr, "GB/sec", speed); - resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode), - sizeStr, "ms", t); - - } - if (p_onesize) { - break; - } } if (p_onesize) { @@ -516,43 +522,43 @@ void RunBenchmark_Bidir(ResultDatabase& resultDB) { hipStreamCreate(&stream[0]); hipStreamCreate(&stream[1]); - // store the times temporarily to estimate latency - // float times[nSizes]; - for (int i = 0; i < nSizes; i++) { - int sizeIndex, iterIndex; - sizeIndex = i; - iterIndex = i; + // Three passes, forward and backward both + for (int pass = 0; pass < p_iterations; pass++) { + // store the times temporarily to estimate latency + // float times[nSizes]; + // Step through sizes forward on even passes and backward on odd + for (int i = 0; i < nSizes; i++) { + int sizeIndex; + if ((pass % 2) == 0) + sizeIndex = i; + else + sizeIndex = (nSizes - 1) - i; - const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; - const int nbytes = sizeToBytes(thisSize); - const int niter = p_iterations ? p_iterations : iterations[iterIndex]; - for (int pass = 0; pass < niter; pass++) { + const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; + const int nbytes = sizeToBytes(thisSize); - hipEventRecord(start, 0); - hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]); - hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]); - hipEventRecord(stop, 0); - hipEventSynchronize(stop); - float t = 0; - hipEventElapsedTime(&t, start, stop); + hipEventRecord(start, 0); + hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]); + hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]); + hipEventRecord(stop, 0); + hipEventSynchronize(stop); + float t = 0; + hipEventElapsedTime(&t, start, stop); - // Convert to GB/sec - if (p_verbose) { - std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; + // Convert to GB/sec + if (p_verbose) { + std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; + } + + double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t; + char sizeStr[256]; + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + resultDB.AddResult( + std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr, + "GB/sec", speed); + resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode), + sizeStr, "ms", t); } - - double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t; - char sizeStr[256]; - sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); - resultDB.AddResult( - std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr, - "GB/sec", speed); - resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode), - sizeStr, "ms", t); - } - if (p_onesize) { - break; - } } // Cleanup @@ -702,63 +708,66 @@ void RunBenchmark_P2P_Unidir(ResultDatabase& resultDB) { hipEventCreate(&stop); CHECK_HIP_ERROR(); - // store the times temporarily to estimate latency - // float times[nSizes]; - for (int i = 0; i < nSizes; i++) { - int sizeIndex, iterIndex; - sizeIndex = i; - iterIndex = i; + // Three passes, forward and backward both + for (int pass = 0; pass < p_iterations; pass++) { + // store the times temporarily to estimate latency + // float times[nSizes]; + // Step through sizes forward on even passes and backward on odd + for (int i = 0; i < nSizes; i++) { + int sizeIndex; + if ((pass % 2) == 0) + sizeIndex = i; + else + sizeIndex = (nSizes - 1) - i; - const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; - const int nbytes = sizeToBytes(thisSize); - const int niter = p_iterations ? p_iterations : iterations[iterIndex]; - for (int pass = 0; pass < niter; pass++) { + const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; + const int nbytes = sizeToBytes(thisSize); - hipDeviceSynchronize(); + hipDeviceSynchronize(); - hipEventRecord(start, 0); + hipEventRecord(start, 0); - for (int j = 0; j < p_beatsperiteration; j++) { - hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice); - } + for (int j = 0; j < p_beatsperiteration; j++) { + hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice); + } - hipEventRecord(stop, 0); + hipEventRecord(stop, 0); - hipEventSynchronize(stop); + hipEventSynchronize(stop); - float t = 0; - hipEventElapsedTime(&t, start, stop); - // times[sizeIndex] = t; + float t = 0; + hipEventElapsedTime(&t, start, stop); + // times[sizeIndex] = t; - // Convert to GB/sec - if (p_verbose) { - std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; - } + // Convert to GB/sec + if (p_verbose) { + std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; + } - double speed = - (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t; - char sizeStr[256]; - if (p_beatsperiteration > 1) { - sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), - p_beatsperiteration); - } else { - sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); - } + double speed = + (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t; + char sizeStr[256]; + if (p_beatsperiteration > 1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), + p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } - string cGpu, pGpu; - cGpu = gpuIDToString(currentGpu); - pGpu = gpuIDToString(peerGpu); + string cGpu, pGpu; + cGpu = gpuIDToString(currentGpu); + pGpu = gpuIDToString(peerGpu); - resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) + - "_gpu" + std::string(pGpu), + resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) + + "_gpu" + std::string(pGpu), sizeStr, "GB/sec", speed); - resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) + - "_gpu" + std::string(pGpu), + resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) + + "_gpu" + std::string(pGpu), sizeStr, "ms", t); - } - if (p_onesize) { - break; + if (p_onesize) { + break; + } } } @@ -820,68 +829,71 @@ void RunBenchmark_P2P_Bidir(ResultDatabase& resultDB) { hipStreamCreate(&stream[0]); hipStreamCreate(&stream[1]); - // store the times temporarily to estimate latency - // float times[nSizes]; - for (int i = 0; i < nSizes; i++) { - int sizeIndex, iterIndex; - sizeIndex = i; - iterIndex = i; + // Three passes, forward and backward both + for (int pass = 0; pass < p_iterations; pass++) { + // store the times temporarily to estimate latency + // float times[nSizes]; + // Step through sizes forward on even passes and backward on odd + for (int i = 0; i < nSizes; i++) { + int sizeIndex; + if ((pass % 2) == 0) + sizeIndex = i; + else + sizeIndex = (nSizes - 1) - i; - const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; - const int nbytes = sizeToBytes(thisSize); - const int niter = p_iterations ? p_iterations : iterations[iterIndex]; - for (int pass = 0; pass < niter; pass++) { + const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; + const int nbytes = sizeToBytes(thisSize); - hipDeviceSynchronize(); + hipDeviceSynchronize(); - hipEventRecord(start, 0); + hipEventRecord(start, 0); - for (int j = 0; j < p_beatsperiteration; j++) { - hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes, - hipMemcpyDeviceToDevice, stream[0]); - hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes, - hipMemcpyDeviceToDevice, stream[1]); + for (int j = 0; j < p_beatsperiteration; j++) { + hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes, + hipMemcpyDeviceToDevice, stream[0]); + hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes, + hipMemcpyDeviceToDevice, stream[1]); + } + + hipEventRecord(stop, 0); + + hipEventSynchronize(stop); + + float t = 0; + hipEventElapsedTime(&t, start, stop); + // times[sizeIndex] = t; + + // Convert to GB/sec + if (p_verbose) { + std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; + } + + double speed = + (double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) / + t; + char sizeStr[256]; + if (p_beatsperiteration > 1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), + p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } + + string cGpu, pGpu; + cGpu = gpuIDToString(currentGpu); + pGpu = gpuIDToString(peerGpu); + + resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" + + std::string(pGpu), + sizeStr, "GB/sec", speed); + resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" + + std::string(pGpu), + sizeStr, "ms", t); + + if (p_onesize) { + break; + } } - - hipEventRecord(stop, 0); - - hipEventSynchronize(stop); - - float t = 0; - hipEventElapsedTime(&t, start, stop); - // times[sizeIndex] = t; - - // Convert to GB/sec - if (p_verbose) { - std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; - } - - double speed = - (double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) / - t; - char sizeStr[256]; - if (p_beatsperiteration > 1) { - sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), - p_beatsperiteration); - } else { - sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); - } - - string cGpu, pGpu; - cGpu = gpuIDToString(currentGpu); - pGpu = gpuIDToString(peerGpu); - - resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" + - std::string(pGpu), - sizeStr, "GB/sec", speed); - resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" + - std::string(pGpu), - sizeStr, "ms", t); - - } - if (p_onesize) { - break; - } } if (p_onesize) { diff --git a/samples/1_Utils/hipCommander/CMakeLists.txt b/samples/1_Utils/hipCommander/CMakeLists.txt deleted file mode 100644 index 2592020c66..0000000000 --- a/samples/1_Utils/hipCommander/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -project(hipCommander) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_BUILD_TYPE Release) - -# Create the excutable -add_executable(hipCommander hipCommander.cpp) - -# Generate code object -add_custom_target( - codeobj - ALL - COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../nullkernel.hip.cpp -o nullkernel.hsaco - COMMENT "codeobj generated" -) - -add_dependencies(hipCommander codeobj) - -# Link with HIP -target_link_libraries(hipCommander hip::host) -set_property(TARGET hipCommander PROPERTY CXX_STANDARD 11) diff --git a/samples/1_Utils/hipDispatchLatency/CMakeLists.txt b/samples/1_Utils/hipDispatchLatency/CMakeLists.txt deleted file mode 100644 index b267f91905..0000000000 --- a/samples/1_Utils/hipDispatchLatency/CMakeLists.txt +++ /dev/null @@ -1,35 +0,0 @@ -project(hipDispatchLatency) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_BUILD_TYPE Release) - -# Create the excutable -add_executable(hipDispatchLatency hipDispatchLatency.cpp) -add_executable(hipDispatchEnqueueRateMT hipDispatchEnqueueRateMT.cpp) - -# Generate code object -add_custom_target( - codeobj - ALL - COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../test_kernel.cpp -o test_kernel.code - COMMENT "codeobj generated" -) - -add_dependencies(hipDispatchLatency codeobj) -add_dependencies(hipDispatchEnqueueRateMT codeobj) - -# Link with HIP -target_link_libraries(hipDispatchLatency hip::host) -target_link_libraries(hipDispatchEnqueueRateMT hip::host) -set_property(TARGET hipDispatchLatency PROPERTY CXX_STANDARD 11) -set_property(TARGET hipDispatchEnqueueRateMT PROPERTY CXX_STANDARD 11) diff --git a/samples/1_Utils/hipInfo/CMakeLists.txt b/samples/1_Utils/hipInfo/CMakeLists.txt deleted file mode 100644 index f3678d3160..0000000000 --- a/samples/1_Utils/hipInfo/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -project(hipInfo) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_BUILD_TYPE Release) - -# Create the excutable -add_executable(hipInfo hipInfo.cpp) - -# Link with HIP -target_link_libraries(hipInfo hip::host) diff --git a/samples/2_Cookbook/0_MatrixTranspose/CMakeLists.txt b/samples/2_Cookbook/0_MatrixTranspose/CMakeLists.txt deleted file mode 100644 index de5bb0b5ea..0000000000 --- a/samples/2_Cookbook/0_MatrixTranspose/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -project(MatrixTranspose) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_BUILD_TYPE Release) - -# Create the excutable -add_executable(MatrixTranspose MatrixTranspose.cpp) - -# Link with HIP -target_link_libraries(MatrixTranspose hip::host) diff --git a/samples/2_Cookbook/10_inline_asm/CMakeLists.txt b/samples/2_Cookbook/10_inline_asm/CMakeLists.txt deleted file mode 100644 index 7adb51f5de..0000000000 --- a/samples/2_Cookbook/10_inline_asm/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -project(inline_asm) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_BUILD_TYPE Release) - -# Create the excutable -add_executable(inline_asm inline_asm.cpp) - -# Link with HIP -target_link_libraries(inline_asm hip::host) diff --git a/samples/2_Cookbook/11_texture_driver/CMakeLists.txt b/samples/2_Cookbook/11_texture_driver/CMakeLists.txt deleted file mode 100644 index 8ff242c993..0000000000 --- a/samples/2_Cookbook/11_texture_driver/CMakeLists.txt +++ /dev/null @@ -1,30 +0,0 @@ -project(texture2dDrv) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_BUILD_TYPE Release) - -# Create the excutable -add_executable(texture2dDrv texture2dDrv.cpp) - -# Generate code object -add_custom_target( - codeobj - ALL - COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../tex2dKernel.cpp -o tex2dKernel.code - COMMENT "codeobj generated" -) - -add_dependencies(texture2dDrv codeobj) - -# Link with HIP -target_link_libraries(texture2dDrv hip::host) diff --git a/samples/2_Cookbook/13_occupancy/CMakeLists.txt b/samples/2_Cookbook/13_occupancy/CMakeLists.txt deleted file mode 100644 index 6cad76a395..0000000000 --- a/samples/2_Cookbook/13_occupancy/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -project(occupancy) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_BUILD_TYPE Release) - -# Create the excutable -add_executable(occupancy occupancy.cpp) - -# Link with HIP -target_link_libraries(occupancy hip::host) diff --git a/samples/2_Cookbook/1_hipEvent/CMakeLists.txt b/samples/2_Cookbook/1_hipEvent/CMakeLists.txt deleted file mode 100644 index 6f6ee4e050..0000000000 --- a/samples/2_Cookbook/1_hipEvent/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -project(hipEvent) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_BUILD_TYPE Release) - -# Create the excutable -add_executable(hipEvent hipEvent.cpp) - -# Link with HIP -target_link_libraries(hipEvent hip::host) diff --git a/samples/2_Cookbook/3_shared_memory/CMakeLists.txt b/samples/2_Cookbook/3_shared_memory/CMakeLists.txt deleted file mode 100644 index 6401488628..0000000000 --- a/samples/2_Cookbook/3_shared_memory/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -project(sharedMemory) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_BUILD_TYPE Release) - -# Create the excutable -add_executable(sharedMemory sharedMemory.cpp) - -# Link with HIP -target_link_libraries(sharedMemory hip::host) diff --git a/samples/2_Cookbook/4_shfl/CMakeLists.txt b/samples/2_Cookbook/4_shfl/CMakeLists.txt deleted file mode 100644 index 9d142eeb02..0000000000 --- a/samples/2_Cookbook/4_shfl/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -project(shfl) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_BUILD_TYPE Release) - -# Create the excutable -add_executable(shfl shfl.cpp) - -# Link with HIP -target_link_libraries(shfl hip::host) diff --git a/samples/2_Cookbook/5_2dshfl/CMakeLists.txt b/samples/2_Cookbook/5_2dshfl/CMakeLists.txt deleted file mode 100644 index adc0e3595d..0000000000 --- a/samples/2_Cookbook/5_2dshfl/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -project(2dshfl) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) - -# Create the excutable -add_executable(2dshfl 2dshfl.cpp) - -# Link with HIP -target_link_libraries(2dshfl hip::host) diff --git a/samples/2_Cookbook/6_dynamic_shared/CMakeLists.txt b/samples/2_Cookbook/6_dynamic_shared/CMakeLists.txt deleted file mode 100644 index f177952d5a..0000000000 --- a/samples/2_Cookbook/6_dynamic_shared/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -project(dynamic_shared) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) - -# Create the excutable -add_executable(dynamic_shared dynamic_shared.cpp) - -# Link with HIP -target_link_libraries(dynamic_shared hip::host) diff --git a/samples/2_Cookbook/7_streams/CMakeLists.txt b/samples/2_Cookbook/7_streams/CMakeLists.txt deleted file mode 100644 index fac4187b47..0000000000 --- a/samples/2_Cookbook/7_streams/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -project(stream) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) - -# Create the excutable -add_executable(stream stream.cpp) - -# Link with HIP -target_link_libraries(stream hip::host) diff --git a/samples/2_Cookbook/8_peer2peer/CMakeLists.txt b/samples/2_Cookbook/8_peer2peer/CMakeLists.txt deleted file mode 100644 index 7c38373911..0000000000 --- a/samples/2_Cookbook/8_peer2peer/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -project(peer2peer) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) - -# Create the excutable -add_executable(peer2peer peer2peer.cpp) - -# Link with HIP -target_link_libraries(peer2peer hip::host) diff --git a/samples/2_Cookbook/9_unroll/CMakeLists.txt b/samples/2_Cookbook/9_unroll/CMakeLists.txt deleted file mode 100644 index fc1b740e33..0000000000 --- a/samples/2_Cookbook/9_unroll/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -project(unroll) - -cmake_minimum_required(VERSION 3.10) - -# Search for rocm in common locations -list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - -# Find hip -find_package(hip) - -# Set compiler and linker -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) - -# Create the excutable -add_executable(unroll unroll.cpp) - -# Link with HIP -target_link_libraries(unroll hip::host) diff --git a/samples/README.md b/samples/README.md deleted file mode 100644 index 739045382e..0000000000 --- a/samples/README.md +++ /dev/null @@ -1,27 +0,0 @@ -Build procedure - -We provide Makefile and CMakeLists.txt to build the samples seperately. - -1.Makefile supports shared lib of hip-rocclr runtime and nvcc. - -To build a sample, just type in sample folder, - -make - - - -2.CMakeLists.txt can support shared and static libs of hip-rocclr runtime. - -To build a sample, type in sample folder, - -mkdir build (if build folder is missing) - -cd build - -cmake .. - -make - -If you want debug version, follow, - -cmake -DCMAKE_BUILD_TYPE=Debug .. \ No newline at end of file diff --git a/tests/hit/HIT.cmake b/tests/hit/HIT.cmake old mode 100755 new mode 100644 index 839b90befb..1677d93a20 --- a/tests/hit/HIT.cmake +++ b/tests/hit/HIT.cmake @@ -303,7 +303,6 @@ macro(MAKE_TEST _config exe) add_test(NAME ${testname} CONFIGURATIONS ${_config} COMMAND ${PROJECT_BINARY_DIR}/${exe} ${ARGN}) endif() set_tests_properties(${testname} PROPERTIES PASS_REGULAR_EXPRESSION "PASSED" ENVIRONMENT HIP_PATH=${HIP_ROOT_DIR}) - set_tests_properties(${testname} PROPERTIES SKIP_RETURN_CODE 127 ENVIRONMENT HIP_PATH=${HIP_ROOT_DIR}) endmacro() macro(MAKE_NAMED_TEST _config exe testname) diff --git a/tests/performance/compute/hipPerfMandelbrot.cpp b/tests/performance/compute/hipPerfMandelbrot.cpp deleted file mode 100644 index c4234d8c37..0000000000 --- a/tests/performance/compute/hipPerfMandelbrot.cpp +++ /dev/null @@ -1,747 +0,0 @@ -/* - Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved. - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. - */ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc - * TEST: %t - * HIT_END - */ - -#include -#include -#include "test_common.h" -#include -#include -#include -#include -#include - -typedef struct { - double x; - double y; - double width; -} coordRec; - -coordRec coords[] = { - {0.0, 0.0, 4.0}, // Whole set - {0.0, 0.0, 0.00001}, // All black - {-0.0180789661868, 0.6424294066162, 0.00003824140}, // Hit detail -}; - -static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); - -template -__global__ void float_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep, - uint maxIter) { - -#pragma FP_CONTRACT ON - int tid = (blockIdx.x * blockDim.x + threadIdx.x); - int i = tid % width; - int j = tid / width; - float x0 = (float)(xPos + xStep*i); - float y0 = (float)(yPos + yStep*j); - - float x = x0; - float y = y0; - - uint iter = 0; - float tmp; - for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { - tmp = x; - x = fma(-y,y,fma(x,x,x0)); - y = fma(2.0f*tmp,y,y0); - } - - out[tid] = iter; -}; - -template -__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos, - T yPos, T xStep, T yStep, uint maxIter) { - -#pragma FP_CONTRACT ON - int tid = (blockIdx.x * blockDim.x + threadIdx.x); - int i = tid % width; - int j = tid / width; - float x0 = (float)(xPos + xStep*(float)i); - float y0 = (float)(yPos + yStep*(float)j); - - float x = x0; - float y = y0; - -#define FAST - uint iter = 0; - float tmp; - int stay; - int ccount = 0; - stay = (x*x+y*y) <= 4.0; - float savx = x; - float savy = y; -#ifdef FAST - for (iter = 0; (iter < maxIter); iter+=16) { -#else - for (iter = 0; stay && (iter < maxIter); iter+=16) { -#endif - x = savx; - y = savy; - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - stay = (x*x+y*y) <= 4.0; - savx = (stay ? x : savx); - savy = (stay ? y : savy); - ccount += stay*16; -#ifdef FAST - if (!stay) - break; -#endif - } - // Handle remainder - if (!stay) { - iter = 16; - do { - x = savx; - y = savy; - stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter); - tmp = x; - x = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*tmp,y,y0); - ccount += stay; - iter--; - savx = (stay ? x : savx); - savy = (stay ? y : savy); - } while (stay && iter); - } - - - out[tid] = (uint)ccount; - -}; - - -template -__global__ void double_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep, - uint maxIter) { - -#pragma FP_CONTRACT ON - int tid = (blockIdx.x * blockDim.x + threadIdx.x); - int i = tid % width; - int j = tid / width; - double x0 = (double)(xPos + xStep*i); - double y0 = (double)(yPos + yStep*j); - - double x = x0; - double y = y0; - - uint iter = 0; - double tmp; - for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { - tmp = x; - x = fma(-y,y,fma(x,x,x0)); - y = fma(2.0f*tmp,y,y0); - } - out[tid] = iter; -}; - - -template -__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos, - T yPos, T xStep, T yStep, uint maxIter) { - -#pragma FP_CONTRACT ON - int tid = (blockIdx.x * blockDim.x + threadIdx.x); - - int i = tid % width; - int j = tid / width; - double x0 = (double)(xPos + xStep*(double)i); - double y0 = (double)(yPos + yStep*(double)j); - - double x = x0; - double y = y0; - -#define FAST - uint iter = 0; - double tmp; - int stay; - int ccount = 0; - stay = (x*x+y*y) <= 4.0; - double savx = x; - double savy = y; -#ifdef FAST - for (iter = 0; (iter < maxIter); iter+=16) -#else - for (iter = 0; stay && (iter < maxIter); iter+=16) -#endif - { - x = savx; - y = savy; - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - stay = (x*x+y*y) <= 4.0; - savx = (stay ? x : savx); - savy = (stay ? y : savy); - ccount += stay*16; -#ifdef FAST - if (!stay) - break; -#endif - } - // Handle remainder - if (!stay) { - iter = 16; - do { - x = savx; - y = savy; - stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter); - tmp = x; - x = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*tmp,y,y0); - ccount += stay; - iter--; - savx = (stay ? x : savx); - savy = (stay ? y : savy); - } - while (stay && iter); - - } - out[tid] = (uint)ccount; -}; - -static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15; - -// Expected results for each kernel run at each coord -unsigned long long expectedIters[] = { - 203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull, - 120254651ull, 203277748ull, 2147483648ull, 120254651ull, 203315114ull, - 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull, - 203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull, - 120485704ull, 203280620ull, 2147483648ull, 120485704ull, 203315114ull, - 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull}; - -class hipPerfMandelBrot { - public: - hipPerfMandelBrot(); - ~hipPerfMandelBrot(); - - void setNumKernels(unsigned int num) { - numKernels = num; - } - - unsigned int getNumKernels() { - return numKernels; - } - - void setNumStreams(unsigned int num) { - numStreams = num; - } - unsigned int getNumStreams() { - return numStreams; - } - - void open(int deviceID); - void run(unsigned int testCase, unsigned int deviceId); - void printResults(void); - - // array of funtion pointers - typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t* streams, int blocks, - int threads_per_block, int kernelCnt); - - // Wrappers - void float_mad(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t* streams, - int blocks, int threads_per_block, int kernelCnt); - - void float_mandel_unroll(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t* streams, - int blocks, int threads_per_block, int kernelCnt); - - void double_mad(uint *out, uint width, float xPos, float yPos, float xStep, - float yStep, uint maxIter, hipStream_t* streams, int blocks, - int threads_per_block, int kernelCnt); - - void double_mandel_unroll(uint *out, uint width, float xPos, float yPos, float xStep, - float yStep, uint maxIter, hipStream_t* streams, int blocks, - int threads_per_block, int kernelCnt); - - hipStream_t streams[2]; - - private: - void setData(void *ptr, unsigned int value); - void checkData(uint *ptr); - - unsigned int numKernels; - unsigned int numStreams; - - std::map> results; - unsigned int width_; - unsigned int bufSize; - unsigned int maxIter; - unsigned int coordIdx; - volatile unsigned long long totalIters = 0; - int numCUs; - static const unsigned int numLoops = 10; -}; - - -hipPerfMandelBrot::hipPerfMandelBrot() {} - -hipPerfMandelBrot::~hipPerfMandelBrot() {} - -void hipPerfMandelBrot::open(int deviceId) { - - - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - if (nGpu < 1) { - std::cout << "info: didn't find any GPU! skipping the test!\n"; - passed(); - return; - } - - - HIPCHECK(hipSetDevice(deviceId)); - hipDeviceProp_t props = {0}; - HIPCHECK(hipGetDeviceProperties(&props, deviceId)); - std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId - << std::endl; - - numCUs = props.multiProcessorCount; -} - - -void hipPerfMandelBrot::printResults() { - - int numkernels = getNumKernels(); - int numStreams = getNumStreams(); - - std::cout << "\n" <<"Measured perf for kernels in GFLOPS on " - << numStreams << " streams (s)" << std::endl; - - std::map>:: iterator itr; - for (itr = results.begin(); itr != results.end(); itr++) { - std::cout << "\n" << std::setw(20) << itr->first << " "; - for(auto i : results[itr->first]) { - std::cout << std::setw(10) << i << " "; - } - } - results.clear(); - - std::cout << std::endl; -} - - -// Wrappers for the kernel launches -void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos, float yPos, float xStep, - float yStep, uint maxIter, hipStream_t* streams, - int blocks, int threads_per_block, int kernelCnt) { - - int streamCnt = getNumStreams(); - hipLaunchKernelGGL(float_mad_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, - maxIter); - - -} - - -void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t * streams, - int blocks, int threads_per_block, int kernelCnt) { - - int streamCnt = getNumStreams(); - hipLaunchKernelGGL(float_mandel_unroll_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter); - -} - - -void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t * streams, - int blocks, int threads_per_block, int kernelCnt) { - - int streamCnt = getNumStreams(); - hipLaunchKernelGGL(double_mad_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter); - -} - - -void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t * streams, - int blocks, int threads_per_block, int kernelCnt) { - - int streamCnt = getNumStreams(); - hipLaunchKernelGGL(float_mandel_unroll_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter); - -} - - -void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) { - - unsigned int numStreams = getNumStreams(); - - funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll, - &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll}; - - // Maximum iteration count - maxIter = 32768; - - uint * hPtr[numKernels]; - uint * dPtr[numKernels]; - - // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once. - width_ = 256; - - bufSize = width_ * width_ * sizeof(uint); - - // Create streams for concurrency - for (uint i = 0; i < numStreams; i++) { - HIPCHECK(hipStreamCreate(&streams[i])); - } - - - // Allocate memory on the host and device - for (uint i = 0; i < numKernels; i++) { - HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault)); - setData(hPtr[i], 0xdeadbeef); - HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize)) - } - - - // Prepare kernel launch parameters - int threads = (bufSize/sizeof(uint)); - int threads_per_block = 64; - int blocks = (threads/threads_per_block) + (threads % threads_per_block); - - float xStep = (float)(coords[coordIdx].width / (double)width_); - float yStep = (float)(-coords[coordIdx].width / (double)width_); - float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); - float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); - - // Copy memory asynchronously and concurrently from host to device - for (uint i = 0; i < numKernels; i++) { - HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice)); - } - - // Synchronize to make sure all the copies are completed - HIPCHECK(hipStreamSynchronize(0)); - - int kernelIdx; - if(testCase == 0 || testCase == 5 || testCase == 10) { - kernelIdx = 0; - } - - else if(testCase == 1 || testCase == 6 || testCase == 11) { - kernelIdx = 1; - } - else if(testCase == 2 || testCase == 7 || testCase == 12) { - kernelIdx = 2; - } - else if(testCase == 3 || testCase == 8 || testCase == 13){ - kernelIdx = 3; - } - - - double totalTime = 0.0; - - for (unsigned int k = 0; k < numLoops; k++) { - - coordIdx = testCase % numCoords; - - if ((testCase == 0 || testCase == 1 || testCase == 2 || - testCase == 5 || testCase == 6 || testCase == 7 || - testCase == 10 || testCase == 11 || testCase == 12)) { - float xStep = (float)(coords[coordIdx].width / (double)width_); - float yStep = (float)(-coords[coordIdx].width / (double)width_); - float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); - float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); - - // Time the kernel execution - auto all_start = std::chrono::steady_clock::now(); - - for (uint i = 0; i < numKernels; i++) { - (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks, - threads_per_block, i); - } - - - // Synchronize all the concurrent streams to have completed execution - HIPCHECK(hipStreamSynchronize(0)); - - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - totalTime += all_kernel_time.count(); - - } - - - else { - double xStep = coords[coordIdx].width / (double)width_; - double yStep = -coords[coordIdx].width / (double)width_; - double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width; - double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width; - - // Time the kernel execution - auto all_start = std::chrono::steady_clock::now(); - - for (uint i = 0; i < numKernels; i++) { - (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks, - threads_per_block, i); - } - - - // Synchronize all the concurrent streams to have completed execution - HIPCHECK(hipStreamSynchronize(0)); - - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - totalTime += all_kernel_time.count(); - } - - - } - - // Copy data back from device to the host - for(uint i = 0; i < numKernels; i++) { - HIPCHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost)); - } - - - for(uint i = 0; i < numKernels; i++) { - checkData(hPtr[i]); - - int j =0; - while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) { - j++; - } - - if(j==30) { - std::cout << "Incorrect iteration count detected. "; - } - - } - - - // Compute GFLOPS. There are 7 FLOPs per iteration - double perf = ((double)(totalIters*numKernels) * 7 * (double)(1e-09)) / - (totalTime / (double)numLoops); - - - std::vector kernelName = {"float", "float_unroll", - "double", "double_unroll"}; - - // Print results except for Warm-up kernel - if(testCase!=100) { - results[kernelName[testCase % 4]].push_back(perf); - } - - - for(uint i = 0 ; i < numStreams; i++) { - HIPCHECK(hipStreamDestroy(streams[i])); - } - - - // Free host and device memory - for (uint i = 0; i < numKernels; i++) { - HIPCHECK(hipFree(hPtr[i])); - HIPCHECK(hipFree(dPtr[i])); - } - - -} - - -void hipPerfMandelBrot::setData(void *ptr, unsigned int value) { - unsigned int *ptr2 = (unsigned int *)ptr; - for (unsigned int i = 0; i < width_ * width_; i++) { - ptr2[i] = value; - } -} - - -void hipPerfMandelBrot::checkData(uint *ptr) { - totalIters = 0; - for (unsigned int i = 0; i < width_ * width_; i++) { - totalIters += ptr[i]; - } -} - - -int main(int argc, char* argv[]) { - hipPerfMandelBrot mandelbrotCompute; - int deviceId = 0; - - mandelbrotCompute.open(deviceId); - - for (unsigned int testCase = 0; testCase < 3; testCase++) { - - - switch (testCase) { - - - case 0: { - // Warmup-kernel - default stream executes serially - mandelbrotCompute.setNumStreams(1); - mandelbrotCompute.setNumKernels(1); - mandelbrotCompute.run(100/*Random number*/, deviceId); - break; - } - - - case 1: { - // run all - sync - int i = 0; - do { - mandelbrotCompute.setNumStreams(1); - mandelbrotCompute.setNumKernels(1); - mandelbrotCompute.run(i, deviceId); - i++; - }while(i < 12); - mandelbrotCompute.printResults(); - - break; - } - - - case 2: { - // run all - async - int i = 0; - do { - mandelbrotCompute.setNumStreams(2); - mandelbrotCompute.setNumKernels(2); - mandelbrotCompute.run(i, deviceId); - i++; - }while(i < 12); - mandelbrotCompute.printResults(); - - break; - - } - - - default: { - break; - } - - - } - - - - } - - - passed(); -} diff --git a/tests/performance/stream/hipPerfDeviceConcurrency.cpp b/tests/performance/stream/hipPerfDeviceConcurrency.cpp deleted file mode 100644 index 7d6699a9a2..0000000000 --- a/tests/performance/stream/hipPerfDeviceConcurrency.cpp +++ /dev/null @@ -1,289 +0,0 @@ -/* - Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved. - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. - */ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc - * TEST: %t - * HIT_END - */ - -#include -#include -#include "test_common.h" - -typedef struct { - double x; - double y; - double width; -} coordRec; - -static coordRec coords[] = { - {0.0, 0.0, 0.00001}, // All black -}; - -static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); - -__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos, float xStep, - float yStep, uint maxIter) { - - int tid = (blockIdx.x * blockDim.x + threadIdx.x); - int i = tid % width; - int j = tid / width; - float x0 = (float)(xPos + xStep*i); - float y0 = (float)(yPos + yStep*j); - - float x = x0; - float y = y0; - - uint iter = 0; - float tmp; - for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { - tmp = x; - x = fma(-y,y,fma(x,x,x0)); - y = fma(2.0f*tmp,y,y0); - } - - out[tid] = iter; -}; - -class hipPerfDeviceConcurrency { - public: - hipPerfDeviceConcurrency(); - ~hipPerfDeviceConcurrency(); - - void setNumGpus(unsigned int num) { - numDevices = num; - } - unsigned int getNumGpus() { - return numDevices; - } - - void open(void); - void close(void); - void run(unsigned int testCase, int numGpus); - - private: - void setData(void *ptr, unsigned int value); - void checkData(uint *ptr); - - unsigned int numDevices; - unsigned int width_; - unsigned int bufSize; - unsigned int coordIdx; - unsigned long long totalIters = 0; -}; - - -hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() {} - -hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() {} - -void hipPerfDeviceConcurrency::open(void) { - - - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - setNumGpus(nGpu); - if (nGpu < 1) { - std::cout << "info: didn't find any GPU! skipping the test!\n"; - passed(); - } - - -} - - -void hipPerfDeviceConcurrency::close() { -} - -void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { - - - static int deviceId; - uint * hPtr[numGpus]; - uint * dPtr[numGpus]; - hipStream_t streams[numGpus]; - int numCUs[numGpus]; - unsigned int maxIter[numGpus]; - unsigned long long expectedIters[numGpus]; - - int threads, threads_per_block, blocks; - float xStep, yStep, xPos, yPos; - - for(int i = 0; i < numGpus; i++) { - - if(testCase != 0) { - deviceId = i; - } - - HIPCHECK(hipSetDevice(deviceId)); - - hipDeviceProp_t props = {0}; - HIPCHECK(hipGetDeviceProperties(&props, i)); - - if (testCase != 0) { - std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs" << " and device ID: " - << i << std::endl; - } - - numCUs[i] = props.multiProcessorCount; - int clkFrequency = 0; - HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i)); - - clkFrequency =(unsigned int)clkFrequency/1000; - - // Maximum iteration count - // maxIter = 8388608 * (engine_clock / 1000).serial execution - maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128); - maxIter[i] = (maxIter[i] + 15) & ~15; - - // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once. - width_ = 256; - - bufSize = width_ * width_ * sizeof(uint); - - // Create streams for concurrency - HIPCHECK(hipStreamCreate(&streams[i])); - - // Allocate memory on the host and device - HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault)); - setData(hPtr[i], 0xdeadbeef); - HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize)) - - // Prepare kernel launch parameters - threads = (bufSize/sizeof(uint)); - threads_per_block = 64; - blocks = (threads/threads_per_block) + (threads % threads_per_block); - - coordIdx = testCase % numCoords; - xStep = (float)(coords[coordIdx].width / (double)width_); - yStep = (float)(-coords[coordIdx].width / (double)width_); - xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); - yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); - - // Copy memory from host to device - HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice)); - - } - - // Time the kernel execution - auto all_start = std::chrono::steady_clock::now(); - - for(int i = 0; i < numGpus; i++) { - - if(testCase != 0) { - deviceId = i; - } - - HIPCHECK(hipSetDevice(deviceId)); - - hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i], - dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]); - - } - - for(int i = 0; i < numGpus; i++) { - HIPCHECK(hipStreamSynchronize(0)); - } - - - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - - for(int i = 0; i < numGpus; i++) { - - if(testCase != 0) { - deviceId = i; - } - HIPCHECK(hipSetDevice(deviceId)); - - // Copy data back from device to the host - HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost)); - - checkData(hPtr[i]); - expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i]; - - if (testCase != 0) { - checkData(hPtr[i]); - if(totalIters != expectedIters[i]) { - std::cout << "Incorrect iteration count detected" << std::endl; - } - } - - - HIPCHECK(hipStreamDestroy(streams[i])); - - // Free host and device memory - HIPCHECK(hipFree(hPtr[i])); - HIPCHECK(hipFree(dPtr[i])); - } - - if (testCase != 0) { - std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): " - << all_kernel_time.count() << " (s) " << '\n' << std::endl; - } - - if(testCase == 0) { - deviceId++; - } - - -} - - -void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) { - unsigned int *ptr2 = (unsigned int *)ptr; - for (unsigned int i = 0; i < width_ * width_ ; i++) { - ptr2[i] = value; - } -} - - -void hipPerfDeviceConcurrency::checkData(uint *ptr) { - totalIters = 0; - for (unsigned int i = 0; i < width_ * width_; i++) { - totalIters += ptr[i]; - } -} - - -int main(int argc, char* argv[]) { - hipPerfDeviceConcurrency deviceConcurrency; - - deviceConcurrency.open(); - - int nGpu = deviceConcurrency.getNumGpus(); - - // testCase = 0 refers to warmup kernel run - int testCase = 0; - - for (int i = 0; i < nGpu; i++) { - // Warm-up kernel on all devices - deviceConcurrency.run(testCase, 1); - } - - // Time for kernel on 1 device - deviceConcurrency.run(++testCase, 1); - - // Time for kernel on all available devices - deviceConcurrency.run(++testCase, nGpu); - - passed(); -} diff --git a/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupType.cpp b/tests/src/cg/hipCGGridGroupType.cpp old mode 100755 new mode 100644 similarity index 97% rename from tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupType.cpp rename to tests/src/cg/hipCGGridGroupType.cpp index 79f1cb1c38..db45c10512 --- a/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupType.cpp +++ b/tests/src/cg/hipCGGridGroupType.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../test_common.cpp * TEST: %t * HIT_END */ @@ -139,11 +139,7 @@ int main() if (!deviceProperties.cooperativeLaunch) { std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n"; - if (hip_skip_tests_enabled()) { - return hip_skip_retcode(); - } else { - passed(); - } + passed(); return 0; } diff --git a/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupTypeViaBaseType.cpp b/tests/src/cg/hipCGGridGroupTypeViaBaseType.cpp old mode 100755 new mode 100644 similarity index 97% rename from tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupTypeViaBaseType.cpp rename to tests/src/cg/hipCGGridGroupTypeViaBaseType.cpp index 7407f266dd..11562dfff6 --- a/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupTypeViaBaseType.cpp +++ b/tests/src/cg/hipCGGridGroupTypeViaBaseType.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../test_common.cpp * TEST: %t * HIT_END */ @@ -139,11 +139,7 @@ int main() if (!deviceProperties.cooperativeLaunch) { std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n"; - if (hip_skip_tests_enabled()) { - return hip_skip_retcode(); - } else { - passed(); - } + passed(); return 0; } diff --git a/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupTypeViaPublicApi.cpp b/tests/src/cg/hipCGGridGroupTypeViaPublicApi.cpp old mode 100755 new mode 100644 similarity index 97% rename from tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupTypeViaPublicApi.cpp rename to tests/src/cg/hipCGGridGroupTypeViaPublicApi.cpp index cb9d8d7c53..21f0348aec --- a/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupTypeViaPublicApi.cpp +++ b/tests/src/cg/hipCGGridGroupTypeViaPublicApi.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../test_common.cpp * TEST: %t * HIT_END */ @@ -139,11 +139,7 @@ int main() if (!deviceProperties.cooperativeLaunch) { std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n"; - if (hip_skip_tests_enabled()) { - return hip_skip_retcode(); - } else { - passed(); - } + passed(); return 0; } diff --git a/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupType.cpp b/tests/src/cg/hipCGMultiGridGroupType.cpp old mode 100755 new mode 100644 similarity index 92% rename from tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupType.cpp rename to tests/src/cg/hipCGMultiGridGroupType.cpp index 02be0a521b..5a0529867a --- a/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupType.cpp +++ b/tests/src/cg/hipCGMultiGridGroupType.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../test_common.cpp * TEST: %t * HIT_END */ @@ -34,8 +34,6 @@ THE SOFTWARE. #include #define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs) -#define ASSERT_LE(lhs, rhs) assert(lhs <= rhs) -#define ASSERT_GE(lhs, rhs) assert(lhs >= rhs) using namespace cooperative_groups; @@ -195,27 +193,15 @@ static void test_cg_multi_grid_group_type(int blockSize) } // Validate results - int gridsSeen[MaxGPUs]; for (int i = 0; i < nGpu; ++i) { for (int j = 0; j < 2 * blockSize; ++j) { - ASSERT_EQUAL(numGridsTestH[i][j], nGpu); - ASSERT_GE(gridRankTestH[i][j], 0); - ASSERT_LE(gridRankTestH[i][j], nGpu-1); - ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]); + //ASSERT_EQUAL(numGridsTestH[i][j], nGpu); + //ASSERT_EQUAL(gridRankTestH[i][j], i); ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize); - int gridRank = gridRankTestH[i][j]; - ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j); + ASSERT_EQUAL(thdRankTestH[i][j], (i * 2 * blockSize) + j); ASSERT_EQUAL(isValidTestH[i][j], 1); } ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize); - - // Validate uniqueness property of grid rank - gridsSeen[i] = gridRankTestH[i][0]; - for (int k = 0; k < i; ++k) { - if (gridsSeen[k] == gridsSeen[i]) { - assert (false && "Grid rank in multi-gpu setup should be unique"); - } - } } ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize); diff --git a/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupTypeViaBaseType.cpp b/tests/src/cg/hipCGMultiGridGroupTypeViaBaseType.cpp similarity index 83% rename from tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupTypeViaBaseType.cpp rename to tests/src/cg/hipCGMultiGridGroupTypeViaBaseType.cpp index 0830e807c3..dae72f4cf8 100644 --- a/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupTypeViaBaseType.cpp +++ b/tests/src/cg/hipCGMultiGridGroupTypeViaBaseType.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../test_common.cpp * TEST: %t * HIT_END */ @@ -34,14 +34,11 @@ THE SOFTWARE. #include #define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs) -#define ASSERT_LE(lhs, rhs) assert(lhs <= rhs) -#define ASSERT_GE(lhs, rhs) assert(lhs >= rhs) using namespace cooperative_groups; static __global__ void kernel_cg_multi_grid_group_type_via_base_type(int *sizeTestD, - int* gridRankTestD, int *thdRankTestD, int *isValidTestD, int *syncTestD, @@ -54,7 +51,6 @@ void kernel_cg_multi_grid_group_type_via_base_type(int *sizeTestD, sizeTestD[gIdx] = tg.size(); // Test thread_rank - gridRankTestD[gIdx] = this_multi_grid().grid_rank(); thdRankTestD[gIdx] = tg.thread_rank(); // Test is_valid @@ -114,7 +110,6 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize) // Allocate host and device memory int nBytes = sizeof(int) * 2 * blockSize; int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs]; - int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs]; int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs]; int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs]; int *syncTestD[MaxGPUs], *syncResultD; @@ -122,13 +117,11 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize) ASSERT_EQUAL(hipSetDevice(i), hipSuccess); ASSERT_EQUAL(hipMalloc(&sizeTestD[i], nBytes), hipSuccess); - ASSERT_EQUAL(hipMalloc(&gridRankTestD[i], nBytes), hipSuccess); ASSERT_EQUAL(hipMalloc(&thdRankTestD[i], nBytes), hipSuccess); ASSERT_EQUAL(hipMalloc(&isValidTestD[i], nBytes), hipSuccess); ASSERT_EQUAL(hipMalloc(&syncTestD[i], nBytes), hipSuccess); ASSERT_EQUAL(hipHostMalloc(&sizeTestH[i], nBytes), hipSuccess); - ASSERT_EQUAL(hipHostMalloc(&gridRankTestH[i], nBytes), hipSuccess); ASSERT_EQUAL(hipHostMalloc(&thdRankTestH[i], nBytes), hipSuccess); ASSERT_EQUAL(hipHostMalloc(&isValidTestH[i], nBytes), hipSuccess); @@ -142,18 +135,17 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize) } // Launch Kernel - constexpr int NumKernelArgs = 6; + constexpr int NumKernelArgs = 5; hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu]; void* args[MaxGPUs * NumKernelArgs]; for (int i = 0; i < nGpu; i++) { ASSERT_EQUAL(hipSetDevice(i), hipSuccess); args[i * NumKernelArgs ] = &sizeTestD[i]; - args[i * NumKernelArgs + 1] = &gridRankTestD[i]; - args[i * NumKernelArgs + 2] = &thdRankTestD[i]; - args[i * NumKernelArgs + 3] = &isValidTestD[i]; - args[i * NumKernelArgs + 4] = &syncTestD[i]; - args[i * NumKernelArgs + 5] = &syncResultD; + args[i * NumKernelArgs + 1] = &thdRankTestD[i]; + args[i * NumKernelArgs + 2] = &isValidTestD[i]; + args[i * NumKernelArgs + 3] = &syncTestD[i]; + args[i * NumKernelArgs + 4] = &syncResultD; launchParamsList[i].func = reinterpret_cast(kernel_cg_multi_grid_group_type_via_base_type); launchParamsList[i].gridDim = 2; @@ -172,8 +164,6 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize) ASSERT_EQUAL(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost), hipSuccess); - ASSERT_EQUAL(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost), - hipSuccess); ASSERT_EQUAL(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost), hipSuccess); ASSERT_EQUAL(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost), @@ -183,26 +173,13 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize) } // Validate results - int gridsSeen[MaxGPUs]; for (int i = 0; i < nGpu; ++i) { for (int j = 0; j < 2 * blockSize; ++j) { ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize); - ASSERT_GE(gridRankTestH[i][j], 0); - ASSERT_LE(gridRankTestH[i][j], nGpu-1); - ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]); - int gridRank = gridRankTestH[i][j]; - ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j); + ASSERT_EQUAL(thdRankTestH[i][j], (i * 2 * blockSize) + j); ASSERT_EQUAL(isValidTestH[i][j], 1); } ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize); - - // Validate uniqueness property of grid rank - gridsSeen[i] = gridRankTestH[i][0]; - for (int k = 0; k < i; ++k) { - if (gridsSeen[k] == gridsSeen[i]) { - assert (false && "Grid rank in multi-gpu setup should be unique"); - } - } } ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize); @@ -212,7 +189,6 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize) ASSERT_EQUAL(hipSetDevice(i), hipSuccess); ASSERT_EQUAL(hipFree(sizeTestD[i]), hipSuccess); - ASSERT_EQUAL(hipFree(gridRankTestD[i]), hipSuccess); ASSERT_EQUAL(hipFree(thdRankTestD[i]), hipSuccess); ASSERT_EQUAL(hipFree(isValidTestD[i]), hipSuccess); ASSERT_EQUAL(hipFree(syncTestD[i]), hipSuccess); @@ -221,7 +197,6 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize) ASSERT_EQUAL(hipFree(syncResultD), hipSuccess); ASSERT_EQUAL(hipHostFree(sizeTestH[i]), hipSuccess); - ASSERT_EQUAL(hipHostFree(gridRankTestH[i]), hipSuccess); ASSERT_EQUAL(hipHostFree(thdRankTestH[i]), hipSuccess); ASSERT_EQUAL(hipHostFree(isValidTestH[i]), hipSuccess); diff --git a/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupTypeViaPublicApi.cpp b/tests/src/cg/hipCGMultiGridGroupTypeViaPublicApi.cpp similarity index 83% rename from tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupTypeViaPublicApi.cpp rename to tests/src/cg/hipCGMultiGridGroupTypeViaPublicApi.cpp index 5975ffa068..2f2f378931 100644 --- a/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupTypeViaPublicApi.cpp +++ b/tests/src/cg/hipCGMultiGridGroupTypeViaPublicApi.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../test_common.cpp * TEST: %t * HIT_END */ @@ -34,14 +34,11 @@ THE SOFTWARE. #include #define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs) -#define ASSERT_LE(lhs, rhs) assert(lhs <= rhs) -#define ASSERT_GE(lhs, rhs) assert(lhs >= rhs) using namespace cooperative_groups; static __global__ void kernel_cg_multi_grid_group_type_via_public_api(int *sizeTestD, - int* gridRankTestD, int *thdRankTestD, int *isValidTestD, int *syncTestD, @@ -54,7 +51,6 @@ void kernel_cg_multi_grid_group_type_via_public_api(int *sizeTestD, sizeTestD[gIdx] = group_size(mg); // Test thread_rank api - gridRankTestD[gIdx] = this_multi_grid().grid_rank(); thdRankTestD[gIdx] = thread_rank(mg); // Test is_valid api @@ -114,7 +110,6 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize) // Allocate host and device memory int nBytes = sizeof(int) * 2 * blockSize; int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs]; - int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs]; int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs]; int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs]; int *syncTestD[MaxGPUs], *syncResultD; @@ -122,13 +117,11 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize) ASSERT_EQUAL(hipSetDevice(i), hipSuccess); ASSERT_EQUAL(hipMalloc(&sizeTestD[i], nBytes), hipSuccess); - ASSERT_EQUAL(hipMalloc(&gridRankTestD[i], nBytes), hipSuccess); ASSERT_EQUAL(hipMalloc(&thdRankTestD[i], nBytes), hipSuccess); ASSERT_EQUAL(hipMalloc(&isValidTestD[i], nBytes), hipSuccess); ASSERT_EQUAL(hipMalloc(&syncTestD[i], nBytes), hipSuccess); ASSERT_EQUAL(hipHostMalloc(&sizeTestH[i], nBytes), hipSuccess); - ASSERT_EQUAL(hipHostMalloc(&gridRankTestH[i], nBytes), hipSuccess); ASSERT_EQUAL(hipHostMalloc(&thdRankTestH[i], nBytes), hipSuccess); ASSERT_EQUAL(hipHostMalloc(&isValidTestH[i], nBytes), hipSuccess); @@ -142,18 +135,17 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize) } // Launch Kernel - constexpr int NumKernelArgs = 6; + constexpr int NumKernelArgs = 5; hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu]; void* args[MaxGPUs * NumKernelArgs]; for (int i = 0; i < nGpu; i++) { ASSERT_EQUAL(hipSetDevice(i), hipSuccess); args[i * NumKernelArgs ] = &sizeTestD[i]; - args[i * NumKernelArgs + 1] = &gridRankTestD[i]; - args[i * NumKernelArgs + 2] = &thdRankTestD[i]; - args[i * NumKernelArgs + 3] = &isValidTestD[i]; - args[i * NumKernelArgs + 4] = &syncTestD[i]; - args[i * NumKernelArgs + 5] = &syncResultD; + args[i * NumKernelArgs + 1] = &thdRankTestD[i]; + args[i * NumKernelArgs + 2] = &isValidTestD[i]; + args[i * NumKernelArgs + 3] = &syncTestD[i]; + args[i * NumKernelArgs + 4] = &syncResultD; launchParamsList[i].func = reinterpret_cast(kernel_cg_multi_grid_group_type_via_public_api); launchParamsList[i].gridDim = 2; @@ -172,8 +164,6 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize) ASSERT_EQUAL(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost), hipSuccess); - ASSERT_EQUAL(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost), - hipSuccess); ASSERT_EQUAL(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost), hipSuccess); ASSERT_EQUAL(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost), @@ -183,26 +173,13 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize) } // Validate results - int gridsSeen[MaxGPUs]; for (int i = 0; i < nGpu; ++i) { for (int j = 0; j < 2 * blockSize; ++j) { ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize); - ASSERT_GE(gridRankTestH[i][j], 0); - ASSERT_LE(gridRankTestH[i][j], nGpu-1); - ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]); - int gridRank = gridRankTestH[i][j]; - ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j); + ASSERT_EQUAL(thdRankTestH[i][j], (i * 2 * blockSize) + j); ASSERT_EQUAL(isValidTestH[i][j], 1); } ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize); - - // Validate uniqueness property of grid rank - gridsSeen[i] = gridRankTestH[i][0]; - for (int k = 0; k < i; ++k) { - if (gridsSeen[k] == gridsSeen[i]) { - assert (false && "Grid rank in multi-gpu setup should be unique"); - } - } } ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize); @@ -212,7 +189,6 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize) ASSERT_EQUAL(hipSetDevice(i), hipSuccess); ASSERT_EQUAL(hipFree(sizeTestD[i]), hipSuccess); - ASSERT_EQUAL(hipFree(gridRankTestD[i]), hipSuccess); ASSERT_EQUAL(hipFree(thdRankTestD[i]), hipSuccess); ASSERT_EQUAL(hipFree(isValidTestD[i]), hipSuccess); ASSERT_EQUAL(hipFree(syncTestD[i]), hipSuccess); @@ -221,7 +197,6 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize) ASSERT_EQUAL(hipFree(syncResultD), hipSuccess); ASSERT_EQUAL(hipHostFree(sizeTestH[i]), hipSuccess); - ASSERT_EQUAL(hipHostFree(gridRankTestH[i]), hipSuccess); ASSERT_EQUAL(hipHostFree(thdRankTestH[i]), hipSuccess); ASSERT_EQUAL(hipHostFree(isValidTestH[i]), hipSuccess); diff --git a/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockType.cpp b/tests/src/cg/hipCGThreadBlockType.cpp old mode 100755 new mode 100644 similarity index 95% rename from tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockType.cpp rename to tests/src/cg/hipCGThreadBlockType.cpp index dccac38bf3..4e1de9e44a --- a/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockType.cpp +++ b/tests/src/cg/hipCGThreadBlockType.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../test_common.cpp * TEST: %t * HIT_END */ @@ -166,16 +166,6 @@ int main() ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess); int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock; - if (!deviceProperties.cooperativeLaunch) { - std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n"; - if (hip_skip_tests_enabled()) { - return hip_skip_retcode(); - } else { - passed(); - } - return 0; - } - // Test block sizes which are powers of 2 int i = 0; while (true) { diff --git a/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockTypeViaBaseType.cpp b/tests/src/cg/hipCGThreadBlockTypeViaBaseType.cpp old mode 100755 new mode 100644 similarity index 94% rename from tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockTypeViaBaseType.cpp rename to tests/src/cg/hipCGThreadBlockTypeViaBaseType.cpp index b0a42782c0..d4c9402268 --- a/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockTypeViaBaseType.cpp +++ b/tests/src/cg/hipCGThreadBlockTypeViaBaseType.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../test_common.cpp * TEST: %t * HIT_END */ @@ -135,16 +135,6 @@ int main() ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess); int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock; - if (!deviceProperties.cooperativeLaunch) { - std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n"; - if (hip_skip_tests_enabled()) { - return hip_skip_retcode(); - } else { - passed(); - } - return 0; - } - // Test block sizes which are powers of 2 int i = 0; while (true) { diff --git a/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockTypeViaPublicApi.cpp b/tests/src/cg/hipCGThreadBlockTypeViaPublicApi.cpp old mode 100755 new mode 100644 similarity index 94% rename from tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockTypeViaPublicApi.cpp rename to tests/src/cg/hipCGThreadBlockTypeViaPublicApi.cpp index e4a6a6e330..d13e58b059 --- a/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockTypeViaPublicApi.cpp +++ b/tests/src/cg/hipCGThreadBlockTypeViaPublicApi.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../test_common.cpp * TEST: %t * HIT_END */ @@ -135,16 +135,6 @@ int main() ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess); int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock; - if (!deviceProperties.cooperativeLaunch) { - std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n"; - if (hip_skip_tests_enabled()) { - return hip_skip_retcode(); - } else { - passed(); - } - return 0; - } - // Test block sizes which are powers of 2 int i = 0; while (true) { diff --git a/tests/src/kernel/hipShflTests.cpp b/tests/src/kernel/hipShflTests.cpp index 06b6a90b83..9b1cc73248 100644 --- a/tests/src/kernel/hipShflTests.cpp +++ b/tests/src/kernel/hipShflTests.cpp @@ -57,15 +57,6 @@ void matrixTransposeCPUReference(T* output, T* input, const unsigned int width) } } -void getFactor(int& fact) { fact = 101; } -void getFactor(unsigned int& fact) { fact = static_cast(INT32_MAX)+1; } -void getFactor(float& fact) { fact = 2.5; } -void getFactor(double& fact) { fact = 2.5; } -void getFactor(long& fact) { fact = 202; } -void getFactor(unsigned long& fact) { fact = static_cast(__LONG_MAX__)+1; } -void getFactor(long long& fact) { fact = 303; } -void getFactor(unsigned long long& fact) { fact = static_cast(__LONG_LONG_MAX__)+1; } - template void runTest() { T* Matrix; @@ -86,10 +77,8 @@ void runTest() { cpuTransposeMatrix = (T*)malloc(NUM * sizeof(T)); // initialize the input data - T factor; - getFactor(factor); for (i = 0; i < NUM; i++) { - Matrix[i] = (T)i + factor; + Matrix[i] = (T)i * 10l; } // allocate the memory on the device side @@ -135,11 +124,7 @@ void runTest() { int main() { runTest(); runTest(); - runTest(); runTest(); runTest(); - runTest(); - runTest(); - runTest(); passed(); } diff --git a/tests/src/kernel/hipShflUpDownTest.cpp b/tests/src/kernel/hipShflUpDownTest.cpp index cd3900aee5..553087ce45 100644 --- a/tests/src/kernel/hipShflUpDownTest.cpp +++ b/tests/src/kernel/hipShflUpDownTest.cpp @@ -47,31 +47,13 @@ __global__ void shflUpSum(T* a, int size) { a[threadIdx.x] = val; } -template -__global__ void shflXorSum(T* a, int size) { - T val = a[threadIdx.x]; - for (int i = size/2; i > 0; i /= 2) - val += __shfl_xor(val, i, size); - a[threadIdx.x] = val; -} - -void getFactor(int& fact) { fact = 101; } -void getFactor(unsigned int& fact) { fact = static_cast(INT32_MAX)+1; } -void getFactor(float& fact) { fact = 2.5; } -void getFactor(double& fact) { fact = 2.5; } -void getFactor(long& fact) { fact = 202; } -void getFactor(unsigned long& fact) { fact = static_cast(__LONG_MAX__)+1; } -void getFactor(long long& fact) { fact = 303; } -void getFactor(unsigned long long& fact) { fact = static_cast(__LONG_LONG_MAX__)+1; } - template void runTestShflUp() { const int size = 32; T a[size]; T cpuSum = 0; - T factor; getFactor(factor); for (int i = 0; i < size; i++) { - a[i] = i + factor; + a[i] = i; cpuSum += a[i]; } T* d_a; @@ -91,9 +73,8 @@ void runTestShflDown() { const int size = 32; T a[size]; T cpuSum = 0; - T factor; getFactor(factor); for (int i = 0; i < size; i++) { - a[i] = i + factor; + a[i] = i; cpuSum += a[i]; } T* d_a; @@ -103,58 +84,19 @@ void runTestShflDown() { hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault); if (a[0] != cpuSum) { hipFree(d_a); - failed("Shfl Down Sum did not match."); - } - hipFree(d_a); -} - -template -void runTestShflXor() { - const int size = 32; - T a[size]; - T cpuSum = 0; - T factor; getFactor(factor); - for (int i = 0; i < size; i++) { - a[i] = i + factor; - cpuSum += a[i]; - } - T* d_a; - hipMalloc(&d_a, sizeof(T) * size); - hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault); - hipLaunchKernelGGL(shflXorSum, 1, size, 0, 0, d_a, size); - hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault); - if (a[0] != cpuSum) { - hipFree(d_a); - failed("Shfl Xor Sum did not match."); + failed("Shfl Up Sum did not match."); } hipFree(d_a); } int main() { runTestShflUp(); runTestShflUp(); - runTestShflUp(); runTestShflUp(); runTestShflUp(); - runTestShflUp(); - runTestShflUp(); - runTestShflUp(); runTestShflDown(); runTestShflDown(); - runTestShflDown(); runTestShflDown(); runTestShflDown(); - runTestShflDown(); - runTestShflDown(); - runTestShflDown(); - - runTestShflXor(); - runTestShflXor(); - runTestShflXor(); - runTestShflXor(); - runTestShflXor(); - runTestShflXor(); - runTestShflXor(); - runTestShflXor(); passed(); } diff --git a/tests/src/p2p/hipPeerToPeer_simple.cpp b/tests/src/p2p/hipPeerToPeer_simple.cpp old mode 100755 new mode 100644 index 13779694e2..9f0982f353 --- a/tests/src/p2p/hipPeerToPeer_simple.cpp +++ b/tests/src/p2p/hipPeerToPeer_simple.cpp @@ -395,9 +395,6 @@ int main(int argc, char* argv[]) { if (gpuCount < 2) { printf("P2P application requires atleast 2 gpu devices\n"); - if (hip_skip_tests_enabled()) { - return hip_skip_retcode(); - } } else { if (p_tests & 0x100) { testPeerHostToDevice(false /*useAsyncCopy*/); diff --git a/tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp b/tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp deleted file mode 100644 index f073d7f72e..0000000000 --- a/tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp +++ /dev/null @@ -1,280 +0,0 @@ -/* - Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ -// Test Description: -/*The general idea of the application is to test how Cooperative Groups kernel -launches work when launching too many warps to the target device. This test -first queries the nominal warp size of the target device. It then walks through -block sizes from 1 thread, 1 warp, 2 warps, ... `maximum_warps_in_a_block`. For -each of these, it queries the maximum number of blocks that can fit in each SM. -It then queries the number of SMs on the target device. This will yield a -calculation for the maximum number of blocks that can be co-scheduled on this -device. - -The Cooperative Groups API says that users should not launch more than this -many warps (or blocks, etc.) to the target device. This test first tires to -launch 2x as many blcoks, to confirm that the runtime prevents such a launch -by returning a proper error value (`hipErrorCooperativeLaunchTooLarge`). - -It then ensures that trying to launch too large of a kernel invocation does -not break the GPU by launching a kernel with exactly the maximum number of -blocks. - -Finally, we run the same test for a block size that is larger than the maximum -allowed by the device, to ensure that this case is properly detected by the -runtime and that nothing breaks.*/ - - - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp - * TEST: %t - * HIT_END - */ - - -#include -#include -#include "test_common.h" - - -static inline void hipCheckAndFail(hipError_t errval, - const char *file, int line) { - hipError_t last_err = hipGetLastError(); - if (errval != hipSuccess) { - std::cerr << "hip error: " << hipGetErrorString(errval); - std::cerr << std::endl; - std::cerr << " Location: " << file << ":" << line << std::endl; - failed(""); - } - if (last_err != errval) { - std::cerr << "Error: the return value of a function was not the same "; - std::cerr << "as the value returned by hipGetLastError()" << std::endl; - std::cerr << " Location: " << file << ":" << line << std::endl; - std::cerr << " Function returned: " << hipGetErrorString(errval); - std::cerr << " (" << errval << ")" << std::endl; - std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err); - std::cerr << " (" << last_err << ")" << std::endl; - failed(""); - } -} -#define hipCheckErr(errval) \ - do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0) - -static inline bool hipCheckExpected(hipError_t errval, - hipError_t expected_err, const char *file, int line) { - hipError_t last_err = hipGetLastError(); - if (errval != expected_err) { - std::cerr << "hip error: " << hipGetErrorString(errval); - std::cerr << std::endl; - std::cerr << " Location: " << file << ":" << line << std::endl; - return false; - } - if (last_err != errval) { - std::cerr << "Error: the return value of a function was not the same "; - std::cerr << "as the value returned by hipGetLastError()" << std::endl; - std::cerr << " Location: " << file << ":" << line << std::endl; - std::cerr << " Function returned: " << hipGetErrorString(errval); - std::cerr << " (" << errval << ")" << std::endl; - std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err); - std::cerr << " (" << last_err << ")" << std::endl; - return false; - } - return true; -} - -static bool cooperative_groups_support(int device_id) { - hipError_t err; - int cooperative_attribute; - HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute, - hipDeviceAttributeCooperativeLaunch, device_id)); - if (!cooperative_attribute) { - std::cerr << "Cooperative launch support not available in "; - std::cerr << "the device attribute for device " << device_id; - std::cerr << std::endl; - return false; - } - hipDeviceProp_t device_properties; - HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); - if (device_properties.cooperativeLaunch == 0) { - std::cerr << "Cooperative group support not available in "; - std::cerr << "device properties." << std::endl; - return false; - } - return true; -} - -__global__ void test_kernel(long long *array) { - unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; - array[rank] += clock64(); -} - -int main(int argc, char** argv) { - hipError_t err; - int device_num, FailFlag = 0; - // Alocate the host input buffer, and two device-focused buffers that we - // will use for our test. - unsigned int *dev_array[2]; - HIPCHECK(hipGetDeviceCount(&device_num)); - for (int dev = 0; dev < device_num; ++dev) { - /*************************************************************************/ - /* Test whether target device supports cooperative groups ****************/ - HIPCHECK(hipSetDevice(dev)); - if (!cooperative_groups_support(dev)) { - std::cout << "Skipping the test with Pass result.\n"; - passed(); - } - - /*************************************************************************/ - /* Create the streams we will use in this test. **************************/ - hipStream_t streams[2]; - for (int i = 0; i < 2; i++) { - HIPCHECK(hipStreamCreate(&streams[i])); - } - - /*************************************************************************/ - /* We will try to launch more waves than the GPU can fit. ***************/ - hipDeviceProp_t device_properties; - HIPCHECK(hipGetDeviceProperties(&device_properties, dev)); - int warp_size = device_properties.warpSize; - int num_sms = device_properties.multiProcessorCount; - int max_num_threads = device_properties.maxThreadsPerBlock; - - // Check single-thread block, all numbers of warps, then too-large block - for (int block_size = 0; block_size <= (max_num_threads + warp_size); - block_size += warp_size) { - if (block_size == 0) { - block_size = 1; - } - int max_blocks_per_sm; - // Calculate the device occupancy to know how many blocks can be run. - HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - &max_blocks_per_sm, test_kernel, block_size, 0, - hipOccupancyDefault)); - - if ((block_size > max_num_threads) && (max_blocks_per_sm != 0)) { - std::cerr << "ERROR! Occupancy API indicated that we can have >0 "; - std::cerr << "blocks in a kernel when the block size is too large "; - std::cerr << "to work on the device." << std::endl; - std::cerr << "This is incorrect, and could possibly lead users "; - std::cerr << "to try to launch kernels that will fail." << std::endl; - //failed(""); - FailFlag = 1; - break; - } - - int desired_blocks = max_blocks_per_sm * num_sms; - bool expect_fail = false; - if (desired_blocks == 0) { - desired_blocks = 1; - expect_fail = true; - } - - /**********************************************************************/ - /* Set up data to pass into the kernel ********************************/ - - for (int i = 0; i < 2; i++) { - int test_size; - // Case where we expect to fail at launch. - if (i == 0) { - test_size = 2 * desired_blocks; - } else { - test_size = desired_blocks; - } - HIPCHECK(hipMalloc(reinterpret_cast(&dev_array[i]), - test_size * block_size * sizeof(long long))); - HIPCHECK(hipMemsetAsync(dev_array[i], 0, - test_size * block_size * sizeof(long long), - streams[i])); - } - - HIPCHECK(hipDeviceSynchronize()); - - /***********************************************************************/ - /* Launch the kernels **************************************************/ - void *coop_params[2][1]; - for (int i = 0; i < 2; i++) { - coop_params[i][0] = reinterpret_cast(&dev_array[i]); - } - - err = hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), - 2 * desired_blocks, block_size, - coop_params[0], 0, streams[0]); - - hipError_t expect_to_see; - if (expect_fail) { - expect_to_see = hipErrorInvalidConfiguration; - } else { - expect_to_see = hipErrorCooperativeLaunchTooLarge; - } - if (!hipCheckExpected(err, expect_to_see, __FILE__, __LINE__)) { - std::cerr << "ERROR! Tried to launch a cooperative kernel with "; - std::cerr << "too many warps." << std::endl; - std::cerr << "This SHOULD have failed with the error "; - std::cerr << hipGetErrorString(expect_to_see); - std::cerr << " (" << expect_to_see << ")." << std::endl; - std::cerr << "Instead, the launch returned " << hipGetErrorName(err); - std::cerr << " (" << err << ")" << std::endl; - FailFlag = 1; - break; - } - - HIPCHECK(hipDeviceSynchronize()); - err = hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), - desired_blocks, block_size, - coop_params[1], 0, streams[1]); - - if (expect_fail) { - expect_to_see = hipErrorInvalidConfiguration; - } else { - expect_to_see = hipSuccess; - } - if (!hipCheckExpected(err, expect_to_see, __FILE__, __LINE__)) { - std::cerr << "ERROR! Tried to launch a cooperative kernel "; - std::cerr << "with a normal size, but a block size of "; - std::cerr << desired_blocks << std::endl; - std::cerr << "This SHOULD have returned "; - std::cerr << hipGetErrorString(expect_to_see); - std::cerr << " (" << expect_to_see << ")." << std::endl; - std::cerr << "Instead, the launch returned " << hipGetErrorName(err); - std::cerr << " (" << err << ")" << std::endl; - FailFlag = 1; - break; - } - - HIPCHECK(hipDeviceSynchronize()); - - if (block_size == 1) { - block_size = 0; - } - for (int m = 0; m < 2; ++m) { - HIPCHECK(hipFree(dev_array[m])); - } - } - for (int m = 0; m < 2; ++m) { - HIPCHECK(hipStreamDestroy(streams[m])); - } - if (FailFlag == 1) { - for (int m = 0; m < 2; ++m) { - HIPCHECK(hipFree(dev_array[m])); - } - failed(""); - } - } - passed(); -} diff --git a/tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp b/tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp deleted file mode 100644 index c9adc03b24..0000000000 --- a/tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp +++ /dev/null @@ -1,283 +0,0 @@ -/* -Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR -IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ -// Test Description: -/* -The general idea of the application is to test how Cooperative Groups kernel -launches to a stream interact with other kernels being launched to different -streams. - -For example: the HIP runtime will force cooperative kernel launches to run -serially, even if they are launched to different streams. However, -cooperative kernel launches can run in parallel with regular kernels that -are launched to other streams. This limitation is so that the cooperative -kernels do not conflict with one another for resources and potentially -deadlock the system. - -As such, this benchmark tests three situations: - - 1. Launching a cooperative kernel by itself to stream[0] - 2. Launching two cooperative kernels in parallel to stream[0] and stream[1] - 3. Launching two cooperative kernels in parallel to stream[0] and stream[1] - and launching a third non-cooperative kernel to stream[2] - -We time how long it takes to run each of these benchmarks and print it as -the output of the benchmark. The kernels themselves are just useless time- -wasting code so that the kernel takes a meaningful amount of time on the -GPU before it exits. We only launch a single wavefront for each kernel, so -any serialization should not be because of GPU occupancy concerns. - -If test #2 takes roughly twice as long as #1, that implies that cooperative -kernels are properly serialized with each other by the runtime. - -If test #3 takes the same amount of time as test #2, that implies that -regular kernels can properly run in parallel with cooperative kernels. -*/ - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 - * TEST: %t - * HIT_END - */ - -#include -#include -#include -#include "test_common.h" - -static inline void hipCheckAndFail(hipError_t errval, - const char *file, int line) { - hipError_t last_err = hipGetLastError(); - if (errval != hipSuccess) { - std::cerr << "hip error: " << hipGetErrorString(errval); - std::cerr << std::endl; - std::cerr << "Location: " << file << ":" << line << std::endl; - failed(""); - } - if (last_err != errval) { - std::cerr << "Error: the return value of a function was not the same "; - std::cerr << "as the value returned by hipGetLastError()" << std::endl; - std::cerr << "Location: " << file << ":" << line << std::endl; - std::cerr << "Function returned: " << hipGetErrorString(errval); - std::cerr << " (" << errval << ")" << std::endl; - std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err); - std::cerr << " (" << last_err << ")" << std::endl; - failed(""); - } -} -#define hipCheckErr(errval) \ - do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0) - -static int cooperative_groups_support(int device_id) { - hipError_t err; - int cooperative_attribute; - HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute, - hipDeviceAttributeCooperativeLaunch, device_id)); - if (!cooperative_attribute) { - std::cerr << "Cooperative launch support not available in "; - std::cerr << "the device attribute for device " << device_id; - std::cerr << std::endl; - return 0; - } - - hipDeviceProp_t device_properties; - HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); - if (device_properties.cooperativeLaunch == 0) { - std::cerr << "Cooperative group support not available in "; - std::cerr << "device properties." << std::endl; - return 0; - } - return 1; -} - -__global__ void test_kernel(uint32_t loops, unsigned long long *array) { - unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; - - for (int i = 0; i < loops; i++) { - long long start_clock = clock64(); - while (clock64() < (start_clock+1000000)) {} - array[rank] += clock64(); - } -} - -int main(int argc, char** argv) { - hipError_t err; - /*************************************************************************/ - int device_num = 0, loops = 1000, FailFlag = 0; - /* Create the streams we will use in this test. **************************/ - hipStream_t streams[3]; - // Alocate the host input buffer, and two device-focused buffers that we - // will use for our test. - unsigned long long *dev_array[3]; - HIPCHECK(hipGetDeviceCount(&device_num)); - for (int dev = 0; dev < device_num; ++dev) { - /*************************************************************************/ - /* Test whether target device supports cooperative groups ****************/ - HIPCHECK(hipSetDevice(dev)); - if (!cooperative_groups_support(dev)) { - std::cout << "Skipping the test with Pass result.\n"; - passed(); - } - - /*************************************************************************/ - /* We will launch enough waves to fill up all of the GPU *****************/ - hipDeviceProp_t device_properties; - HIPCHECK(hipGetDeviceProperties(&device_properties, dev)); - int warp_size = device_properties.warpSize; - int num_sms = device_properties.multiProcessorCount; - int desired_blocks = 1; - std::cout << "Device: " << dev << std::endl; - std::cout << "Device name: " << device_properties.name << std::endl; - - int max_blocks_per_sm; - // Calculate the device occupancy to know how many blocks can be run. - HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, - test_kernel, - warp_size, 0)); - - if (desired_blocks > max_blocks_per_sm * num_sms) { - std::cerr << "The requested number of blocks will not fit on the GPU"; - std::cerr << std::endl; - std::cerr << "You requested " << desired_blocks << " but we can only "; - std::cerr << "fit " << (max_blocks_per_sm * num_sms) << std::endl; - failed(""); - } - - /*************************************************************************/ - for (int i = 0; i < 3; i++) { - HIPCHECK(hipStreamCreate(&streams[i])); - } - - /*************************************************************************/ - /* Set up data to pass into the kernel ***********************************/ - - for (int i = 0; i < 3; i++) { - HIPCHECK(hipMalloc(reinterpret_cast(&dev_array[i]), - warp_size * sizeof(long long))); - HIPCHECK(hipMemsetAsync(dev_array[i], 0, warp_size * sizeof(long long), - streams[i])); - } - - HIPCHECK(hipDeviceSynchronize()); - - /*************************************************************************/ - /* Launch the kernels ****************************************************/ - void *coop_params[3][2]; - for (int i = 0; i < 3; i++) { - coop_params[i][0] = reinterpret_cast(&loops); - coop_params[i][1] = reinterpret_cast(&dev_array[i]); - } - - std::cout << "Launching a single cooperative kernel..." << std::endl; - auto single_start = std::chrono::system_clock::now(); - HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), - desired_blocks, warp_size, - coop_params[0], 0, streams[0])); - - HIPCHECK(hipDeviceSynchronize()); - auto single_end = std::chrono::system_clock::now(); - std::cout << "Launching 2 cooperative kernels to different streams..."; - std::cout << std::endl; - - auto double_start = std::chrono::system_clock::now(); - HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), - desired_blocks, warp_size, - coop_params[0], 0, streams[0])); - HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), - desired_blocks, warp_size, - coop_params[1], 0, streams[1])); - - HIPCHECK(hipDeviceSynchronize()); - auto double_end = std::chrono::system_clock::now(); - std::cout << "Launching 2 cooperative kernels and 1 normal kernel..."; - std::cout << std::endl; - - auto triple_start = std::chrono::system_clock::now(); - HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), - desired_blocks, warp_size, - coop_params[0], 0, streams[0])); - HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), - desired_blocks, warp_size, - coop_params[1], 0, streams[1])); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), - 0, streams[2], loops, dev_array[2]); - err = hipGetLastError(); - hipCheckErr(err); - - HIPCHECK(hipDeviceSynchronize()); - auto triple_end = std::chrono::system_clock::now(); - std::chrono::duration single_kernel_time = - (single_end - single_start); - std::chrono::duration double_kernel_time = - (double_end - double_start); - std::chrono::duration triple_kernel_time = - (triple_end - triple_start); - - std::cout << "A single kernel took:" << std::endl; - std::cout << " " << single_kernel_time.count(); - std::cout << " seconds" << std::endl; - std::cout << std::endl; - std::cout << "Two cooperative kernels that could run together took:"; - std::cout << std::endl; - std::cout << " " << double_kernel_time.count(); - std::cout << " seconds" << std::endl; - std::cout << std::endl; - std::cout << "Two coop kernels and a third regular kernel took:"; - std::cout << std::endl << " "; - std::cout << triple_kernel_time.count(); - std::cout << " seconds" << std::endl; - - std::cout << "Testing whether these times make sense.." << std::endl; - // Test that two cooperative kernels is roughly twice as long as one - if (double_kernel_time < 1.8 * single_kernel_time) { - std::cerr << "ERROR!" << std::endl; - std::cerr << "Two cooperative kernels launched at the same "; - std::cerr << "time did not take roughly twice as long as a single "; - std::cerr << "cooperative kernel." << std::endl; - std::cerr << "Were they truly serialized?" << std::endl; - FailFlag = 1; - break; - } - - // Test that the three kernels together took roughly as long as two - // cooperative kernels. - if (triple_kernel_time > 1.1 * double_kernel_time) { - std::cerr << "ERROR!" << std::endl; - std::cerr << "Launching a normal kernel in parallel with two "; - std::cerr << "back-to-back cooperative kernels still ended up taking "; - std::cerr << "more than 10% longer than the two cooperative kernels "; - std::cerr << "alone." << std::endl; - std::cerr << "Is the normal kernel being serialized with the "; - std::cerr << "cooperative kernels on different streams?" << std::endl; - FailFlag = 1; - break; - } - for (int k = 0; k < 3; ++k) { - HIPCHECK(hipFree(dev_array[k])); - HIPCHECK(hipStreamDestroy(streams[k])); - } - } - if (FailFlag == 1) { - for (int k = 0; k < 3; ++k) { - HIPCHECK(hipFree(dev_array[k])); - HIPCHECK(hipStreamDestroy(streams[k])); - } - failed(""); - } - passed(); -} diff --git a/tests/src/runtimeApi/cooperativeGrps/grid_group_data_sharing.cpp b/tests/src/runtimeApi/cooperativeGrps/grid_group_data_sharing.cpp deleted file mode 100644 index 46ad7ea7a4..0000000000 --- a/tests/src/runtimeApi/cooperativeGrps/grid_group_data_sharing.cpp +++ /dev/null @@ -1,303 +0,0 @@ -/* -Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR -IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ -// Test Description: -/*The general idea of the application is to create a buffer of width N. N is a -command line parameter, and the user will need to make sure that we can fit -two buffers of N unsigned integers onto the target GPU at the same time. - -We then launch a fixed number of warps to the GPU. This number is calculated -to fill the GPU with as many warps as can simultaneously run on the GPU. -The threads in these warps then walk over two arrays. First, values from -A[offset] are added into B[offset]. After all of A is added into all of B -in this element-wise manner, all of the waves barrier with one another. - -After the barrier, the waves start adding values from B[mirror_offset] into -A[offset]. Mirror offset means that the wave that is writing into A[7] is -reading from B[7 before the last value]. This was probably written by a -different thread before the barrier. - -After going through this loop a certain number of times, the kernel ends and -we read the arrays back out and recalculate this algorithm serially on the -CPU. We compare the serial version to the version that has inter-thread data -sharing and barriers and ensure they result in the same answer. - -If they do have the same answer, then we can pretty confidently say that -writing from thread X and then hitting a barrier allows thread Y to see the -values.*/ - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp - * TEST: %t - * HIT_END - */ -#include -#include -#include "test_common.h" - -static inline void hipCheckAndFail(hipError_t errval, - const char *file, int line) { - hipError_t last_err = hipGetLastError(); - if (errval != hipSuccess) { - std::cerr << "hip error: " << hipGetErrorString(errval); - std::cerr << std::endl; - std::cerr << " Location: " << file << ":" << line << std::endl; - exit(errval); - } - if (last_err != errval) { - std::cerr << "Error: the return value of a function was not the same "; - std::cerr << "as the value returned by hipGetLastError()" << std::endl; - std::cerr << " Location: " << file << ":" << line << std::endl; - std::cerr << " Function returned: " << hipGetErrorString(errval); - std::cerr << " (" << errval << ")" << std::endl; - std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err); - std::cerr << " (" << last_err << ")" << std::endl; - failed(""); - } -} -#define hipCheckErr(errval)\ - do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0) - -static int cooperative_groups_support(int device_id) { - hipError_t err; - - int cooperative_attribute; - HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute, - hipDeviceAttributeCooperativeLaunch, device_id)); - if (!cooperative_attribute) { - std::cerr << "Cooperative launch support not available in "; - std::cerr << "the device attribute for device " << device_id; - std::cerr << std::endl; - return 0; - } - - hipDeviceProp_t device_properties; - HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); - if (device_properties.cooperativeLaunch == 0) { - std::cerr << "Cooperative group support not available in "; - std::cerr << "device properties." << std::endl; - return 0; - } - return 1; -} - -static int verify_coop_arrays(unsigned int loops, unsigned int *host_input, - unsigned int *first_array, - unsigned int *second_array, - unsigned int array_len) { - unsigned int *host_first_array = host_input; - unsigned int *host_second_array = (unsigned int*)calloc(array_len, - sizeof(int)); - - for (int i = 0; i < loops; i++) { - for (int offset = 0; offset < array_len; offset++) { - host_second_array[offset] += host_first_array[offset]; - } - - for (int offset = 0; offset < array_len; offset++) { - unsigned int swizzle_offset = array_len - offset - 1; - host_first_array[offset] += host_second_array[swizzle_offset]; - } - } - - for (int i = 0; i < array_len; i++) { - if (host_first_array[i] != first_array[i]) { - std::cerr << "Test failure!" << std::endl; - std::cerr << " host_first_array[" << i << "] contains the "; - std::cerr << "value " << host_first_array[i] << std::endl; - std::cerr << " GPU first_array[" << i << "] contains the "; - std::cerr << "value " << first_array[i] << std::endl; - return -1; - } - if (host_second_array[i] != second_array[i]) { - std::cerr << "Test failure!" << std::endl; - std::cerr << " host_second_array[" << i << "] contains the "; - std::cerr << "value " << host_second_array[i] << std::endl; - std::cerr << " GPU second_array[" << i << "] contains the "; - std::cerr << "value " << second_array[i] << std::endl; - return -1; - } - } - - std::cout << "Coop test appears to work properly!" << std::endl; - free(host_second_array); - return 0; -} - -__global__ void -coop_kernel(unsigned int *first_array, unsigned int *second_array, - unsigned int loops, unsigned int array_len) { - cooperative_groups::grid_group grid = cooperative_groups::this_grid(); - unsigned int rank = grid.thread_rank(); - unsigned int grid_size = grid.size(); - - for (int i = 0; i < loops; i++) { - // The goal of this loop is to directly add in values from - // array one into array two, on a per-wave basis. - for (int offset = rank; offset < array_len; offset += grid_size) { - second_array[offset] += first_array[offset]; - } - - grid.sync(); - - // The goal of this loop is to pull data the "mirror" lane in - // array two and add it back into array one. This causes inter- - // thread swizzling. - for (int offset = rank; offset < array_len; offset += grid_size) { - unsigned int swizzle_offset = array_len - offset - 1; - first_array[offset] += second_array[swizzle_offset]; - } - - grid.sync(); - } -} - -int main(int argc, char** argv) { - hipError_t err; - /*************************************************************************/ - /* Parse the command line parameters *************************************/ - // Arguments to pull out of the command line. - int device_num = 0, loops = 2, width = 4096, flag = 0; - HIPCHECK(hipGetDeviceCount(&device_num)); - for (int dev = 0; dev < device_num; ++dev) { - std::cout << "Device number: " << dev << std::endl; - std::cout << "Loops: " << loops << std::endl; - std::cout << "Width: " << width << std::endl; - - /*************************************************************************/ - /* Test whether target device supports cooperative groups ****************/ - HIPCHECK(hipSetDevice(dev)); - - if (!cooperative_groups_support(dev)) { - std::cout << "Skipping the test with Pass result.\n"; - passed(); - } - - /*************************************************************************/ - /* We will launch enough waves to fill up all of the GPU *****************/ - hipDeviceProp_t device_properties; - HIPCHECK(hipGetDeviceProperties(&device_properties, dev)); - - int warp_size = device_properties.warpSize; - int num_sms = device_properties.multiProcessorCount; - - std::cout << "Device name: " << device_properties.name << std::endl; - std::cout << std::endl; - - // Calculate the device occupancy to know how many blocks can be run. - int max_blocks_per_sm; - HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, - coop_kernel, - warp_size, 0)); - - int total_blocks = max_blocks_per_sm * num_sms; - - /*************************************************************************/ - /* Create the streams we will use in this test. **************************/ - hipStream_t streams[2]; - for (int i = 0; i < 2; i++) { - HIPCHECK(hipStreamCreate(&streams[i])); - } - - /*************************************************************************/ - /* Set up data to pass into the kernel ***********************************/ - - // Alocate the host input buffer, and two device-focused buffers that we - // will use for our test. - unsigned int *input_buffer = (unsigned int*)calloc(width, - sizeof(unsigned int)); - for (int i = 0; i < width; i++) { - input_buffer[i] = i; - } - - unsigned int *first_dev_array; - HIPCHECK(hipMalloc(reinterpret_cast(&first_dev_array), - width * sizeof(unsigned int))); - - HIPCHECK(hipMemcpyAsync(first_dev_array, input_buffer, - width * sizeof(unsigned int), - hipMemcpyHostToDevice, streams[0])); - - unsigned int *second_dev_array; - HIPCHECK(hipMalloc(reinterpret_cast(&second_dev_array), - width * sizeof(unsigned int))); - HIPCHECK(hipMemsetAsync(second_dev_array, 0, width * sizeof(unsigned int), - streams[0])); - - /*************************************************************************/ - /* Launch the kernels ****************************************************/ - std::cout << "Launching a cooperative kernel with " << total_blocks; - std::cout << " thread blocks, each with " << warp_size << " threads"; - std::cout << std::endl; - - void *coop_params[4]; - coop_params[0] = reinterpret_cast(&first_dev_array); - coop_params[1] = reinterpret_cast(&second_dev_array); - coop_params[2] = reinterpret_cast(&loops); - coop_params[3] = reinterpret_cast(&width); - HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast(coop_kernel), - total_blocks, warp_size, coop_params, - 0, streams[0])); - - /*************************************************************************/ - /* Read back the buffers and print out their data ************************/ - unsigned int *first_array = (unsigned int*)calloc(width, - sizeof(unsigned int)); - unsigned int *second_array = (unsigned int*)calloc(width, - sizeof(unsigned int)); - HIPCHECK(hipMemcpyAsync(first_array, first_dev_array, - width * sizeof(unsigned int), - hipMemcpyDeviceToHost, streams[0])); - - HIPCHECK(hipMemcpyAsync(second_array, second_dev_array, - width * sizeof(unsigned int), - hipMemcpyDeviceToHost, streams[0])); - - std::cout << "Waiting for cooperative work to finish..." << std::endl; - std::cout << std::flush; - - HIPCHECK(hipStreamSynchronize(streams[0])); - - - int ret_val = 0; - - std::cout << "Attemping to verify buffers." << std::endl; - std::cout << std::flush; - ret_val = verify_coop_arrays(loops, input_buffer, first_array, - second_array, width); - if (!ret_val) { - std::cout << "It appears that inter-thread data sharing at "; - std::cout << "grid_group sync points works properly!" << std::endl; - } else { - flag = 1; - } - for (int k = 0; k < 2; ++k) { - HIPCHECK(hipStreamDestroy(streams[k])); - } - HIPCHECK(hipFree(first_dev_array)); - HIPCHECK(hipFree(second_dev_array)); - free(input_buffer); - free(first_array); - free(second_array); - } - if (!flag) { - passed(); - } else { - failed(""); - } -} diff --git a/tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp b/tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp deleted file mode 100644 index b75725fed4..0000000000 --- a/tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp +++ /dev/null @@ -1,568 +0,0 @@ -/* -Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR -IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ -// Test Description: -/*The general idea of the application is to test how Cooperative Groups kernel -launches work when launching too many warps to multiple target devices. This -tests the following failure modes for hipLaunchCooperativeKernelMultiDevice: - 1) Do not launch more warps to any device than can fit on that device - 2) All device targets for the multi-device launch function must be different - 3) All streams must be explicit (non-NULL) - 4) The kernels sent in must be identical between devices - 5) The grid and block sizes must be identical between devices - 6) The block dimensions must be non-zero - 7) The dynamic shared memory size must be identical between devices. - -This test ensures that the proper error conditions are returned, even if the -target kernel does not actually use any fo the cooperative groups features. - -Note that tests 4, 5, and 7 only hold on Nvidia GPUs. AMD GPUs running ROCm -do not have these constraints. As such, the test checks to see whether they -should fail or succeed and compares this to what actually happens. -*/ - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp - * TEST: %t - * HIT_END - */ - - -#include -#include -#include "test_common.h" - -static inline void hipCheckAndFail(hipError_t errval, - const char *file, int line) { - hipError_t last_err = hipGetLastError(); - if (errval != hipSuccess) { - std::cerr << "hip error: " << hipGetErrorString(errval); - std::cerr << std::endl; - std::cerr << " Location: " << file << ":" << line << std::endl; - failed(""); - } - if (last_err != errval) { - std::cerr << "Error: the return value of a function was not the same "; - std::cerr << "as the value returned by hipGetLastError()" << std::endl; - std::cerr << " Location: " << file << ":" << line << std::endl; - std::cerr << " Function returned: " << hipGetErrorString(errval); - std::cerr << " (" << errval << ")" << std::endl; - std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err); - std::cerr << " (" << last_err << ")" << std::endl; - failed(""); - } -} -#define hipCheckErr(errval) \ - do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0) - -static int cooperative_groups_support(int device_id) { - hipError_t err; - - int cooperative_attribute; - HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute, - hipDeviceAttributeCooperativeLaunch, device_id)); - if (!cooperative_attribute) { - std::cerr << "Cooperative launch support not available in "; - std::cerr << "the device attribute for device " << device_id; - std::cerr << std::endl; - return 0; - } - - int multi_gpu_cooperative_attribute; - HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute, - hipDeviceAttributeCooperativeMultiDeviceLaunch, device_id)); - - if (!multi_gpu_cooperative_attribute) { - std::cerr << "Multi-GPU cooperative launch support not available in "; - std::cerr << "the device attribute for device " << device_id; - std::cerr << std::endl; - return 0; - } - - hipDeviceProp_t device_properties; - HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); - if (device_properties.cooperativeLaunch == 0) { - std::cerr << "Cooperative group support not available in "; - std::cerr << "device properties." << std::endl; - return 0; - } - if (device_properties.cooperativeMultiDeviceLaunch == 0) { - std::cerr << "Multi-GPU cooperative group support not available in "; - std::cerr << "device properties." << std::endl; - return 0; - } - return 1; -} - -static int support_for_separate_kernels(int device_id) { - hipError_t err; - - int separate_kernel_supported; - HIPCHECK(hipDeviceGetAttribute(&separate_kernel_supported, - hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc, - device_id)); - if (!separate_kernel_supported) { - return 0; - } - - hipDeviceProp_t device_properties; - HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); - if (device_properties.cooperativeMultiDeviceUnmatchedFunc == 0) { - return 0; - } - return 1; -} - -static int support_for_separate_grid_sizes(int device_id) { - hipError_t err; - int separate_sizes_supported; - HIPCHECK(hipDeviceGetAttribute(&separate_sizes_supported, - hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim, - device_id)); - if (!separate_sizes_supported) { - return 0; - } - - hipDeviceProp_t device_properties; - HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); - if (device_properties.cooperativeMultiDeviceUnmatchedGridDim == 0) { - return 0; - } - return 1; -} - -static int support_for_separate_block_dims(int device_id) { - hipError_t err; - int separate_sizes_supported; - HIPCHECK(hipDeviceGetAttribute(&separate_sizes_supported, - hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim, - device_id)); - if (!separate_sizes_supported) { - return 0; - } - - hipDeviceProp_t device_properties; - HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); - if (device_properties.cooperativeMultiDeviceUnmatchedBlockDim == 0) { - return 0; - } - return 1; -} - -static int support_for_separate_shared_sizes(int device_id) { - hipError_t err; - int separate_sizes_supported; - HIPCHECK(hipDeviceGetAttribute(&separate_sizes_supported, - hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem, - device_id)); - if (!separate_sizes_supported) { - return 0; - } - - hipDeviceProp_t device_properties; - HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); - if (device_properties.cooperativeMultiDeviceUnmatchedSharedMem == 0) { - return 0; - } - return 1; -} - -__global__ void test_kernel(long long *array) { - unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; - array[rank] += clock64(); -} - -__global__ void second_test_kernel(long long *array) { - unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; - array[rank] += clock64(); -} - -int main(int argc, char** argv) { - hipError_t err; - /*************************************************************************/ - /* Parse the command line parameters *************************************/ - // Arguments to pull out of the command line. - int device_num, FailFlag = 0; - HIPCHECK(hipGetDeviceCount(&device_num)); - if (device_num < 2) { - std::cout << "This test requires atleast two gpus but the system has "; - std::cout << " only "<< device_num <(&good_dev_array[i]), - good_size)); - HIPCHECK(hipMemsetAsync(good_dev_array[i], 0, good_size, streams[i])); - HIPCHECK(hipMalloc(reinterpret_cast(&bad_dev_array[i]), - bad_size)); - HIPCHECK(hipMemsetAsync(bad_dev_array[i], 0, bad_size, streams[i])); - } - HIPCHECK(hipDeviceSynchronize()); - - /*************************************************************************/ - /* Launch the kernels ****************************************************/ - std::cout << "Launching a multi-GPU cooperative kernel with too many "; - std::cout << "warps..." << std::endl; - - void *dev_params[2][1]; - hipLaunchParams md_params[2]; - for (int i = 0; i < 2; i++) { - dev_params[i][0] = reinterpret_cast(&bad_dev_array[i]); - - md_params[i].func = reinterpret_cast(test_kernel); - md_params[i].gridDim = 2 * desired_blocks; - md_params[i].blockDim = warp_size; - md_params[i].sharedMem = 0; - md_params[i].stream = streams[i]; - md_params[i].args = dev_params[i]; - } - - err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0); - if (err != hipErrorCooperativeLaunchTooLarge) { - std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; - std::cerr << "with too many warps." << std::endl; - std::cerr << "This SHOULD have failed with the error "; - std::cerr << "hipErrorCooperativeLaunchTooLarge ("; - std::cerr << hipErrorCooperativeLaunchTooLarge << ")." << std::endl; - std::cerr << "Instead, the launch returned " << hipGetErrorName(err); - std::cerr << " (" << err << ")" << std::endl; - FailFlag = 1; - } else { - std::cout << "\tProperly saw this return "; - std::cout << "hipErrorCooperativeLaunchTooLarge" << std::endl; - } - HIPCHECK(hipDeviceSynchronize()); - - std::cout << "Launching a multi-GPU cooperative kernel to the same "; - std::cout << "device twice..." << std::endl; - for (int i = 0; i < 2; i++) { - dev_params[i][0] = reinterpret_cast(&good_dev_array[i]); - md_params[i].gridDim = desired_blocks; - md_params[i].stream = streams[0]; - } - err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0); - if (err != hipErrorInvalidDevice) { - std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; - std::cerr << "to the same device twice." << std::endl; - std::cerr << "This SHOULD have failed with the error "; - std::cerr << "hipErrorInvalidDevice ("; - std::cerr << hipErrorInvalidDevice << ")." << std::endl; - std::cerr << "Instead, the launch returned " << hipGetErrorName(err); - std::cerr << " (" << err << ")" << std::endl; - FailFlag = 1; - } else { - std::cout << "\tProperly saw this return "; - std::cout << "hipErrorInvalidDevice" << std::endl; - } - HIPCHECK(hipDeviceSynchronize()); - - std::cout << "Launching a multi-GPU cooperative kernel to the NULL "; - std::cout << "stream" << std::endl; - for (int i = 0; i < 2; i++) { - md_params[i].stream = NULL; - } - err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0); - if (err != hipErrorInvalidResourceHandle) { - std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; - std::cerr << "to the NULL stream." << std::endl; - std::cerr << "This SHOULD have failed with the error "; - std::cerr << "hipErrorInvalidResourceHandle ("; - std::cerr << hipErrorInvalidResourceHandle << ")." << std::endl; - std::cerr << "Instead, the launch returned " << hipGetErrorName(err); - std::cerr << " (" << err << ")" << std::endl; - FailFlag = 1; - } else { - std::cout << "\tProperly saw this return "; - std::cout << "hipErrorInvalidResourceHandle" << std::endl; - } - HIPCHECK(hipDeviceSynchronize()); - - std::cout << "Launching a multi-GPU cooperative kernel with two "; - std::cout << "different kernels." << std::endl; - bool supports_sep_kernels = true; - for (int i = 0; i < 2; i++) { - md_params[i].stream = streams[i]; - if (!support_for_separate_kernels((dev + i))) { - supports_sep_kernels = false; - } - } - md_params[1].func = reinterpret_cast(second_test_kernel); - err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0); - if ((supports_sep_kernels && err != hipSuccess) || - (!supports_sep_kernels && err != hipErrorInvalidValue)) { - if (supports_sep_kernels) { - std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; - std::cerr << "with two different kernels." << std::endl; - std::cerr << "This SHOULD have succeeded with hipSuccess ("; - std::cerr << hipSuccess << ")." << std::endl; - std::cerr << "Instead, the launch returned " << hipGetErrorName(err); - std::cerr << " (" << err << ")" << std::endl; - } else { - std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; - std::cerr << "with two different kernels." << std::endl; - std::cerr << "This SHOULD have failed with the error "; - std::cerr << "hipErrorInvalidValue ("; - std::cerr << hipErrorInvalidValue << ")." << std::endl; - std::cerr << "Instead, the launch returned " << hipGetErrorName(err); - std::cerr << " (" << err << ")" << std::endl; - } - FailFlag = 1; - } else { - std::cout << "\tProperly saw this return "; - if (supports_sep_kernels) { - std::cout << "hipSuccess" << std::endl; - } else { - std::cout << "hipErrorInvalidValue" << std::endl; - } - } - HIPCHECK(hipDeviceSynchronize()); - - std::cout << "Launching a multi-GPU cooperative kernel with two "; - std::cout << "different grid sizes." << std::endl; - bool supports_sep_sizes = true; - for (int i = 0; i < 2; i++) { - md_params[i].func = reinterpret_cast(test_kernel); - md_params[i].gridDim = i+1; - if (!support_for_separate_grid_sizes((dev + i))) { - supports_sep_sizes = false; - } - } - err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0); - if ((supports_sep_sizes && err != hipSuccess) || - (!supports_sep_sizes && err == hipErrorInvalidValue)) { - if (supports_sep_sizes) { - std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; - std::cerr << "with two different grid sizes." << std::endl; - std::cerr << "This SHOULD have succeeded with hipSuccess ("; - std::cerr << hipSuccess << ")." << std::endl; - std::cerr << "Instead, the launch returned " << hipGetErrorName(err); - std::cerr << " (" << err << ")" << std::endl; - } else { - std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; - std::cerr << "with two different grid sizes." << std::endl; - std::cerr << "This SHOULD have failed with the error "; - std::cerr << "hipErrorInvalidValue ("; - std::cerr << hipErrorInvalidValue << ")." << std::endl; - std::cerr << "Instead, the launch returned " << hipGetErrorName(err); - std::cerr << " (" << err << ")" << std::endl; - FailFlag = 1; - } - } else { - std::cout << "\tProperly saw this return "; - if (supports_sep_kernels) { - std::cout << "hipSuccess" << std::endl; - } else { - std::cout << "hipErrorInvalidValue" << std::endl; - } - } - HIPCHECK(hipDeviceSynchronize()); - - std::cout << "Launching a multi-GPU cooperative kernel with two "; - std::cout << "different block dimensions." << std::endl; - supports_sep_sizes = true; - for (int i = 0; i < 2; i++) { - md_params[i].gridDim = desired_blocks; - md_params[i].blockDim = i+1; - if (!support_for_separate_block_dims((dev + i))) { - supports_sep_sizes = false; - } - } - err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0); - if ((supports_sep_sizes && err != hipSuccess) || - (!supports_sep_sizes && err == hipErrorInvalidValue)) { - if (supports_sep_sizes) { - std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; - std::cerr << "with two different block dimensions." << std::endl; - std::cerr << "This SHOULD have succeeded with hipSuccess ("; - std::cerr << hipSuccess << ")." << std::endl; - std::cerr << "Instead, the launch returned " << hipGetErrorName(err); - std::cerr << " (" << err << ")" << std::endl; - } else { - std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; - std::cerr << "with two different block dimensions." << std::endl; - std::cerr << "This SHOULD have failed with the error "; - std::cerr << "hipErrorInvalidValue ("; - std::cerr << hipErrorInvalidValue << ")." << std::endl; - std::cerr << "Instead, the launch returned " << hipGetErrorName(err); - std::cerr << " (" << err << ")" << std::endl; - FailFlag = 1; - } - } else { - std::cout << "\tProperly saw this return "; - if (supports_sep_kernels) { - std::cout << "hipSuccess" << std::endl; - } else { - std::cout << "hipErrorInvalidValue" << std::endl; - } - } - HIPCHECK(hipDeviceSynchronize()); - - std::cout << "Launching a multi-GPU cooperative kernel with block "; - std::cout << "dimensions of zero." << std::endl; - for (int i = 0; i < 2; i++) { - md_params[i].blockDim = 0; - } - err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0); - if (err != hipErrorInvalidConfiguration) { - std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; - std::cerr << "with block dimensions of zero." << std::endl; - std::cerr << "This SHOULD have failed with the error "; - std::cerr << "hipErrorInvalidConfiguration ("; - std::cerr << hipErrorInvalidConfiguration << ")." << std::endl; - std::cerr << "Instead, the launch returned " << hipGetErrorName(err); - std::cerr << " (" << err << ")" << std::endl; - FailFlag = 1; - } else { - std::cout << "\tProperly saw this return "; - std::cout << "hipErrorInvalidConfiguration" << std::endl; - } - HIPCHECK(hipDeviceSynchronize()); - - std::cout << "Launching a multi-GPU cooperative kernel with two "; - std::cout << "different shared memory sizes." << std::endl; - supports_sep_sizes = true; - for (int i = 0; i < 2; i++) { - md_params[i].blockDim = warp_size; - md_params[i].sharedMem = i; - if (!support_for_separate_shared_sizes((dev + i))) { - supports_sep_sizes = false; - } - } - err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0); - if ((supports_sep_sizes && err != hipSuccess) || - (!supports_sep_sizes && err == hipErrorInvalidValue)) { - if (supports_sep_sizes) { - std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; - std::cerr << "with two different shared memory sizes." << std::endl; - std::cerr << "This SHOULD have succeeded with hipSuccess ("; - std::cerr << hipSuccess << ")." << std::endl; - std::cerr << "Instead, the launch returned " << hipGetErrorName(err); - std::cerr << " (" << err << ")" << std::endl; - } else { - std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel "; - std::cerr << "with two different shared memory sizes." << std::endl; - std::cerr << "This SHOULD have failed with the error "; - std::cerr << "hipErrorInvalidValue ("; - std::cerr << hipErrorInvalidValue << ")." << std::endl; - std::cerr << "Instead, the launch returned " << hipGetErrorName(err); - std::cerr << " (" << err << ")" << std::endl; - FailFlag = 1; - } - } else { - std::cout << "\tProperly saw this return "; - if (supports_sep_kernels) { - std::cout << "hipSuccess" << std::endl; - } else { - std::cout << "hipErrorInvalidValue" << std::endl; - } - } - HIPCHECK(hipDeviceSynchronize()); - - std::cout << "Launching a multi-GPU cooperative kernel with maximum "; - std::cout << "number of warps..." << std::endl; - for (int i = 0; i < 2; i++) { - md_params[i].sharedMem = 0; - } - HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0)); - std::cout << "\tProperly launched." << std::endl; - - HIPCHECK(hipDeviceSynchronize()); - for (int m = 0; m < 2; ++m) { - HIPCHECK(hipFree(good_dev_array[m])); - HIPCHECK(hipFree(bad_dev_array[m])); - HIPCHECK(hipStreamDestroy(streams[m])); - } - if (FailFlag == 1) { - break; - } - } - if (FailFlag == 1) { - failed(""); - } else { - passed(); - } -} diff --git a/tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp b/tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp deleted file mode 100644 index a0275d7ba5..0000000000 --- a/tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp +++ /dev/null @@ -1,581 +0,0 @@ -/* -Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR -IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ -// Test Description: -/*The general idea of the application is to test how multi-GPU Cooperative -Groups kernel launches to a stream interact with other things that may be -simultaneously running in the same streams. - -The HIP specification says that a multi-GPU cooperative launch will wait -until all of the streams it's using finish their work. Only then will the -cooperative kernel be launched to all of the devices. Then no other work -can take part in the any of the streams until all of the multi-GPU -cooperative work is done. - -However, there are flags that allow you to disable each of these -serialization points: hipCooperativeLaunchMultiDeviceNoPreSync and -hipCooperativeLaunchMultiDeviceNoPostSync. - -As such, this benchmark tests the following five situations launching -to two GPUs (and thus two streams): - - 1. Normal multi-GPU cooperative kernel: - This should result in the following pattern: - Stream 0: Cooperative - Stream 1: Cooperative - 2. Regular kernel launches and multi-GPU cooperative kernel launches - with the default flags, resulting in the following pattern: - Stream 0: Regular --> Cooperative - Stream 1: --> Cooperative --> Regular - - 3. Regular kernel launches and multi-GPU cooperative kernel launches - that turn off "pre-sync". This should allow a cooperative kernel - to launch even if work is already in a stream pointing to - another GPU. - This should result in the following pattern: - Stream 0: Regular --> Cooperative - Stream 1: Cooperative --> Regular - - 4. Regular kernel launches and multi-GPU cooperative kernel launches - that turn off "post-sync". This should allow a new kernel to enter - a GPU even if another GPU still has a cooperative kernel on it. - This should result in the following pattern: - Stream 0: Regular --> Cooperative - Stream 1: --> Cooperative--> Regular - - 5. Regular kernel launches and multi-GPU cooperative kernel launches - that turn off both pre- and post-sync. This should allow any of - the kernels to launch to their GPU regardless of the status of - other kernels in other multi-GPU stream groups. - This should result in the following pattern: - Stream 0: Regular --> Cooperative - Stream 1: Cooperative --> Regular - -We time how long it takes to run each of these benchmarks and print it as -the output of the benchmark. The kernels themselves are just useless time- -wasting code so that the kernel takes a meaningful amount of time on the -GPU before it exits. We only launch a single wavefront for each kernel, so -any serialization should not be because of GPU occupancy concerns. - -If tests 2, 3, and 4 take roughly 3x as long as #1, that implies that -cooperative kernels are serialized as expected. - -If test #5 takes roughly twice as long as #1, that implies that the -overlap-allowing flags work as expected. -*/ - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 - * TEST: %t - * HIT_END - */ - -#include -#include -#include -#include "test_common.h" - -static inline void hipCheckAndFail(hipError_t errval, - const char *file, int line) { - hipError_t last_err = hipGetLastError(); - if (errval != hipSuccess) { - std::cerr << "hip error: " << hipGetErrorString(errval); - std::cerr << std::endl; - std::cerr << " Location: " << file << ":" << line << std::endl; - failed(""); - } - if (last_err != errval) { - std::cerr << "Error: the return value of a function was not the same "; - std::cerr << "as the value returned by hipGetLastError()" << std::endl; - std::cerr << " Location: " << file << ":" << line << std::endl; - std::cerr << " Function returned: " << hipGetErrorString(errval); - std::cerr << " (" << errval << ")" << std::endl; - std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err); - std::cerr << " (" << last_err << ")" << std::endl; - failed(""); - } -} -#define hipCheckErr(errval) \ - do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0) - -static int cooperative_groups_support(int device_id) { - hipError_t err; - int cooperative_attribute; - HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute, - hipDeviceAttributeCooperativeLaunch, device_id)); - if (!cooperative_attribute) { - std::cerr << "Cooperative launch support not available in "; - std::cerr << "the device attribute for device " << device_id; - std::cerr << std::endl; - return 0; - } - - int multi_gpu_cooperative_attribute; - HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute, - hipDeviceAttributeCooperativeMultiDeviceLaunch, device_id)); - if (!multi_gpu_cooperative_attribute) { - std::cerr << "Multi-GPU cooperative launch support not available in "; - std::cerr << "the device attribute for device " << device_id; - std::cerr << std::endl; - return 0; - } - - hipDeviceProp_t device_properties; - HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); - if (device_properties.cooperativeLaunch == 0) { - std::cerr << "Cooperative group support not available in "; - std::cerr << "device properties." << std::endl; - return 0; - } - if (device_properties.cooperativeMultiDeviceLaunch == 0) { - std::cerr << "Multi-GPU cooperative group support not available in "; - std::cerr << "device properties." << std::endl; - return 0; - } - return 1; -} - -__global__ void test_coop_kernel(unsigned int loops, long long *array, - int fast_gpu) { - cooperative_groups::multi_grid_group mgrid = - cooperative_groups::this_multi_grid(); - unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; - - if (mgrid.grid_rank() == fast_gpu) { - return; - } - - for (int i = 0; i < loops; i++) { - long long start_clock = clock64(); - while (clock64() < (start_clock+1000000)) {} - array[rank] += clock64(); - } -} - -__global__ void test_kernel(uint32_t loops, unsigned long long *array) { - unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; - - for (int i = 0; i < loops; i++) { - long long start_clock = clock64(); - while (clock64() < (start_clock+1000000)) {} - array[rank] += clock64(); - } -} - -int main(int argc, char** argv) { - hipError_t err; - int device_num, FailFlag = 0; - uint32_t loops = 2000; - uint32_t fast_loops = 1; - int32_t fast_gpu = -1; - HIPCHECK(hipGetDeviceCount(&device_num)); - if (device_num < 2) { - std::cout << "This test requires atleast two gpus but the system has "; - std::cout << " only "<< device_num < max_blocks_per_sm * num_sm) { - std::cerr << "The requested number of blocks will not fit on the GPU"; - std::cerr << std::endl; - std::cerr << "You requested " << desired_blocks << " but we can only "; - std::cerr << "fit " << (max_blocks_per_sm * num_sm) << std::endl; - failed(""); - } - - /*************************************************************************/ - /* Create the streams we will use in this test. **************************/ - hipStream_t streams[2]; - for (int i = 0; i < 2; i++) { - HIPCHECK(hipSetDevice(dev + i)); - HIPCHECK(hipStreamCreate(&streams[i])); - } - - /*************************************************************************/ - /* Set up data to pass into the kernelx **********************************/ - - // Alocate the host input buffer, and two device-focused buffers that we - // will use for our test. - unsigned long long *dev_array[2]; - for (int i = 0; i < 2; i++) { - int good_size = desired_blocks * warp_size * sizeof(long long); - HIPCHECK(hipSetDevice(dev + i)); - HIPCHECK(hipMalloc(reinterpret_cast(&dev_array[i]), good_size)); - HIPCHECK(hipMemsetAsync(dev_array[i], 0, good_size, streams[i])); - } - for (int i = 0; i < 2; i++) { - HIPCHECK(hipSetDevice(dev + i)); - HIPCHECK(hipDeviceSynchronize()); - } - - /*************************************************************************/ - /* Launch the kernels ****************************************************/ - void *dev_params[2][3]; - hipLaunchParams md_params[2]; - std::chrono::time_point start_time[6]; - std::chrono::time_point end_time[6]; - - std::cout << "Test 0: Launching a multi-GPU cooperative kernel...\n"; - std::cout << "This should result in the following pattern:" << std::endl; - std::cout << "GPU " << dev << ": Long Coop Kernel" << std::endl; - std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel" << std::endl; - - for (int i = 0; i < 2; i++) { - dev_params[i][0] = reinterpret_cast(&loops); - dev_params[i][1] = reinterpret_cast(&dev_array[i]); - dev_params[i][2] = reinterpret_cast(&fast_gpu); - md_params[i].func = reinterpret_cast(test_coop_kernel); - md_params[i].gridDim = desired_blocks; - md_params[i].blockDim = warp_size; - md_params[i].sharedMem = 0; - md_params[i].stream = streams[i]; - md_params[i].args = dev_params[i]; - } - - start_time[0] = std::chrono::system_clock::now(); - HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0)); - for (int i = 0; i < 2; i++) { - HIPCHECK(hipSetDevice(dev + i)); - HIPCHECK(hipDeviceSynchronize()); - } - end_time[0] = std::chrono::system_clock::now(); - - std::cout << std::endl; - std::cout << "Test 1: Launching a multi-GPU cooperative kernel with the "; - std::cout << "following pattern:" << std::endl; - std::cout << "GPU " << dev << ": Standard Kernel --> Long Coop Kernel\n"; - std::cout << "GPU " << (dev + 1) << ": --> Coop "; - std::cout << "--> Standard Kernel\n"; - fast_gpu = 1; - start_time[1] = std::chrono::system_clock::now(); - HIPCHECK(hipSetDevice(dev)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, - streams[0], loops, dev_array[0]); - HIPCHECK(hipGetLastError()); - HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0)); - HIPCHECK(hipSetDevice(dev + 1)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, - streams[1], loops, dev_array[1]); - HIPCHECK(hipGetLastError()); - for (int i = 0; i < 2; i++) { - HIPCHECK(hipSetDevice(dev + i)); - HIPCHECK(hipDeviceSynchronize()); - } - end_time[1] = std::chrono::system_clock::now(); - fast_gpu = -1; - - std::cout << std::endl; - std::cout << "Test 2: Launching a multi-GPU cooperative kernel with the "; - std::cout << "following pattern:" << std::endl; - std::cout << "GPU " << dev << ": Standard Kernel --> Coop" << std::endl; - std::cout << "GPU " << (dev + 1) << ": --> Long Coop"; - std::cout << " Kernel --> "; - std::cout << "Standard Kernel\n"; - fast_gpu = 0; - start_time[2] = std::chrono::system_clock::now(); - HIPCHECK(hipSetDevice(dev)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, - streams[0], loops, dev_array[0]); - HIPCHECK(hipGetLastError()); - HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0)); - HIPCHECK(hipSetDevice(dev + 1)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, - streams[1], loops, dev_array[1]); - HIPCHECK(hipGetLastError()); - for (int i = 0; i < 2; i++) { - HIPCHECK(hipSetDevice(dev + i)); - HIPCHECK(hipDeviceSynchronize()); - } - end_time[2] = std::chrono::system_clock::now(); - fast_gpu = -1; - - std::cout << std::endl; - std::cout << "Test 3: Launching a multi-GPU cooperative kernel with the "; - std::cout << "ability to overlap regular and cooperative kernels "; - std::cout << "only at the beginning." << std::endl; - std::cout << "This should result in the following pattern:" << std::endl; - std::cout << "GPU " << dev << ": Standard Kernel --> Coop" << std::endl; - std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel --> Standard"; - std::cout<< " Kernel\n"; - fast_gpu = 0; - start_time[3] = std::chrono::system_clock::now(); - HIPCHECK(hipSetDevice(dev)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, - streams[0], loops, dev_array[0]); - HIPCHECK(hipGetLastError()); - HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, - hipCooperativeLaunchMultiDeviceNoPreSync)); - HIPCHECK(hipSetDevice(dev + 1)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, - streams[1], loops, dev_array[1]); - HIPCHECK(hipGetLastError()); - for (int i = 0; i < 2; i++) { - HIPCHECK(hipSetDevice(dev + i)); - HIPCHECK(hipDeviceSynchronize()); - } - end_time[3] = std::chrono::system_clock::now(); - fast_gpu = -1; - - std::cout << std::endl; - std::cout << "Test 4: Launching a multi-GPU cooperative kernel with the "; - std::cout << "ability to overlap regular and cooperative kernels "; - std::cout << "only at the end." << std::endl; - std::cout << "This should result in the following pattern:" << std::endl; - std::cout << "GPU " << dev << ": Standard Kernel --> Long Coop Kernel\n"; - std::cout << "GPU " << (dev + 1) << ": --> Coop --> "; - std::cout << "Standard Kernel\n"; - fast_gpu = 1; - start_time[4] = std::chrono::system_clock::now(); - HIPCHECK(hipSetDevice(dev)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, - streams[0], loops, dev_array[0]); - HIPCHECK(hipGetLastError()); - HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, - hipCooperativeLaunchMultiDeviceNoPostSync)); - HIPCHECK(hipSetDevice(dev + 1)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, - streams[1], loops, dev_array[1]); - for (int i = 0; i < 2; i++) { - HIPCHECK(hipSetDevice(dev + i)); - HIPCHECK(hipDeviceSynchronize()); - } - end_time[4] = std::chrono::system_clock::now(); - fast_gpu = -1; - - std::cout << std::endl; - std::cout << "Test 5: Launching a multi-GPU cooperative kernel with the "; - std::cout << "ability to overlap regular and cooperative kernels"; - std::cout << std::endl; - std::cout << "This should result in the following pattern:" << std::endl; - std::cout << "GPU " << dev << ": Standard Kernel --> Long Coop Kernel\n"; - std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel --> Standard"; - std::cout << " Kernel\n"; - start_time[5] = std::chrono::system_clock::now(); - HIPCHECK(hipSetDevice(dev)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, - streams[0], loops, dev_array[0]); - HIPCHECK(hipGetLastError()); - HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, - hipCooperativeLaunchMultiDeviceNoPreSync | - hipCooperativeLaunchMultiDeviceNoPostSync)); - HIPCHECK(hipSetDevice(dev + 1)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, - streams[1], loops, dev_array[1]); - HIPCHECK(hipGetLastError()); - for (int i = 0; i < 2; i++) { - HIPCHECK(hipSetDevice(dev + i)); - HIPCHECK(hipDeviceSynchronize()); - } - end_time[5] = std::chrono::system_clock::now(); - - std::chrono::duration single_kernel_time = - (end_time[0] - start_time[0]); - std::chrono::duration serialized_gpu0_time = - (end_time[1] - start_time[1]); - std::chrono::duration serialized_gpu1_time = - (end_time[2] - start_time[2]); - std::chrono::duration pre_overlapped_time = - (end_time[3] - start_time[3]); - std::chrono::duration post_overlapped_time = - (end_time[4] - start_time[4]); - std::chrono::duration overlapped_time = - (end_time[5] - start_time[5]); - - std::cout << "Test 0: A single kernel on both GPUs took:" << std::endl; - std::cout << " " << single_kernel_time.count(); - std::cout << " seconds" << std::endl; - std::cout << std::endl; - std::cout << "Test 1: Serialized set of three kernels with GPU0"; - std::cout << " being long took:"; - std::cout << " " << serialized_gpu0_time.count(); - std::cout << " seconds" << std::endl; - std::cerr << "Expect between " << (2.7 * single_kernel_time.count()); - std::cerr << " and "; - std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n"; - std::cout << std::endl; - std::cout << "Test 2: Serialized set of three kernels with GPU1"; - std::cout << " being long took:" << std::endl; - std::cout << " " << serialized_gpu1_time.count(); - std::cout << " seconds" << std::endl; - std::cerr << "Expect between " << (2.7 * single_kernel_time.count()); - std::cerr << " and "; - std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n"; - std::cout << std::endl; - std::cout << "Test 3: Multiple kernels with pre-overlap allowed took:\n"; - std::cout << " " << pre_overlapped_time.count(); - std::cout << " seconds" << std::endl; - std::cerr << "Expect between " << (1.7 * single_kernel_time.count()); - std::cerr << " and "; - std::cerr << (2.3 * single_kernel_time.count()) << " seconds.\n"; - std::cout << std::endl; - std::cout << "Test 4: Multiple kernels with post-overlap allowed took:\n"; - std::cout << " " << post_overlapped_time.count(); - std::cout << " seconds" << std::endl; - std::cerr << "Expect between " << (1.7 * single_kernel_time.count()); - std::cerr << " and "; - std::cerr << (2.3 * single_kernel_time.count()) << " seconds."; - std::cout << std::endl; - std::cout << "Test 5: Multiple kernels with overlap allowed took:\n"; - std::cout << " " << overlapped_time.count(); - std::cout << " seconds" << std::endl; - std::cerr << "Expect between " << (1.8 * single_kernel_time.count()); - std::cerr << " and "; - std::cerr << (2.2 * single_kernel_time.count()) << " seconds.\n"; - - // Test that fully not-overlapped kernels take roughly 3x as long as one - // cooperative kernel. - if (serialized_gpu0_time > 3.3 * single_kernel_time || - serialized_gpu0_time < 2.7 * single_kernel_time) { - std::cerr << "ERROR!" << std::endl; - std::cerr << "Test 1, the first case where all kernels should be "; - std::cerr << "serialized, had a runtime that was very different "; - std::cerr << "than what was expected." << std::endl; - std::cerr << "Was " << serialized_gpu0_time.count() << " seconds.\n"; - std::cerr << "Expected between "; - std::cerr << (2.7 * single_kernel_time.count()) << " and "; - std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n"; - std::cerr << "Were they truly serialized?" << std::endl; - FailFlag = 1; - } - - // Test that fully not-overlapped kernels take roughly 3x as long as one - // cooperative kernel. - if (serialized_gpu1_time > 3.3 * single_kernel_time || - serialized_gpu1_time < 2.7 * single_kernel_time) { - std::cerr << "ERROR!" << std::endl; - std::cerr << "Test 2, the second case where all kernels should be "; - std::cerr << "serialized, had a runtime that was very different "; - std::cerr << "than what was expected." << std::endl; - std::cerr << "Was " << serialized_gpu1_time.count(); - std::cerr << " seconds." << std::endl; - std::cerr << "Expected between "; - std::cerr << (2.7 * single_kernel_time.count()) << " and "; - std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n"; - std::cerr << "Were they truly serialized?" << std::endl; - FailFlag = 1; - } - - // Test that kernels that can overlap only before the cooperative kernel - // launches kernels take roughly the same time (in this case) - if (pre_overlapped_time > 2.3 * single_kernel_time || - pre_overlapped_time < 1.7 * single_kernel_time) { - std::cerr << "ERROR!" << std::endl; - std::cerr << "Test 3, the case where the last kernel is serialized, had "; - std::cerr << "a runtime that was very different than what was "; - std::cerr << "expected." << std::endl; - std::cerr << "Was " << pre_overlapped_time.count() << " seconds.\n"; - std::cerr << "Expected between "; - std::cerr << (1.7 * single_kernel_time.count()) << " and "; - std::cerr << (2.3 * single_kernel_time.count()) << " seconds.\n"; - FailFlag = 1; - } - - // Test that kernels that can overlap only after the cooperative kernel - // launches kernels take roughly the same time (in this case) - if (post_overlapped_time > 2.3 * single_kernel_time || - post_overlapped_time < 1.7 * single_kernel_time) { - std::cerr << "ERROR!" << std::endl; - std::cerr << "Teste 4, the case where the first kernel is "; - std::cerr << "serialized, had a runtime that was very different "; - std::cerr << "than what was expected." << std::endl; - std::cerr << "Was " << post_overlapped_time.count() << " seconds.\n"; - std::cerr << "Expected between "; - std::cerr << (1.7 * single_kernel_time.count()) << " and "; - std::cerr << (2.3 * single_kernel_time.count()) << " seconds.\n"; - FailFlag = 1; - } - - // Test that, with the right flags on the kernel launch, that we prevent - // incomplete launches from serializing the cooperative launch streams. - if (overlapped_time > 2.2 * single_kernel_time || - overlapped_time < 1.8 * single_kernel_time) { - std::cerr << "ERROR!" << std::endl; - std::cerr << "Test 5, the case where normal and cooperative kernel "; - std::cerr << "launches should overlap, does not appear to have done so."; - std::cerr << std::endl; - std::cerr << "Was " << overlapped_time.count() << " seconds.\n"; - std::cerr << "Expected between "; - std::cerr << (1.8 * single_kernel_time.count()) << " and "; - std::cerr << (2.2 * single_kernel_time.count()) << " seconds.\n"; - std::cerr << "Is the normal kernel being serialized with the "; - std::cerr << "cooperative kernels on different streams?" << std::endl; - FailFlag = 1; - } - for (int k = 0; k < 2; ++k) { - HIPCHECK(hipFree(dev_array[k])); - HIPCHECK(hipStreamDestroy(streams[k])); - } - if (FailFlag == 1) { - break; - } - } - if (FailFlag == 1) { - failed(""); - } else { - passed(); - } -} diff --git a/tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp b/tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp deleted file mode 100644 index f2f9814dba..0000000000 --- a/tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp +++ /dev/null @@ -1,374 +0,0 @@ -/* -Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR -IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -// Test Description: -/*The general idea of the application is to launch N warps to all GPUs detected -in the HIP system. N is a command-line parameter, but the user should set N -small enough that all warps can be on each of the GPUs at the same time. - -All of the warps do a "work loop". Within the work loop, every warp -atomically increments a global variable that is shared between both fo the -target GPUs. The value returned from this atomic increment entriely depends -on the order the warps from the GPUs arrive at the atomic instruction. Each -warp then stores the result into a global array based on its warp ID. - -We also add a sleep/wait loop into the code so that the last warp runs much -slower than everyone else. As such, it should store much larger values than -all the other warps. - -If there are no barrier within the loop, then warp 0 will likely ge to the -global variable the first time while all the other warps have each -incremented it many times. If the barrier properly works, then each warp -will increment the variable once per time through the loop, and all threads -will sleep on the barrier waiting for the last warp to finally catch up. -*/ - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -rdc=true -gencode arch=compute_60,code=sm_60 - * TEST: %t - * HIT_END - */ - -#include -#include -#include "test_common.h" - -static int cooperative_groups_support(int device_id) { - hipError_t err; - int cooperative_attribute; - HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute, - hipDeviceAttributeCooperativeLaunch, device_id)); - if (!cooperative_attribute) { - std::cerr << "Cooperative launch support not available in "; - std::cerr << "the device attribute for device " << device_id; - std::cerr << std::endl; - return 0; - } - - int multi_gpu_cooperative_attribute; - HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute, - hipDeviceAttributeCooperativeMultiDeviceLaunch, device_id)); - if (!multi_gpu_cooperative_attribute) { - std::cerr << "Multi-GPU cooperative launch support not available in "; - std::cerr << "the device attribute for device " << device_id; - std::cerr << std::endl; - return 0; - } - - hipDeviceProp_t device_properties; - HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); - if (device_properties.cooperativeLaunch == 0) { - std::cerr << "Cooperative group support not available in "; - std::cerr << "device properties." << std::endl; - return 0; - } - if (device_properties.cooperativeMultiDeviceLaunch == 0) { - std::cerr << "Multi-GPU cooperative group support not available in "; - std::cerr << "device properties." << std::endl; - return 0; - } - return 1; -} - -static int verify_barrier_buffer(unsigned int loops, unsigned int warps, - unsigned int *host_buffer, - unsigned int num_devs) { - unsigned int max_in_this_loop = 0; - for (unsigned int i = 0; i < loops; i++) { - max_in_this_loop += (warps * num_devs); - for (unsigned int j = 0; j < warps; j++) { - if (host_buffer[i*warps+j] > max_in_this_loop) { - std::cerr << "Barrier failure!" << std::endl; - std::cerr << " Buffer entry " << i*warps+j; - std::cerr << " contains the value " << host_buffer[i*warps+j]; - std::cerr << " but it should not be more than "; - std::cerr << max_in_this_loop << std::endl; - return -1; - } - } - } - std::cout << "\tBarriers work properly!" << std::endl; - return 0; -} - -static int verify_multi_gpu_buffer(unsigned int loops, unsigned int array_val) { - unsigned int desired_val = 0; - for (int i = 0; i < loops; i++) { - if (i % 2 == 0) { - desired_val += 2; - } else { - desired_val *= 2; - } - } - std::cout << "Desired value is " << desired_val << std::endl; - if (array_val != desired_val) { - std::cerr << "ERROR! Multi-grid barrier does not appear to work."; - std::cerr << std::endl; - std::cerr << "Expected the multi-GPUs to work together to produce "; - std::cerr << "the value " << desired_val << std::endl; - std::cerr << "However, the entry returned from the multi-GPU "; - std::cerr << "kernel was " << array_val << std::endl; - return -1; - } - std::cout << "\tMulti-GPU barriers appear to work here." << std::endl; - return 0; -} - -__global__ void -test_kernel(unsigned int *atomic_val, unsigned int *global_array, - unsigned int *array, uint32_t loops) { - cooperative_groups::grid_group grid = cooperative_groups::this_grid(); - cooperative_groups::multi_grid_group mgrid = - cooperative_groups::this_multi_grid(); - unsigned rank = grid.thread_rank(); - unsigned global_rank = mgrid.thread_rank(); - - int offset = blockIdx.x; - for (int i = 0; i < loops; i++) { - // Make the last thread run way behind everyone else. - // If the grid barrier below fails, then the other threads may hit the - // atomicInc instruction many times before the last thread ever gets - // to it. - // As such, without the barrier, the last array entry will eventually - // contain a very large value, defined by however many times the other - // wavefronts make it through this loop. - // If the barrier works, then it will likely contain some number - // near "total number of blocks". It will be the last wavefront to - // reach the atomicInc, but everyone will have only hit the atomic once. - if (rank == (grid.size() - 1)) { - long long start_clock = clock64(); - while (clock64() < (start_clock+1000000)) {} - } - if (threadIdx.x == 0) { - array[offset] = atomicInc(atomic_val, UINT_MAX); - } - grid.sync(); - - // Make the last thread in the entire multi-grid run way behind - // everyone else. - // If the mgrid barrier below fails, then the two global_array entries - // will end up being out of sync, because the intermingling of adds - // and multiplies will not be aligned between to the two GPUs. - if (global_rank == (mgrid.size() - 1)) { - long long start_clock = clock64(); - while (clock64() < (start_clock+100000000)) {} - } - // During even iterations, add into your own array entry - // During odd iterations, add into your partner's array entry - unsigned grid_rank = mgrid.grid_rank(); - unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids(); - if (rank == (grid.size() - 1)) { - if (i % mgrid.num_grids() == 0) { - global_array[grid_rank] += 2; - } else { - global_array[inter_gpu_offset] *= 2; - } - } - mgrid.sync(); - offset += gridDim.x; - } -} - -int main(int argc, char** argv) { - hipError_t err; - int num_devices = 0; - uint32_t loops = 2; - uint32_t warps = 10; - uint32_t block_size = 1; - - std::cout << "Loops: " << loops << std::endl; - std::cout << "Warps: " << warps << std::endl; - std::cout << "Block size: " << block_size << std::endl; - - HIPCHECK(hipGetDeviceCount(&num_devices)); - if (num_devices < 2) { - std::cout << "Not enough GPUs to run test." << std::endl; - std::cout << "We require at least 2 GPUs, but only found "; - std::cout << num_devices << std::endl; - std::cout << "Skipping the test with PASSED result\n"; - passed(); - } - - uint32_t device_num[num_devices]; - - /*************************************************************************/ - /* Test whether target device supports cooperative groups ****************/ - for (int i = 0; i < num_devices; i++) { - device_num[i] = i; - if (!cooperative_groups_support(device_num[i])) { - std::cout << "Skipping the test with Pass result.\n"; - passed(); - } - } - - /*************************************************************************/ - /* Test whether the requested size will fit on the GPU *******************/ - int warp_sizes[num_devices]; - int num_sms[num_devices]; - hipDeviceProp_t device_properties[num_devices]; - int warp_size = INT_MAX; - int num_sm = INT_MAX; - for (int i = 0; i < num_devices; i++) { - HIPCHECK(hipGetDeviceProperties(&device_properties[i], device_num[i])); - warp_sizes[i] = device_properties[i].warpSize; - if (warp_sizes[i] < warp_size) { - warp_size = warp_sizes[i]; - } - num_sms[i] = device_properties[i].multiProcessorCount; - if (num_sms[i] < num_sm) { - num_sm = num_sms[i]; - } - std::cout << "Device " << (i + 1); - std::cout << " name: " << device_properties[i].name << std::endl; - } - std::cout << std::endl; - - int num_threads_in_block = block_size * warp_size; - - // Calculate the device occupancy to know how many blocks can be run. - int max_blocks_per_sm_arr[num_devices]; - int max_blocks_per_sm = INT_MAX; - for (int i = 0; i < num_devices; i++) { - HIPCHECK(hipSetDevice(device_num[i])); - HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block, 0)); - if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) { - max_blocks_per_sm = max_blocks_per_sm_arr[i]; - } - } - - int requested_blocks = warps / block_size; - if (requested_blocks > max_blocks_per_sm * num_sm) { - std::cerr << "Requesting to run " << requested_blocks << " blocks, "; - std::cerr << "but we can only guarantee to simultaneously run "; - std::cerr << (max_blocks_per_sm * num_sm) << std::endl; - failed(""); - } - - /*************************************************************************/ - /* Set up data to pass into the kernel ***********************************/ - // Each block will output a single value per loop. - uint32_t total_buffer_len = requested_blocks*loops; - - // Alocate the buffer that will hold the kernel's output, and which will - // also be used to globally synchronize during GWS initialization - unsigned int *host_buffer[num_devices]; - unsigned int *kernel_buffer[num_devices]; - unsigned int *kernel_atomic[num_devices]; - hipStream_t streams[num_devices]; - for (int i = 0; i < num_devices; i++) { - host_buffer[i] = (unsigned int*)calloc(total_buffer_len, - sizeof(unsigned int)); - HIPCHECK(hipSetDevice(device_num[i])); - HIPCHECK(hipMalloc(reinterpret_cast(&kernel_buffer[i]), - total_buffer_len * sizeof(unsigned int))); - HIPCHECK(hipMemcpy(kernel_buffer[i], host_buffer[i], - total_buffer_len * sizeof(unsigned int), - hipMemcpyHostToDevice)); - HIPCHECK(hipMalloc(reinterpret_cast(&kernel_atomic[i]), - sizeof(unsigned int))); - HIPCHECK(hipMemset(kernel_atomic[i], 0, sizeof(unsigned int))); - HIPCHECK(hipStreamCreate(&streams[i])); - } - - // Single kernel atomic shared between both devices; put it on the host - unsigned int* global_array; - HIPCHECK(hipHostMalloc(reinterpret_cast(&global_array), - num_devices * sizeof(unsigned int), 0)); - HIPCHECK(hipMemset(global_array, 0, num_devices * sizeof(unsigned int))); - - /*************************************************************************/ - /* Launch the kernels ****************************************************/ - std::cout << "Launching a kernel with " << warps << " warps "; - std::cout << "in " << requested_blocks << " thread blocks."; - std::cout << std::endl; - - void *dev_params[num_devices][4]; - hipLaunchParams md_params[num_devices]; - for (int i = 0; i < num_devices; i++) { - dev_params[i][0] = reinterpret_cast(&kernel_atomic[i]); - dev_params[i][1] = reinterpret_cast(&global_array); - dev_params[i][2] = reinterpret_cast(&kernel_buffer[i]); - dev_params[i][3] = reinterpret_cast(&loops); - md_params[i].func = reinterpret_cast(test_kernel); - md_params[i].gridDim = requested_blocks; - md_params[i].blockDim = num_threads_in_block; - md_params[i].sharedMem = 0; - md_params[i].stream = streams[i]; - md_params[i].args = dev_params[i]; - } - - HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, num_devices, 0)); - HIPCHECK(hipDeviceSynchronize()); - - /*************************************************************************/ - /* Read back the buffers and print out its data **************************/ - for (int dev = 0; dev < num_devices; dev++) { - HIPCHECK(hipMemcpy(host_buffer[dev], kernel_buffer[dev], - total_buffer_len * sizeof(unsigned int), - hipMemcpyDeviceToHost)); - } - - for (unsigned int i = 0; i < loops; i++) { - for (int dev = 0; dev < num_devices; dev++) { - std::cout << "+++++++++++++++++ Device " << dev; - std::cout << "+++++++++++++++++" << std::endl; - for (unsigned int j = 0; j < requested_blocks; j++) { - std::cout << "Buffer entry " << (i*warps+j); - std::cout << " (written by warp " << j << ")"; - std::cout << " is " << host_buffer[dev][i*requested_blocks+j]; - std::cout << std::endl; - } - } - std::cout << "==========================\n"; - } - for (unsigned int dev = 0; dev < num_devices; dev++) { - std::cout << "Testing output from device " << dev << std::endl; - int local_ret_val = verify_barrier_buffer(loops, requested_blocks, - host_buffer[dev], num_devices); - if (local_ret_val) { - failed(""); - } - } - - std::cout << std::endl << "The multi-GPU shared updates contain:\n"; - for (int i = 0; i < num_devices; i++) { - std::cout << "Entry " << i << ": "; - std::cout << global_array[i] << std::endl; - } - int flag = 0; - for (int dev = 0; dev < num_devices; dev++) { - std::cout << "Testing multi-GPU output for entry " << dev << std::endl; - int local_ret_val = verify_multi_gpu_buffer(loops, global_array[dev]); - if (local_ret_val) { - flag = 1; - } - } - for (int k = 0; k < num_devices; ++k) { - HIPCHECK(hipFree(kernel_buffer[k])); - HIPCHECK(hipFree(kernel_atomic[k])); - HIPCHECK(hipStreamDestroy(streams[k])); - free(host_buffer[k]); - } - if (flag == 1) { - failed(""); - } else { - passed(); - } -} diff --git a/tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp b/tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp deleted file mode 100644 index 77aa63d3c6..0000000000 --- a/tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp +++ /dev/null @@ -1,233 +0,0 @@ -/* -Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR -IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -// Test Description: -/*The general idea of the application is to launch N warps. N is a command-line -parameter, but the user should set N small enough that all warps can be on -the GPU at the same time. - -All of the warps do a "work loop". Within the work loop, every warp -atomically increments a global variable. The value returned from this atomic -increment entriely depends on the order the threads arrive at the atomic -instruction. Each warp then stores the result into a global array based on its -warp ID. - -We also add a sleep/wait loop into the code so that the last warp runs much -slower than everyone else. As such, it should store much larger values than -all the other warps. - -If there are no barrier within the loop, then the last warp will likely get to -the global variable the first time after all the other warps have each -incremented it many times. If the barrier properly works, then each warp -will increment the variable once per time through the loop, and all threads -will sleep on the barrier waiting for the last warp to finally catch up. -*/ - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include "test_common.h" - -static int cooperative_groups_support(int device_id) { - hipError_t err; - int cooperative_attribute; - HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute, - hipDeviceAttributeCooperativeLaunch, device_id)); - if (!cooperative_attribute) { - std::cerr << "Cooperative launch support not available in "; - std::cerr << "the device attribute for device " << device_id; - std::cerr << std::endl; - return 0; - } - - hipDeviceProp_t device_properties; - HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); - if (device_properties.cooperativeLaunch == 0) { - std::cerr << "Cooperative group support not available in "; - std::cerr << "device properties." << std::endl; - return 0; - } - return 1; -} - -static int verify_barrier_buffer(unsigned int loops, unsigned int warps, - unsigned int *host_buffer) { - unsigned int max_in_this_loop = 0; - for (unsigned int i = 0; i < loops; i++) { - max_in_this_loop += warps; - for (unsigned int j = 0; j < warps; j++) { - if (host_buffer[i*warps+j] > max_in_this_loop) { - std::cerr << "Barrier failure!" << std::endl; - std::cerr << " Buffer entry " << i*warps+j; - std::cerr << " contains the value " << host_buffer[i*warps+j]; - std::cerr << " but it should not be more than "; - std::cerr << max_in_this_loop << std::endl; - return -1; - } - } - } - std::cout << "Barriers work properly!" << std::endl; - return 0; -} - -__global__ void -test_kernel(unsigned int *atomic_val, unsigned int *array, - unsigned int loops) { - cooperative_groups::grid_group grid = cooperative_groups::this_grid(); - unsigned rank = grid.thread_rank(); - - int offset = blockIdx.x; - for (int i = 0; i < loops; i++) { - // Make the last thread run way behind everyone else. - // If the barrier below fails, then the other threads may hit the - // atomicInc instruction many times before the last thread ever gets - // to it. - // As such, without the barrier, the last array entry will eventually - // contain a very large value, defined by however many times the other - // wavefronts make it through this loop. - // If the barrier works, then it will likely contain some number - // near "total number of blocks". It will be the last wavefront to - // reach the atomicInc, but everyone will have only hit the atomic once. - if (rank == (grid.size() - 1)) { - long long start_clock = clock64(); - while (clock64() < (start_clock+1000000)) {} - } - - if (threadIdx.x == 0) { - array[offset] = atomicInc(&atomic_val[0], UINT_MAX); - } - grid.sync(); - offset += gridDim.x; - } -} - -int main(int argc, char** argv) { - hipError_t err; - int device_num; - uint32_t loops = 2; - uint32_t warps = 10; - uint32_t block_size = 1; - HIPCHECK(hipGetDeviceCount(&device_num)); - for (int dev = 0; dev < device_num; ++dev) { - std::cout << "Device number: " << dev << std::endl; - std::cout << "Loops: " << loops << std::endl; - std::cout << "Warps: " << warps << std::endl; - std::cout << "Block size: " << block_size << std::endl; - - /*************************************************************************/ - /* Test whether target device supports cooperative groups ****************/ - HIPCHECK(hipSetDevice(dev)); - if (!cooperative_groups_support(dev)) { - std::cout << "Skipping the test with Pass result.\n"; - passed(); - } - - /*************************************************************************/ - /* Test whether the requested size will fit on the GPU *******************/ - int warp_size; - int num_sms; - int max_blocks_per_sm; - hipDeviceProp_t device_properties; - HIPCHECK(hipGetDeviceProperties(&device_properties, dev)); - warp_size = device_properties.warpSize; - num_sms = device_properties.multiProcessorCount; - - std::cout << "Device name: " << device_properties.name << std::endl; - std::cout << std::endl; - - int num_threads_in_block = block_size * warp_size; - - // Calculate the device occupancy to know how many blocks can be run. - HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, - test_kernel, num_threads_in_block, 0)); - - int requested_blocks = warps / block_size; - if (requested_blocks > max_blocks_per_sm * num_sms) { - std::cerr << "Requesting to run " << requested_blocks << " blocks, "; - std::cerr << "but we can only guarantee to simultaneously run "; - std::cerr << (max_blocks_per_sm * num_sms) << std::endl; - failed(""); - } - - /*************************************************************************/ - /* Set up data to pass into the kernel ***********************************/ - // Each block will output a single value per loop. - uint32_t total_buffer_len = requested_blocks*loops; - - // Alocate the buffer that will hold the kernel's output, and which will - // also be used to globally synchronize during GWS initialization - unsigned int *host_buffer = (unsigned int*)calloc(total_buffer_len, - sizeof(unsigned int)); - - unsigned int *kernel_buffer; - HIPCHECK(hipMalloc(reinterpret_cast(&kernel_buffer), - total_buffer_len * sizeof(unsigned int))); - HIPCHECK(hipMemcpy(kernel_buffer, host_buffer, - total_buffer_len * sizeof(unsigned int), - hipMemcpyHostToDevice)); - - unsigned int *kernel_atomic; - HIPCHECK(hipMalloc(reinterpret_cast(&kernel_atomic), - sizeof(unsigned int))); - HIPCHECK(hipMemset(kernel_atomic, 0, sizeof(unsigned int))); - - /*************************************************************************/ - /* Launch the kernel *****************************************************/ - std::cout << "Launching a kernel with " << warps << " warps "; - std::cout << "in " << requested_blocks << " thread blocks."; - std::cout << std::endl; - - void *params[3]; - params[0] = reinterpret_cast(&kernel_atomic); - params[1] = reinterpret_cast(&kernel_buffer); - params[2] = reinterpret_cast(&loops); - HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), - requested_blocks, - num_threads_in_block, params, 0, NULL)); - - /*************************************************************************/ - /* Read back the buffer and print out its data****************************/ - HIPCHECK(hipMemcpy(host_buffer, kernel_buffer, - total_buffer_len * sizeof(unsigned int), - hipMemcpyDeviceToHost)); - - for (unsigned int i = 0; i < loops; i++) { - for (unsigned int j = 0; j < requested_blocks; j++) { - std::cout << "Buffer entry " << (i*warps+j); - std::cout << " (written by warp " << j << ")"; - std::cout << " is " << host_buffer[i * requested_blocks + j]; - std::cout << std::endl; - } - std::cout << "==========================\n"; - } - int ret_val = verify_barrier_buffer(loops, requested_blocks, host_buffer); - HIPCHECK(hipFree(kernel_buffer)); - HIPCHECK(hipFree(kernel_atomic)); - if (ret_val == -1) { - failed(""); - } else { - passed(); - } - } -} diff --git a/tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp b/tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp deleted file mode 100644 index ae793cf6a1..0000000000 --- a/tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp +++ /dev/null @@ -1,374 +0,0 @@ -/* -Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR -IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -// Test Description: -/*The general idea of the application is to launch N warps to each of two GPUs. -N is a command-line parameter, but the user should set N small enough that all -warps can be on each of the GPUs at the same time. - -All of the warps do a "work loop". Within the work loop, every warp -atomically increments a global variable that is shared between both fo the -target GPUs. The value returned from this atomic increment entriely depends -on the order the warps from the GPUs arrive at the atomic instruction. Each -warp then stores the result into a global array based on its warp ID. - -We also add a sleep/wait loop into the code so that the last warp runs much -slower than everyone else. As such, it should store much larger values than -all the other warps. - -If there are no barrier within the loop, then warp 0 will likely ge to the -global variable the first time while all the other warps have each -incremented it many times. If the barrier properly works, then each warp -will increment the variable once per time through the loop, and all threads -will sleep on the barrier waiting for the last warp to finally catch up. -*/ - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -rdc=true -gencode arch=compute_60,code=sm_60 - * TEST: %t - * HIT_END - */ - -#include -#include -#include "test_common.h" - -static int cooperative_groups_support(int device_id) { - hipError_t err; - int cooperative_attribute; - HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute, - hipDeviceAttributeCooperativeLaunch, device_id)); - if (!cooperative_attribute) { - std::cerr << "Cooperative launch support not available in "; - std::cerr << "the device attribute for device " << device_id; - std::cerr << std::endl; - return 0; - } - - int multi_gpu_cooperative_attribute; - HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute, - hipDeviceAttributeCooperativeMultiDeviceLaunch, device_id)); - if (!multi_gpu_cooperative_attribute) { - std::cerr << "Multi-GPU cooperative launch support not available in "; - std::cerr << "the device attribute for device " << device_id; - std::cerr << std::endl; - return 0; - } - - hipDeviceProp_t device_properties; - HIPCHECK(hipGetDeviceProperties(&device_properties, device_id)); - if (device_properties.cooperativeLaunch == 0) { - std::cerr << "Cooperative group support not available in "; - std::cerr << "device properties." << std::endl; - return 0; - } - if (device_properties.cooperativeMultiDeviceLaunch == 0) { - std::cerr << "Multi-GPU cooperative group support not available in "; - std::cerr << "device properties." << std::endl; - return 0; - } - return 1; -} - -static int verify_barrier_buffer(unsigned int loops, unsigned int warps, - unsigned int *host_buffer, - unsigned int num_devs) { - unsigned int max_in_this_loop = 0; - for (unsigned int i = 0; i < loops; i++) { - max_in_this_loop += (warps * num_devs); - for (unsigned int j = 0; j < warps; j++) { - if (host_buffer[i*warps+j] > max_in_this_loop) { - std::cerr << "Barrier failure!" << std::endl; - std::cerr << " Buffer entry " << i*warps+j; - std::cerr << " contains the value " << host_buffer[i*warps+j]; - std::cerr << " but it should not be more than "; - std::cerr << max_in_this_loop << std::endl; - return -1; - } - } - } - std::cout << "\tBarriers work properly!" << std::endl; - return 0; -} - -static int verify_multi_gpu_buffer(unsigned int loops, unsigned int array_val) { - unsigned int desired_val = 0; - for (int i = 0; i < loops; i++) { - if (i % 2 == 0) { - desired_val += 2; - } else { - desired_val *= 2; - } - } - std::cout << "Desired value is " << desired_val << std::endl; - if (array_val != desired_val) { - std::cerr << "ERROR! Multi-grid barrier does not appear to work."; - std::cerr << std::endl; - std::cerr << "Expected the multi-GPUs to work together to produce "; - std::cerr << "the value " << desired_val << std::endl; - std::cerr << "However, the entry returned from the multi-GPU "; - std::cerr << "kernel was " << array_val << std::endl; - return -1; - } - std::cout << "\tMulti-GPU barriers appear to work here." << std::endl; - return 0; -} - -__global__ void -test_kernel(unsigned int *atomic_val, unsigned int *global_array, - unsigned int *array, uint32_t loops) { - cooperative_groups::grid_group grid = cooperative_groups::this_grid(); - cooperative_groups::multi_grid_group mgrid = - cooperative_groups::this_multi_grid(); - unsigned rank = grid.thread_rank(); - unsigned global_rank = mgrid.thread_rank(); - - int offset = blockIdx.x; - for (int i = 0; i < loops; i++) { - // Make the last thread run way behind everyone else. - // If the grid barrier below fails, then the other threads may hit the - // atomicInc instruction many times before the last thread ever gets - // to it. - // As such, without the barrier, the last array entry will eventually - // contain a very large value, defined by however many times the other - // wavefronts make it through this loop. - // If the barrier works, then it will likely contain some number - // near "total number of blocks". It will be the last wavefront to - // reach the atomicInc, but everyone will have only hit the atomic once. - if (rank == (grid.size() - 1)) { - long long start_clock = clock64(); - while (clock64() < (start_clock + 1000000)) {} - } - if (threadIdx.x == 0) { - array[offset] = atomicInc(atomic_val, UINT_MAX); - } - grid.sync(); - - // Make the last thread in the entire multi-grid run way behind - // everyone else. - // If the mgrid barrier below fails, then the two global_array entries - // will end up being out of sync, because the intermingling of adds - // and multiplies will not be aligned between to the two GPUs. - if (global_rank == (mgrid.size() - 1)) { - long long start_clock = clock64(); - while (clock64() < (start_clock + 100000000)) {} - } - // During even iterations, add into your own array entry - // During odd iterations, add into your partner's array entry - unsigned grid_rank = mgrid.grid_rank(); - unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids(); - if (rank == (grid.size() - 1)) { - if (i % mgrid.num_grids() == 0) { - global_array[grid_rank] += 2; - } else { - global_array[inter_gpu_offset] *= 2; - } - } - mgrid.sync(); - offset += gridDim.x; - } -} - -int main(int argc, char** argv) { - hipError_t err; - int device_num = 0, flag = 0; - uint32_t loops = 2; - uint32_t warps = 10; - uint32_t block_size = 1; - HIPCHECK(hipGetDeviceCount(&device_num)); - if (device_num < 2) { - std::cout << "This test needs atleast two gpus but found only"; - std::cout << device_num << std::endl; - std::cout << "Hence skipping the test with pass result\n"; - passed(); - } - - for (int d = 0; d < (device_num - 1); ++d) { - std::cout << "First device number: " << d << std::endl; - std::cout << "Second device number: " << (d + 1) << std::endl; - std::cout << "Loops: " << loops << std::endl; - std::cout << "Warps: " << warps << std::endl; - std::cout << "Block size: " << block_size << std::endl; - - /*************************************************************************/ - /* Test whether target device supports cooperative groups ****************/ - for (int i = 0; i < 2; i++) { - if (!cooperative_groups_support((d + i))) { - std::cout << "Skipping the test with Pass result.\n"; - passed(); - } - } - - /*************************************************************************/ - /* Test whether the requested size will fit on the GPU *******************/ - int warp_sizes[2]; - int num_sms[2]; - hipDeviceProp_t device_properties[2]; - int warp_size = INT_MAX; - int num_sm = INT_MAX; - for (int i = 0; i < 2; i++) { - HIPCHECK(hipGetDeviceProperties(&device_properties[i], (d + i))); - warp_sizes[i] = device_properties[i].warpSize; - if (warp_sizes[i] < warp_size) { - warp_size = warp_sizes[i]; - } - num_sms[i] = device_properties[i].multiProcessorCount; - if (num_sms[i] < num_sm) { - num_sm = num_sms[i]; - } - std::cout << "Device " << (d + i); - std::cout << " name: " << device_properties[i].name << std::endl; - } - std::cout << std::endl; - - int num_threads_in_block = block_size * warp_size; - - // Calculate the device occupancy to know how many blocks can be run. - int max_blocks_per_sm_arr[2]; - int max_blocks_per_sm = INT_MAX; - for (int i = 0; i < 2; i++) { - HIPCHECK(hipSetDevice((d + i))); - HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block, - 0)); - if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) { - max_blocks_per_sm = max_blocks_per_sm_arr[i]; - } - } - - int requested_blocks = warps / block_size; - if (requested_blocks > max_blocks_per_sm * num_sm) { - std::cerr << "Requesting to run " << requested_blocks << " blocks, "; - std::cerr << "but we can only guarantee to simultaneously run "; - std::cerr << (max_blocks_per_sm * num_sm) << std::endl; - failed(""); - } - - /*************************************************************************/ - /* Set up data to pass into the kernel ***********************************/ - // Each block will output a single value per loop. - uint32_t total_buffer_len = requested_blocks*loops; - - // Alocate the buffer that will hold the kernel's output, and which will - // also be used to globally synchronize during GWS initialization - unsigned int *host_buffer[2]; - unsigned int *kernel_buffer[2]; - unsigned int *kernel_atomic[2]; - hipStream_t streams[2]; - for (int i = 0; i < 2; i++) { - host_buffer[i] = (unsigned int*)calloc(total_buffer_len, - sizeof(unsigned int)); - HIPCHECK(hipSetDevice((d + i))); - HIPCHECK(hipMalloc(reinterpret_cast(&kernel_buffer[i]), - total_buffer_len * sizeof(unsigned int))); - HIPCHECK(hipMemcpy(kernel_buffer[i], host_buffer[i], - total_buffer_len * sizeof(unsigned int), hipMemcpyHostToDevice)); - HIPCHECK(hipMalloc(reinterpret_cast(&kernel_atomic[i]), - sizeof(unsigned int))); - HIPCHECK(hipMemset(kernel_atomic[i], 0, sizeof(unsigned int))); - HIPCHECK(hipStreamCreate(&streams[i])); - } - - // Single kernel atomic shared between both devices; put it on the host - unsigned int* global_array; - HIPCHECK(hipHostMalloc(reinterpret_cast(&global_array), - 2 * sizeof(unsigned int), 0)); - HIPCHECK(hipMemset(global_array, 0, 2 * sizeof(unsigned int))); - - /*************************************************************************/ - /* Launch the kernels ****************************************************/ - std::cout << "Launching a kernel with " << warps << " warps "; - std::cout << "in " << requested_blocks << " thread blocks."; - std::cout << std::endl; - - void *dev_params[2][4]; - hipLaunchParams md_params[2]; - for (int i = 0; i < 2; i++) { - dev_params[i][0] = reinterpret_cast(&kernel_atomic[i]); - dev_params[i][1] = reinterpret_cast(&global_array); - dev_params[i][2] = reinterpret_cast(&kernel_buffer[i]); - dev_params[i][3] = reinterpret_cast(&loops); - md_params[i].func = reinterpret_cast(test_kernel); - md_params[i].gridDim = requested_blocks; - md_params[i].blockDim = num_threads_in_block; - md_params[i].sharedMem = 0; - md_params[i].stream = streams[i]; - md_params[i].args = dev_params[i]; - } - - HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0)); - HIPCHECK(hipDeviceSynchronize()); - - /*************************************************************************/ - /* Read back the buffers and print out its data **************************/ - for (int dev = 0; dev < 2; dev++) { - HIPCHECK(hipMemcpy(host_buffer[d + dev], kernel_buffer[d + dev], - total_buffer_len * sizeof(unsigned int), - hipMemcpyDeviceToHost)); - } - - for (unsigned int i = 0; i < loops; i++) { - for (int dev = 0; dev < 2; dev++) { - std::cout << "+++++++++++++++++ Device " << (d + dev); - std::cout << "+++++++++++++++++" << std::endl; - for (unsigned int j = 0; j < requested_blocks; j++) { - std::cout << "Buffer entry " << (i * warps + j); - std::cout << " (written by warp " << j << ")"; - std::cout << " is " << host_buffer[dev][i * requested_blocks + j]; - std::cout << std::endl; - } - } - std::cout << "==========================\n"; - } - for (unsigned int dev = 0; dev < 2; dev++) { - std::cout << "Testing output from device " << (d + dev) << std::endl; - int local_ret_val = verify_barrier_buffer(loops, requested_blocks, - host_buffer[dev], 2); - if (local_ret_val == -1) { - flag = 1; - } - } - - std::cout << std::endl << "The multi-GPU shared updates contain:"; - std::cout << std::endl; - for (int i = 0; i < 2; i++) { - std::cout << "Entry " << i << ": "; - std::cout << global_array[i] << std::endl; - } - for (int dev = 0; dev < 2; dev++) { - std::cout << "Testing multi-GPU output for entry " << (d + dev); - std::cout << std::endl; - int local_ret_val = verify_multi_gpu_buffer(loops, global_array[dev]); - if (local_ret_val) { - flag = 1; - } - } - for (int k = 0; k < 2; ++k) { - HIPCHECK(hipFree(kernel_buffer[k])); - HIPCHECK(hipFree(kernel_atomic[k])); - HIPCHECK(hipStreamDestroy(streams[k])); - free(host_buffer[k]); - } - } - if (flag == 1) { - failed(""); - } else { - passed(); - } -} diff --git a/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp b/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp index f7a9dac703..874f8bc44c 100644 --- a/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp +++ b/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp @@ -1,173 +1,173 @@ -/* - * Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved. - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -/* - * Test to compare - * 1.pciBusID from hipDeviceGetPCIBusId and hipDeviceGetAttribute ** - * 2.{pciDomainID, pciBusID, pciDeviceID} values hipDeviceGetPCIBusId vs lspci ** - */ - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc - * TEST_NAMED: %t hipDeviceGetPCIBusId-vs-hipDeviceGetAttribute --tests 0x1 - * TEST_NAMED: %t hipDeviceGetPCIBusId-vs-lspci --tests 0x2 EXCLUDE_HIP_PLATFORM nvcc - * HIT_END - */ - -#include "test_common.h" -#define MAX_DEVICE_LENGTH 20 - -static bool getPciBusId(int deviceCount, char hipDeviceList[][MAX_DEVICE_LENGTH]) { - for (int i = 0; i < deviceCount; i++) { - HIPCHECK(hipDeviceGetPCIBusId(hipDeviceList[i], MAX_DEVICE_LENGTH, i)); - } - return true; -} - -bool comparePciBusIDWithHipDeviceGetAttribute() { - bool testResult = true; - int deviceCount = 0; - HIPCHECK(hipGetDeviceCount(&deviceCount)); - HIPASSERT(deviceCount != 0); - printf("No.of gpus in the system: %d\n", deviceCount); - char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH]; - char pciDeviceList[deviceCount][MAX_DEVICE_LENGTH]; - - getPciBusId(deviceCount, hipDeviceList); - - for (int i = 0; i < deviceCount; i++) { - int pciBusID = -1; - int pciDeviceID = -1; - int pciDomainID = -1; - int tempPciBusId = -1; - sscanf(hipDeviceList[i], "%04x:%02x:%02x", &pciDomainID, &pciBusID, - &pciDeviceID); - HIPCHECK(hipDeviceGetAttribute(&tempPciBusId, hipDeviceAttributePciBusId, i)); - if (pciBusID != tempPciBusId) { - testResult = false; - printf("pciBusID from hipDeviceGetPCIBusId mismatched to that from " - "hipDeviceGetAttribute for gpu %d\n", i); - } - } - - printf("pciBusID output of both hipDeviceGetPCIBusId and" - " hipDeviceGetAttribute matched for all gpus\n"); - return testResult; -} - -bool compareHipDeviceGetPCIBusIdWithLspci() { - FILE *fpipe; - bool testResult = false; - - { - // Check if lspci is installed, if not, don't proceed - char const *cmd = "lspci --version"; - char *lspciCheck; - char temp[20]; - fpipe = popen(cmd, "r"); - - if (fpipe == nullptr) { - printf("Unable to create command file\n"); - return testResult; - } - - lspciCheck = fgets(temp, 20, fpipe); - pclose(fpipe); - - if (!lspciCheck) { - printf("lspci not found. Skipping the test\n"); - return true; - } - } - - int deviceCount = 0; - HIPCHECK(hipGetDeviceCount(&deviceCount)); - HIPASSERT(deviceCount != 0); - printf("No.of gpus in the system: %d\n", deviceCount); - char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH]; - char pciDeviceList[deviceCount][MAX_DEVICE_LENGTH]; - - getPciBusId(deviceCount, hipDeviceList); - - // Get lspci device list and compare with hip device list -#if defined(__CUDA_ARCH__) - char const *command = "lspci -D | grep controller | grep NVIDIA | " - "cut -d ' ' -f 1"; -#else - char const *command = "lspci -D | grep controller | grep AMD/ATI | " - "cut -d ' ' -f 1"; -#endif - fpipe = popen(command, "r"); - - if (fpipe == nullptr) { - printf("Unable to create command file\n"); - return testResult; - } - - int index = 0; - int deviceMatchCount = 0; - - while (fgets(pciDeviceList[index], sizeof(pciDeviceList[index]), fpipe)) { - bool bMatchFound = false; - for (int deviceNo = 0; deviceNo < deviceCount; deviceNo++) { - if (!strncmp(pciDeviceList[index], hipDeviceList[deviceNo], 10)) { - deviceMatchCount++; - bMatchFound = true; - } - } - if (bMatchFound == false) { - printf("PCI device: %s is not reported by HIP\n", pciDeviceList[index]); - } - index++; - } - - pclose(fpipe); - - if (deviceMatchCount == deviceCount) { - printf("hip and lspci output for {pciDomainID, pciBusID, pciDeviceID} " - "matched for all gpus\n"); - testResult = true; - } else { - printf("Mismatch in number GPUs reported by HIP with lscpi\n"); - } - return testResult; -} - -int main(int argc, char* argv[]) { - bool testResult = true; - HipTest::parseStandardArguments(argc, argv, true); - - if (p_tests & 0x1) { - testResult &= comparePciBusIDWithHipDeviceGetAttribute(); - } - - if (p_tests & 0x2) { -#ifdef __unix__ - testResult &= compareHipDeviceGetPCIBusIdWithLspci(); -#else - printf("Detected non-linux OS. Skipping the test\n"); -#endif - } - - if (testResult) { - passed(); - } else { - failed("one or more tests failed\n"); - } -} +/* + * Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved. + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +/* + * Test to compare + * 1.pciBusID from hipDeviceGetPCIBusId and hipDeviceGetAttribute ** + * 2.{pciDomainID, pciBusID, pciDeviceID} values hipDeviceGetPCIBusId vs lspci ** + */ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * TEST_NAMED: %t hipDeviceGetPCIBusId-vs-hipDeviceGetAttribute --tests 0x1 + * TEST_NAMED: %t hipDeviceGetPCIBusId-vs-lspci --tests 0x2 EXCLUDE_HIP_PLATFORM nvcc + * HIT_END + */ + +#include "test_common.h" +#define MAX_DEVICE_LENGTH 20 + +static bool getPciBusId(int deviceCount, char hipDeviceList[][MAX_DEVICE_LENGTH]) { + for (int i = 0; i < deviceCount; i++) { + HIPCHECK(hipDeviceGetPCIBusId(hipDeviceList[i], MAX_DEVICE_LENGTH, i)); + } + return true; +} + +bool comparePciBusIDWithHipDeviceGetAttribute() { + bool testResult = true; + int deviceCount = 0; + HIPCHECK(hipGetDeviceCount(&deviceCount)); + HIPASSERT(deviceCount != 0); + printf("No.of gpus in the system: %d\n", deviceCount); + char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH]; + char pciDeviceList[deviceCount][MAX_DEVICE_LENGTH]; + + getPciBusId(deviceCount, hipDeviceList); + + for (int i = 0; i < deviceCount; i++) { + int pciBusID = -1; + int pciDeviceID = -1; + int pciDomainID = -1; + int tempPciBusId = -1; + sscanf(hipDeviceList[i], "%04x:%02x:%02x", &pciDomainID, &pciBusID, + &pciDeviceID); + HIPCHECK(hipDeviceGetAttribute(&tempPciBusId, hipDeviceAttributePciBusId, i)); + if (pciBusID != tempPciBusId) { + testResult = false; + printf("pciBusID from hipDeviceGetPCIBusId mismatched to that from " + "hipDeviceGetAttribute for gpu %d\n", i); + } + } + + printf("pciBusID output of both hipDeviceGetPCIBusId and" + " hipDeviceGetAttribute matched for all gpus\n"); + return testResult; +} + +bool compareHipDeviceGetPCIBusIdWithLspci() { + FILE *fpipe; + bool testResult = false; + + { + // Check if lspci is installed, if not, don't proceed + char const *cmd = "lspci --version"; + char *lspciCheck; + char temp[20]; + fpipe = popen(cmd, "r"); + + if (fpipe == nullptr) { + printf("Unable to create command file\n"); + return testResult; + } + + lspciCheck = fgets(temp, 20, fpipe); + pclose(fpipe); + + if (!lspciCheck) { + printf("lspci not found. Skipping the test\n"); + return true; + } + } + + int deviceCount = 0; + HIPCHECK(hipGetDeviceCount(&deviceCount)); + HIPASSERT(deviceCount != 0); + printf("No.of gpus in the system: %d\n", deviceCount); + char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH]; + char pciDeviceList[deviceCount][MAX_DEVICE_LENGTH]; + + getPciBusId(deviceCount, hipDeviceList); + + // Get lspci device list and compare with hip device list +#if defined(__CUDA_ARCH__) + char const *command = "lspci -D | grep controller | grep NVIDIA | " + "cut -d ' ' -f 1"; +#else + char const *command = "lspci -D | grep controller | grep AMD/ATI | " + "cut -d ' ' -f 1"; +#endif + fpipe = popen(command, "r"); + + if (fpipe == nullptr) { + printf("Unable to create command file\n"); + return testResult; + } + + int index = 0; + int deviceMatchCount = 0; + + while (fgets(pciDeviceList[index], sizeof(pciDeviceList[index]), fpipe)) { + bool bMatchFound = false; + for (int deviceNo = 0; deviceNo < deviceCount; deviceNo++) { + if (!strncmp(pciDeviceList[index], hipDeviceList[deviceNo], 10)) { + deviceMatchCount++; + bMatchFound = true; + } + } + if (bMatchFound == false) { + printf("PCI device: %s is not reported by HIP\n", pciDeviceList[index]); + } + index++; + } + + pclose(fpipe); + + if (deviceMatchCount == deviceCount) { + printf("hip and lspci output for {pciDomainID, pciBusID, pciDeviceID} " + "matched for all gpus\n"); + testResult = true; + } else { + printf("Mismatch in number GPUs reported by HIP with lscpi\n"); + } + return testResult; +} + +int main(int argc, char* argv[]) { + bool testResult = true; + HipTest::parseStandardArguments(argc, argv, true); + + if (p_tests & 0x1) { + testResult &= comparePciBusIDWithHipDeviceGetAttribute(); + } + + if (p_tests & 0x2) { +#ifdef __unix__ + testResult &= compareHipDeviceGetPCIBusIdWithLspci(); +#else + printf("Detected non-linux OS. Skipping the test\n"); +#endif + } + + if (testResult) { + passed(); + } else { + failed("one or more tests failed\n"); + } +} diff --git a/tests/src/runtimeApi/device/hipSetGetDevice.cpp b/tests/src/runtimeApi/device/hipSetGetDevice.cpp index 6c703de867..4224c974b3 100644 --- a/tests/src/runtimeApi/device/hipSetGetDevice.cpp +++ b/tests/src/runtimeApi/device/hipSetGetDevice.cpp @@ -25,7 +25,7 @@ */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 * TEST_NAMED: %t hipSetGetDevice-invalidDevice * TEST_NAMED: %t hipSetGetDevice-allValidDevice * TEST_NAMED: %t hipSetGetDevice-validDev1 --computeDevCnt 1 diff --git a/tests/src/runtimeApi/memory/hipIpcMemAccessTest.cpp b/tests/src/runtimeApi/memory/hipIpcMemAccessTest.cpp deleted file mode 100644 index 00c01ab1cc..0000000000 --- a/tests/src/runtimeApi/memory/hipIpcMemAccessTest.cpp +++ /dev/null @@ -1,227 +0,0 @@ -/* -Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include -#include -#include -#include -#include "test_common.h" - -#ifdef __linux__ -sem_t *sem_ob1 = NULL, *sem_ob2 = NULL; -typedef struct mem_handle { - int device; - hipIpcMemHandle_t memHandle; - bool IfTestPassed; -} hip_ipc_t; - -class IpcMemHandleTest { - public: - bool InitFlag = true; - hip_ipc_t *shrd_mem = NULL; - pid_t pid; - size_t N = 1024; - size_t Nbytes = N * sizeof(int); - int *A_d = NULL, out = 0; - int *A_h, *C_h; - int Num_devices = 0, Data_mismatch, CanAccessPeer = 0; - int *Ad1 = NULL, *Ad2 = NULL; - IpcMemHandleTest(); - bool Test(); - ~IpcMemHandleTest(); -}; - - -bool IpcMemHandleTest::Test() { - if (InitFlag == false) { - // Abort the test if the initialization fails - printf("Resource initialization failed. Hence test skipped!"); - return false; - } - pid = fork(); - if (pid != 0) { - // Parent process - HIPCHECK(hipGetDeviceCount(&Num_devices)); - for (int i = 0; i < Num_devices; ++i) { - if (shrd_mem->IfTestPassed == true) { - HIPCHECK(hipSetDevice(i)); - HIPCHECK(hipMalloc(&A_d, Nbytes)); - HIPCHECK(hipIpcGetMemHandle((hipIpcMemHandle_t *) &shrd_mem->memHandle, - A_d)); - HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); - shrd_mem->device = i; - if ((out=sem_post(sem_ob1)) == -1) { - // Need to use inline function to release resources. - shrd_mem->IfTestPassed = false; - failed("sem_post() call failed in parent process."); - } - if ((out=sem_wait(sem_ob2)) == -1) { - shrd_mem->IfTestPassed = false; - failed("sem_wait() call failed in parent process."); - } - HIPCHECK(hipFree(A_d)); - } - } - } else { - // Child process - HIPCHECK(hipGetDeviceCount(&Num_devices)); - for (int j = 0; j < Num_devices; ++j) { - if ((out=sem_wait(sem_ob1)) == -1) { - shrd_mem->IfTestPassed = false; - printf("sem_wait() call failed in child process."); - if ((out=sem_post(sem_ob2)) == -1) { - printf("sem_post() call on sem_ob2 failed"); - exit(1); - } - } - for (int i = 0; i < Num_devices; ++i) { - Data_mismatch = 0; - HIPCHECK(hipSetDevice(i)); - HIPCHECK(hipMalloc(&Ad2, Nbytes)); - HIPCHECK(hipIpcOpenMemHandle((void **) &Ad1, shrd_mem->memHandle, - hipIpcMemLazyEnablePeerAccess)); - HIPCHECK(hipDeviceCanAccessPeer(&CanAccessPeer, i, shrd_mem->device)); - if (CanAccessPeer == 1) { - HIPCHECK(hipMemcpy(Ad2, Ad1, Nbytes, hipMemcpyDeviceToDevice)); - HIPCHECK(hipMemcpy(C_h, Ad2, Nbytes, hipMemcpyDeviceToDevice)); - for (int i = 0; i < N; ++i) { - if (C_h[i] != 123) - Data_mismatch++; - } - if (Data_mismatch != 0) { - printf("Data mismatch found when data copied from Ipc memhandle"); - printf(" to Device: %d\n", i); - shrd_mem->IfTestPassed = false; - } - memset(reinterpret_cast(C_h), 0, Nbytes); - // Checking if the data obtained from Ipc shared memory is consistent - HIPCHECK(hipMemcpy(C_h, Ad1, Nbytes, hipMemcpyDeviceToHost)); - for (int i = 0; i < N; ++i) { - if (C_h[i] != 123) - Data_mismatch++; - } - if (Data_mismatch != 0) { - printf("Data mismatch found when data copied from Ipc memhandle"); - printf(" Host.\n"); - shrd_mem->IfTestPassed = false; - } - } - HIPCHECK(hipIpcCloseMemHandle(reinterpret_cast(Ad1))); - } - HIPCHECK(hipFree(Ad2)); - if ((out=sem_post(sem_ob2)) == -1) { - shrd_mem->IfTestPassed = false; - printf("sem_post() call on sem_ob2 failed"); - exit(1); - } - } - exit(0); - } - - if ((out = sem_unlink("/my-sem-object1")) == -1) { - printf("sem_unlink() call on /my-sem-object1 failed"); - } - if ((out = sem_unlink("/my-sem-object2")) == -1) { - printf("sem_unlink() call on /my-sem-object2 failed"); - } - int status; - waitpid(pid, &status, 0); - if (shrd_mem->IfTestPassed == false) { - return false; - } else { - return true; - } -} - -IpcMemHandleTest::IpcMemHandleTest() { - std::string cmd_line = "rm -rf /dev/shm/sem.my-sem-object*"; - int res = system(cmd_line.c_str()); - if (res == -1) { - InitFlag = false; - printf("System call to remove existing shared objects failed!"); - } - int out; - if ((sem_ob1 = sem_open ("/my-sem-object1", O_CREAT|O_EXCL, 0660, 0)) == - SEM_FAILED) { - InitFlag = false; - printf("Initialization of 1st semaphore object failed"); - } - if ((sem_ob2 = sem_open ("/my-sem-object2", O_CREAT|O_EXCL, 0660, 0)) == - SEM_FAILED) { - InitFlag = false; - printf("Initialization of 2nd semaphore object failed"); - } - - shrd_mem = reinterpret_cast(mmap(NULL, sizeof(hip_ipc_t), - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, - 0, 0)); - if (shrd_mem == NULL) { - InitFlag = false; - printf("mmap() call failed!"); - } - shrd_mem->IfTestPassed = true; - A_h = reinterpret_cast(malloc(Nbytes)); - C_h = reinterpret_cast(malloc(Nbytes)); - for (size_t i = 0; i < N; i++) { - A_h[i] = 123; - } -} - -IpcMemHandleTest::~IpcMemHandleTest() { - munmap(shrd_mem, sizeof(hip_ipc_t)); - HIPCHECK(hipFree((A_d))); - free(A_h); - free(C_h); - HIPCHECK(hipFree((Ad1))); - HIPCHECK(hipFree((Ad2))); -} -#endif - -int main() { - bool IfTestPassed = true; - // The following program spawns a child process and does the following - // Parent iterate through each device, create memory -- create hipIpcMemhandle - // stores the mem handle in mmaped memory, release the child using sem_post() - // and wait for child to release itself(parent process) - // child process: - // Child process get the ipc mem handle using hipIpcOpenMemHandle - // Iterate through all the available gpus and do Device to Device copies - // and check for data consistencies and close the hipIpcCloseMemHandle - // release the parent and wait for parent to release itself(child) -#ifdef __linux__ - IpcMemHandleTest obj; - IfTestPassed = obj.Test(); -#else - printf("This is not a Linux platform. Hence Skipping the test!\n"); - IfTestPassed = true; -#endif - if (IfTestPassed == false) { - failed(""); - } - passed(); -} diff --git a/tests/src/runtimeApi/memory/hipMallocConcurrency.cpp b/tests/src/runtimeApi/memory/hipMallocConcurrency.cpp deleted file mode 100644 index 934c364b6b..0000000000 --- a/tests/src/runtimeApi/memory/hipMallocConcurrency.cpp +++ /dev/null @@ -1,487 +0,0 @@ -/* -Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/** -Testcase Scenarios : - - (TestCase 1):: - 1) Test hipMalloc() api passing zero size and confirming *ptr returning - nullptr. Also pass nullptr to hipFree() api. - 2) Pass maximum value of size_t for hipMalloc() api and make sure appropriate - error is returned. - 3) Check for hipMalloc() error code, passing invalid/null pointer. - - (TestCase 2):: - 4) Regress hipMalloc()/hipFree() in loop for bigger chunk of allocation - with adequate number of iterations and later test for kernel execution on - default gpu. - 5) Regress hipMalloc()/hipFree() in loop while allocating smaller chunks - keeping maximum number of iterations and then run kernel code on default - gpu, perfom data validation. - - (TestCase 3):: - 6) Check hipMalloc() api adaptability when app creates small chunks of memory - continuously, stores it for later use and then frees it at later point - of time. - - (TestCase 4):: - 7) Run hipMalloc() api/kernel code on same gpu parallely from parent and child - processes, validate the results. - - (TestCase 5):: - 8) Execute hipMalloc() api simultaneously on all the gpus by spawning multiple - child processes. Validate buffers allocated after running kernel code. - - (TestCase 6):: - 9) Multithread Scenario : Exercise hipMalloc() api parellely on all gpus from - multiple threads and regress the api. - - (TestCases 2, 3, 4, 5, 6):: - 10) Validate memory usage with hipMemGetInfo() while regressing hipMalloc() - api. Check for any possible memory leaks. -*/ - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 - * TEST_NAMED: %t hipMalloc_ArgValidation --tests 1 - * TEST_NAMED: %t hipMalloc_LoopRegression_AllocFreeCycle --tests 2 - * TEST_NAMED: %t hipMalloc_LoopRegression_AllocPool --tests 3 - * TEST_NAMED: %t hipMallocChild_Concurrency_DefaultGpu --tests 4 - * TEST_NAMED: %t hipMallocChild_Concurrency_MultiGpu --tests 5 - * TEST_NAMED: %t hipMalloc_MultiThreaded_MultiGpu --tests 6 - * HIT_END - */ - -#include -#include -#include - -#include -#include -#include -#include - -#include "test_common.h" - -/* Max alloc/free iterations for bigger chunks */ -#define MAX_ALLOCFREE_BC (10000) - -/* Buffer size for alloc/free cycles */ -#define BUFF_SIZE_AF (5*1024*1024) - -/* Max alloc/free iterations for smaller chunks */ -#define MAX_ALLOCFREE_SC (5000000) - -/* Max alloc and pool iterations (TBD) */ -#define MAX_ALLOCPOOL_ITER (2000000) - -/** - * Validates data consitency on supplied gpu - */ -bool validateMemoryOnGPU(int gpu) { - size_t Nbytes = N * sizeof(int); - int *A_d, *B_d, *C_d; - int *A_h, *B_h, *C_h; - size_t prevAvl, prevTot, curAvl, curTot; - bool TestPassed = true; - - HIPCHECK(hipSetDevice(gpu)); - HIPCHECK(hipMemGetInfo(&prevAvl, &prevTot)); - HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); - - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); - - HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); - HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - - hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), - 0, 0, static_cast(A_d), - static_cast(B_d), C_d, N); - - HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); - - if (!HipTest::checkVectorADD(A_h, B_h, C_h, N)) { - printf("Validation PASSED for gpu %d from pid %d\n", gpu, getpid()); - } else { - printf("%s : Validation FAILED for gpu %d from pid %d\n", - __func__, gpu, getpid()); - TestPassed &= false; - } - - HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); - HIPCHECK(hipMemGetInfo(&curAvl, &curTot)); - - if ((prevAvl != curAvl) || (prevTot != curTot)) { - printf("%s : Memory allocation mismatch observed." - "Possible memory leak.", __func__); - TestPassed &= false; - } - - return TestPassed; -} - -/** - * Fetches Gpu device count - */ -void getDeviceCount(int *pdevCnt) { -#ifdef __linux__ - int fd[2], val = 0; - pid_t childpid; - - // create pipe descriptors - pipe(fd); - - // disable visible_devices env from shell - unsetenv("ROCR_VISIBLE_DEVICES"); - unsetenv("HIP_VISIBLE_DEVICES"); - - childpid = fork(); - - if (childpid > 0) { // Parent - close(fd[1]); - // parent will wait to read the device cnt - read(fd[0], &val, sizeof(val)); - - // close the read-descriptor - close(fd[0]); - - // wait for child exit - wait(NULL); - - *pdevCnt = val; - } else if (!childpid) { // Child - int devCnt = 1; - // writing only, no need for read-descriptor - close(fd[0]); - - HIPCHECK(hipGetDeviceCount(&devCnt)); - // send the value on the write-descriptor: - write(fd[1], &devCnt, sizeof(devCnt)); - - // close the write descriptor: - close(fd[1]); - exit(0); - } else { // failure - *pdevCnt = 1; - return; - } - -#else - HIPCHECK(hipGetDeviceCount(pdevCnt)); -#endif -} - -/** - * Regress memory allocation and free in loop - */ -bool regressAllocInLoop(int gpu) { - bool TestPassed = true; - size_t tot, avail, ptot, pavail; - int i = 0; - int *ptr; - - HIPCHECK(hipSetDevice(gpu)); - - // Exercise allocation in loop with bigger chunks - for (i = 0; i < MAX_ALLOCFREE_BC; i++) { - size_t numBytes = BUFF_SIZE_AF; - - HIPCHECK(hipMemGetInfo(&pavail, &ptot)); - HIPCHECK(hipMalloc(&ptr, numBytes)); - HIPCHECK(hipMemGetInfo(&avail, &tot)); - - if (pavail-avail != numBytes) { - printf("LoopAllocation : Memory allocation of %6.2fMB" - "not matching with hipMemGetInfo - FAIL\n", - numBytes/(1024.0*1024.0)); - TestPassed &= false; - HIPCHECK(hipFree(ptr)); - break; - } - - HIPCHECK(hipFree(ptr)); - } - - // Exercise allocation in loop with smaller chunks and max iters - HIPCHECK(hipMemGetInfo(&pavail, &ptot)); - - for (i = 0; i < MAX_ALLOCFREE_SC; i++) { - size_t numBytes = 16; - - HIPCHECK(hipMalloc(&ptr, numBytes)); - - HIPCHECK(hipFree(ptr)); - } - - HIPCHECK(hipMemGetInfo(&avail, &tot)); - - if ((pavail != avail) || (ptot != tot)) { - printf("LoopAllocation : Memory allocation mismatch observed." - "Possible memory leak."); - TestPassed &= false; - } - - return TestPassed; -} - -/* - * Thread func to regress alloc and check data consistency - */ - -std::atomic g_thTestPassed(true); - -void threadFunc(int gpu) { - g_thTestPassed = g_thTestPassed & regressAllocInLoop(gpu); - g_thTestPassed = g_thTestPassed & validateMemoryOnGPU(gpu); - - printf("thread execution status on gpu(%d) : %d\n", gpu, g_thTestPassed.load()); -} - -int main(int argc, char* argv[]) { - HipTest::parseStandardArguments(argc, argv, true); - - if (p_tests == 1) { // Arg validation - // Test hipMalloc for zero size - bool TestPassed = true; - int *ptr; - - HIPCHECK(hipMalloc(&ptr, 0)); - - // ptr expected to be reset to null ptr - if (ptr) { - printf("ArgValidation : Failed in zero size test\n"); - TestPassed &= false; - } - - // Free null ptr - HIPCHECK(hipFree(ptr)); - - // Test hipMalloc for invalid arguments - hipError_t ret; - - if ((ret = hipMalloc(NULL, 100)) != hipErrorInvalidValue) { - printf("ArgValidation : Inappropritate error value returned" - " for invalid argument. Error: '%s'(%d)\n", - hipGetErrorString(ret), ret); - TestPassed &= false; - } - - // Test hipMalloc for Maximum value of size_t - if ((ret = hipMalloc(&ptr, std::numeric_limits::max())) - != hipErrorMemoryAllocation) { - printf("ArgValidation : Invalid error returned for max size_t." - " Error: '%s'(%d)\n", hipGetErrorString(ret), ret); - TestPassed &= false; - } - - if (TestPassed) { - passed(); - } else { - failed("hipMalloc ArgumentValidation Failure!"); - } - - } else if (p_tests == 2) { // Loop Regression Alloc/Free Cycle - bool TestPassed = true; - - TestPassed &= regressAllocInLoop(0); - TestPassed &= validateMemoryOnGPU(0); - - if (TestPassed) { - passed(); - } else { - failed("hipMalloc_LoopRegression_AllocFreeCycle Failure!"); - } - - } else if (p_tests == 3) { // Loop Regression Alloc and Pool - size_t avail, tot, pavail, ptot; - bool TestPassed = true; - hipError_t err; - int *ptr; - - std::vector ptrlist; - - HIPCHECK(hipMemGetInfo(&pavail, &ptot)); - - // Allocate small chunks of memory million times - for (int i = 0; i < MAX_ALLOCPOOL_ITER; i++) { // Iterations TBD - if ((err = hipMalloc(&ptr, 10)) != hipSuccess) { - HIPCHECK(hipMemGetInfo(&avail, &tot)); - - printf("Loop regression pool allocation failure. " - "Total gpu memory : %6.2fMB, Free memory %6.2fMB iter %d error '%s'\n", - tot/(1024.0*1024.0), avail/(1024.0*1024.0), i, hipGetErrorString(err)); - - TestPassed &= false; - break; - } - - // Store pointers allocated to emulate memory pool of app - ptrlist.push_back(ptr); - } - - // Free ptrs at later point of time - for ( auto &t : ptrlist ) { - HIPCHECK(hipFree(t)); - } - - HIPCHECK(hipMemGetInfo(&avail, &tot)); - - TestPassed &= validateMemoryOnGPU(0); - - if ((pavail != avail) || (ptot != tot)) { - printf("%s : Memory allocation mismatch observed. Possible memory leak.", - __func__); - TestPassed &= false; - } - - if (TestPassed) { - passed(); - } else { - failed("hipMalloc_LoopRegression_AllocPool failure!"); - } - - } else if (p_tests == 4) { - bool TestPassed = true; - -#ifdef __linux__ - // Parallel execution of parent and child on gpu0 - int pid; - - if ((pid = fork()) < 0) { - printf("Child_Concurrency_Gpu0 : fork() returned error %d.", pid); - TestPassed &= false; - - } else if (!pid) { // Child process - bool TestPassedChild = true; - - TestPassedChild = validateMemoryOnGPU(0); - - if (TestPassedChild) { - exit(0); // child exit with success status - } else { - printf("Child_Concurrency_Gpu0 : childpid %d failed\n", getpid()); - exit(1); // child exit with failure status - } - - } else { // Parent process - int exitStatus; - TestPassed = validateMemoryOnGPU(0); - - pid = wait(&exitStatus); - if ( WEXITSTATUS(exitStatus) || ( pid < 0 ) ) - TestPassed &= false; - } -#else - printf("Test hipMallocChild_Concurrency_DefaultGpu skipped on non-linux\n"); -#endif - - // TC scenarios specific to linux - // are treated as pass in windows. - if (TestPassed) { - passed(); - } else { - failed("hipMallocChild_Concurrency_DefaultGpu Failed!"); - } - - } else if (p_tests == 5) { - bool TestPassed = true; -#ifdef __linux__ - // Parallel execution on multiple gpus from different child processes - int devCnt = 1, pid = 0, cumStatus = 0; - - // Get GPU count - getDeviceCount(&devCnt); - - // Spawn child for each GPU - for (int gpu = 0; gpu < devCnt; gpu++) { - if ((pid = fork()) < 0) { - printf("Child_Concurrency_MultiGpu : fork() returned error %d\n", pid); - failed("Test Failed!"); - - } else if (!pid) { // Child process - bool TestPassedChild = true; - TestPassedChild = validateMemoryOnGPU(gpu); - - if (TestPassedChild) { - exit(0); // child exit with success status - } else { - printf("Child_Concurrency_MultiGpu : childpid %d failed\n", - getpid()); - exit(1); // child exit with failure status - } - } - } - - // Parent shall wait for child to complete - for (int i = 0; i < devCnt; i++) { - int pidwait = 0, exitStatus; - pidwait = wait(&exitStatus); - - if (pidwait < 0) { - TestPassed &= false; - break; - } - - cumStatus |= WEXITSTATUS(exitStatus); - } - - // Cummulative status of all child - if (cumStatus) { - TestPassed &= false; - } - -#else - printf("Test hipMallocChild_Concurrency_MultiGpu skipped on non-linux\n"); -#endif - - - // TC scenarios specific to linux - // are treated as pass in windows. - if (TestPassed) { - passed(); - } else { - failed("hipMallocChild_Concurrency_MultiGpu Failed!"); - } - - } else if (p_tests == 6) { // Multithreaded multiple gpu execution - std::vector threadlist; - int devCnt = 1; - - // Get GPU count - getDeviceCount(&devCnt); - - - for (int i = 0; i < devCnt; i++) { - threadlist.push_back(std::thread(threadFunc, i)); - } - - for (auto &t : threadlist) { - t.join(); - } - - if (g_thTestPassed) { - passed(); - } else { - failed("hipMalloc_MultiThreaded_MultiGpu Failed!"); - } - } else { - failed("Didnt receive any valid option. Try options 1 to 6\n"); - } -} - diff --git a/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp b/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp deleted file mode 100644 index 25820e2305..0000000000 --- a/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp +++ /dev/null @@ -1,423 +0,0 @@ -/* -Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* Test 6 is disabled */ -/* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 - * TEST_NAMED: %t hipMallocManaged1 --tests 1 - * TEST_NAMED: %t hipMallocManaged2 --tests 2 - * TEST_NAMED: %t hipMallocManagedNegativeTests --tests 3 - * TEST_NAMED: %t hipMallocManagedMultiChunkSingleDevice --tests 4 - * TEST_NAMED: %t hipMallocManagedMultiChunkMultiDevice --tests 5 EXCLUDE_HIP_PLATFORM nvcc - * TEST_NAMED: %t hipMallocManagedOversubscription --tests 6 EXCLUDE_HIP_PLATFORM rocclr nvcc - * HIT_END - */ - -#include -#include "test_common.h" -#define N 1048576 // equals to (1024*1024) -#define INIT_VAL 123 - -/* - * Kernel function to perform addition operation. - */ -template -__global__ void -vector_sum(T *Ad1, T *Ad2, size_t NUM_ELMTS) { - size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); - size_t stride = blockDim.x * gridDim.x; - - for (size_t i = offset; i < NUM_ELMTS; i += stride) { - Ad2[i] = Ad1[i] + Ad1[i]; - } -} - -// The following Test case tests the following scenario: -// A large chunk of hipMallocManaged() memory(Hmm) is created -// Equal parts of Hmm is accessed on available gpus and -// kernel is launched on acessed chunk of hmm memory -// and checks if there are any inconsistencies or access issues -bool MultiChunkMultiDevice(int NumDevices) { - std::atomic DataMismatch{0}; - bool IfTestPassed = true; - int Counter = 0; - unsigned int NUM_ELMS = (1024 * 1024); - float *Ad[NumDevices], *Hmm = NULL, *Ah = new float[NUM_ELMS]; - hipStream_t stream[NumDevices]; - for (int Oloop = 0; Oloop < NumDevices; ++Oloop) { - HIPCHECK(hipSetDevice(Oloop)); - HIPCHECK(hipMalloc(&Ad[Oloop], NUM_ELMS * sizeof(float))); - HIPCHECK(hipMemset(Ad[Oloop], 0, NUM_ELMS * sizeof(float))); - HIPCHECK(hipStreamCreate(&stream[Oloop])); - } - HIPCHECK(hipMallocManaged(&Hmm, (NumDevices * NUM_ELMS * sizeof(float)))); - for (int i = 0; i < NumDevices; ++i) { - for (; Counter < ((i + 1) * NUM_ELMS); ++Counter) { - Hmm[Counter] = INIT_VAL + i; - } - } - const unsigned threadsPerBlock = 256; - const unsigned blocks = (NUM_ELMS + 255)/256; - for (int Klaunch = 0; Klaunch < NumDevices; ++Klaunch) { - vector_sum <<>> - (&Hmm[Klaunch * NUM_ELMS], Ad[Klaunch], NUM_ELMS); - } - HIPCHECK(hipDeviceSynchronize()); - for (int m = 0; m < NumDevices; ++m) { - HIPCHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float), - hipMemcpyDeviceToHost)); - for (int n = 0; n < NUM_ELMS; ++n) { - if (Ah[n] != ((INIT_VAL + m) * 2)) { - DataMismatch++; - } - } - memset(reinterpret_cast(Ah), 0, NUM_ELMS * sizeof(float)); - } - if (DataMismatch.load() != 0) { - printf("MultiChunkMultiDevice: Mismatch observed!\n"); - IfTestPassed = false; - } - for (int i = 0; i < NumDevices; ++i) { - HIPCHECK(hipFree(Ad[i])); - HIPCHECK(hipStreamDestroy(stream[i])); - } - HIPCHECK(hipFree(Hmm)); - free(Ah); - return IfTestPassed; -} - -// The following Test case tests the following scenario: -// A large chunk of hipMallocManaged() memory(Hmm) is created -// Equal parts of Hmm is accessed and -// kernel is launched on acessed chunk of hmm memory -// and checks if there are any inconsistencies or access issues - -bool MultiChunkSingleDevice(int NumDevices) { - std::atomic DataMismatch{0}; - int Chunks = 4, Counter = 0; - bool IfTestPassed = true; - unsigned int NUM_ELMS = (1024 * 1024); - float *Ad[Chunks], *Hmm = NULL, *Ah = new float[NUM_ELMS]; - hipStream_t stream[Chunks]; - for (int i = 0; i < Chunks; ++i) { - HIPCHECK(hipMalloc(&Ad[i], NUM_ELMS * sizeof(float))); - HIPCHECK(hipMemset(Ad[i], 0, NUM_ELMS * sizeof(float))); - HIPCHECK(hipStreamCreate(&stream[i])); - } - HIPCHECK(hipMallocManaged(&Hmm, (Chunks * NUM_ELMS * sizeof(float)))); - for (int i = 0; i < Chunks; ++i) { - for (; Counter < ((i + 1) * NUM_ELMS); ++Counter) { - Hmm[Counter] = (INIT_VAL + i); - } - } - const unsigned threadsPerBlock = 256; - const unsigned blocks = (NUM_ELMS + 255)/256; - for (int k = 0; k < Chunks; ++k) { - vector_sum <<>> - (&Hmm[k * NUM_ELMS], Ad[k], NUM_ELMS); - } - HIPCHECK(hipDeviceSynchronize()); - for (int m = 0; m < Chunks; ++m) { - HIPCHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float), - hipMemcpyDeviceToHost)); - for (int n = 0; n < NUM_ELMS; ++n) { - if (Ah[n] != ((INIT_VAL + m) * 2)) { - DataMismatch++; - } - } - } - if (DataMismatch.load() != 0) { - printf("MultiChunkSingleDevice: Mismatch observed!\n"); - IfTestPassed = false; - } - for (int i = 0; i < Chunks; ++i) { - HIPCHECK(hipFree(Ad[i])); - HIPCHECK(hipStreamDestroy(stream[i])); - } - HIPCHECK(hipFree(Hmm)); - free(Ah); - return IfTestPassed; -} - -// The following tests oversubscription hipMallocManaged() api -// Currently disabled. -bool TestOversubscriptionMallocManaged(int NumDevices) { - bool IfTestPassed = true; - hipError_t err; - void *A = NULL; - size_t total = 0, free = 0; - HIPCHECK(hipMemGetInfo(&free, &total)); - // ToDo: In case of HMM, memory over-subscription is allowed. Hence, relook - // into how out of memory can be tested. - // Demanding more mem size than available - err = hipMallocManaged(&A, (free +1), hipMemAttachGlobal); - if (hipErrorOutOfMemory != err) { - printf("hipMallocManaged: Returned %s for size value > device memory\n", - hipGetErrorString(err)); - IfTestPassed = false; - } - - return IfTestPassed; -} - -// The following test does negative testing of hipMallocManaged() api -// by passing invalid values and check if the behavior is as expected -bool NegativeTestsMallocManaged(int NumDevices) { - bool IfTestPassed = true; - hipError_t err; - void *A = NULL; - size_t total = 0, free = 0; - HIPCHECK(hipMemGetInfo(&free, &total)); - - err = hipMallocManaged(NULL, 1024, hipMemAttachGlobal); - if (hipErrorInvalidValue != err) { - printf("hipMallocManaged: Returned %s when devPtr is null\n", - hipGetErrorString(err)); - IfTestPassed = false; - } - - err = hipMallocManaged(&A, 0, hipMemAttachGlobal); - if (hipErrorInvalidValue != err) { - printf("hipMallocManaged: Returned %s when size is 0\n", - hipGetErrorString(err)); - IfTestPassed = false; - } - - err = hipMallocManaged(NULL, 0, hipMemAttachGlobal); - if (hipErrorInvalidValue != err) { - printf("hipMallocManaged: Returned %s when devPtr & size is null & 0\n", - hipGetErrorString(err)); - IfTestPassed = false; - } - -#ifdef __HIP_PLATFORM_HCC__ - // The flag hipMemAttachHost is currently not supported therefore - // api should return "hipErrorInvalidValue" for now - err = hipMallocManaged(&A, 1024, hipMemAttachHost); - if (hipErrorInvalidValue != err) { - printf("hipMallocManaged: Returned %s for 'hipMemAttachHost' flag\n", - hipGetErrorString(err)); - IfTestPassed = false; - } -#endif // __HIP_PLATFORM_HCC__ - - err = hipMallocManaged(NULL, 0, 0); - if (hipErrorInvalidValue != err) { - printf("hipMallocManaged: Returned %s when params are null, 0, 0\n", - hipGetErrorString(err)); - IfTestPassed = false; - } - - err = hipMallocManaged(&A, 1024, 145); - if (hipErrorInvalidValue != err) { - printf("hipMallocManaged: Returned %s when flag param is numerical 145\n", - hipGetErrorString(err)); - IfTestPassed = false; - } - - err = hipMallocManaged(&A, -10, hipMemAttachGlobal); - if (hipErrorOutOfMemory != err) { - printf("hipMallocManaged: Returned %s for negative size value.\n", - hipGetErrorString(err)); - IfTestPassed = false; - } - - return IfTestPassed; -} - - -// Allocate two pointers using hipMallocManaged(), initialize, -// then launch kernel using these pointers directly and -// later validate the content without using any Memcpy. -template -bool TestMallocManaged2(int NumDevices) { - bool IfTestPassed = true; - T *Hmm1 = NULL, *Hmm2 = NULL; - - for (int i = 0; i < NumDevices; ++i) { - HIPCHECK(hipSetDevice(i)); - std::atomic DataMismatch{0}; - HIPCHECK(hipMallocManaged(&Hmm1, N * sizeof(T))); - HIPCHECK(hipMallocManaged(&Hmm2, N * sizeof(T))); - for (int m = 0; m < N; ++m) { - Hmm1[m] = m; - Hmm2[m] = 0; - } - const unsigned threadsPerBlock = 256; - const unsigned blocks = (N + 255)/256; - // Kernel launch - vector_sum <<>> (Hmm1, Hmm2, N); - HIPCHECK(hipDeviceSynchronize()); - for (int v = 0; v < N; ++v) { - if (Hmm2[v] != (v + v)) { - DataMismatch++; - } - } - if (DataMismatch.load() != 0) { - IfTestPassed = false; - } - HIPCHECK(hipFree(Hmm1)); - HIPCHECK(hipFree(Hmm2)); - } - return IfTestPassed; -} - -// In the following test, a memory is created using hipMallocManaged() by -// setting a device and verified if it is accessible when the context is set -// to all other devices. This include verification and Device two Device -// transfers and kernel launch o discover if there any access issues. - -template -bool TestMallocManaged1(int NumDevices) { - std::atomic DataMismatch; - bool TestPassed = true; - T *Ah1 = new T[N], *Ah2 = new T[N], *Ad = NULL, *Hmm = NULL; - - for (int i =0; i < N; ++i) { - Ah1[i] = INIT_VAL; - Ah2[i] = 0; - } - for (int Oloop = 0; Oloop < NumDevices; ++Oloop) { - DataMismatch = 0; - HIPCHECK(hipSetDevice(Oloop)); - HIPCHECK(hipMallocManaged(&Hmm, N * sizeof(T))); - for (int Iloop = 0; Iloop < NumDevices; ++Iloop) { - HIPCHECK(hipSetDevice(Iloop)); - HIPCHECK(hipMalloc(&Ad, N * sizeof(T))); - // Copy data from host to hipMallocMananged memory and verify - HIPCHECK(hipMemcpy(Hmm, Ah1, N * sizeof(T), hipMemcpyHostToDevice)); - for (int v = 0; v < N; ++v) { - if (Hmm[v] != INIT_VAL) { - DataMismatch++; - } - } - if (DataMismatch.load() != 0) { - printf("Mismatch is observed with host data at device %d", Iloop); - printf(" while hipMallocManaged memory set to the device %d\n", Oloop); - TestPassed = false; - DataMismatch = 0; - } - // Executing D2D transfer with hipMallocManaged memory and verify - HIPCHECK(hipMemcpy(Ad, Hmm, N * sizeof(T), hipMemcpyDeviceToDevice)); - HIPCHECK(hipMemcpy(Ah2, Ad, N * sizeof(T), hipMemcpyDeviceToHost)); - for (int k = 0; k < N; ++k) { - if (Ah2[k] != INIT_VAL) { - DataMismatch++; - } - } - if (DataMismatch.load() != 0) { - printf("Mismatch is observed with D2D transfer at device %d\n", Iloop); - printf(" while hipMallocManaged memory set to the device %d\n", Oloop); - TestPassed = false; - DataMismatch = 0; - } - HIPCHECK(hipMemset(Ad, 0, N * sizeof(T))); - const unsigned threadsPerBlock = 256; - const unsigned blocks = (N + 255)/256; - // Launching the kernel to check if there is any access issue with - // hipMallocManaged memory and local device's memory - vector_sum <<>> (Hmm, Ad, N); - hipDeviceSynchronize(); - HIPCHECK(hipMemcpy(Ah2, Ad, N * sizeof(T), hipMemcpyDeviceToHost)); - for (int m = 0; m < N; ++m) { - if (Ah2[m] != 246) { - DataMismatch++; - } - } - if (DataMismatch.load() != 0) { - printf("Data Mismatch observed after kernel lch device %d\n", Iloop); - TestPassed = false; - DataMismatch = 0; - } - HIPCHECK(hipFree(Ad)); - } - HIPCHECK(hipFree(Hmm)); - } - free(Ah1); - free(Ah2); - return TestPassed; -} - -int main(int argc, char* argv[]) { - HipTest::parseStandardArguments(argc, argv, true); - - if ((p_tests <= 0) || (p_tests > 5)) { - failed("Valid arguments are from 1 to 5"); - } - - int NumDevices = 0; - HIPCHECK(hipGetDeviceCount(&NumDevices)); - bool TestStatus = true, OverAllStatus = true; - if (p_tests == 1) { - TestStatus = TestMallocManaged1(NumDevices); - if (!TestStatus) { - printf("Test Failed with float datatype.\n"); - OverAllStatus = false; - } - TestStatus = TestMallocManaged1(NumDevices); - if (!TestStatus) { - printf("Test Failed with int datatype.\n"); - OverAllStatus = false; - } - TestStatus = TestMallocManaged1(NumDevices); - if (!TestStatus) { - printf("Test Failed with unsigned char datatype.\n"); - OverAllStatus = false; - } - TestStatus = TestMallocManaged1(NumDevices); - if (!TestStatus) { - printf("Test Failed with double datatype.\n"); - OverAllStatus = false; - } - if (!OverAllStatus) { - failed(""); - } - } - if (p_tests == 2) { - TestStatus = TestMallocManaged2(NumDevices); - if (!TestStatus) { - failed("Test Failed with float datatype."); - } - } - if (p_tests == 3) { - TestStatus = NegativeTestsMallocManaged(NumDevices); - if (!TestStatus) { - failed("Negative Tests with hipMallocManaged() failed!."); - } - } - if (p_tests == 4) { - TestStatus = MultiChunkSingleDevice(NumDevices); - if (!TestStatus) { - failed("hipMallocManaged: MultiChunkSingleDevice test failed!"); - } - } - if (p_tests == 5) { - TestStatus = MultiChunkMultiDevice(NumDevices); - if (!TestStatus) { - failed("hipMallocManaged: MultiChunkMultiDevice test failed!"); - } - } - if (p_tests == 6) { - TestStatus = TestOversubscriptionMallocManaged(NumDevices); - if (!TestStatus) { - failed("hipMallocManaged: TestOversubscriptionMallocManaged failed!"); - } - } - passed(); -} diff --git a/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp b/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp index b3dc32810b..7e65c47244 100755 --- a/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp @@ -75,9 +75,6 @@ int main() { HIPCHECK(hipFree(Z_d)); } else { std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"< -#include "test_common.h" - -int main() { - hipSharedMemConfig_t config; - HIP_PRINT_STATUS(hipFuncSetSharedMemConfig(NULL)); - HIP_PRINT_STATUS(hipFuncSetSharedMemConfig(&config)); -} diff --git a/tests/src/runtimeApi/cooperativeGrps/hipLaunchCoopMultiKernel.cpp b/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp similarity index 98% rename from tests/src/runtimeApi/cooperativeGrps/hipLaunchCoopMultiKernel.cpp rename to tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp index 0e523f9d2e..8e67044eb0 100644 --- a/tests/src/runtimeApi/cooperativeGrps/hipLaunchCoopMultiKernel.cpp +++ b/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp @@ -20,7 +20,7 @@ THE SOFTWARE. // Simple test for hipLaunchCooperativeKernelMultiDevice API. /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 + * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/cooperativeGrps/hipLaunchCooperativeKernel.cpp b/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp similarity index 94% rename from tests/src/runtimeApi/cooperativeGrps/hipLaunchCooperativeKernel.cpp rename to tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp index 6b1ba1c27a..e0fcd4108b 100644 --- a/tests/src/runtimeApi/cooperativeGrps/hipLaunchCooperativeKernel.cpp +++ b/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp @@ -22,14 +22,15 @@ THE SOFTWARE. // Simple test for hipLaunchCooperativeKernel API. /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 + * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" -#include "hip/hip_cooperative_groups.h" +#include "hip/hcc_detail/device_library_decls.h" +#include "hip/hcc_detail/hip_cooperative_groups.h" #include #include #include "test_common.h" @@ -128,7 +129,7 @@ int main() { params[3] = (void*)&dC; std::cout << "Testing with grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n"; - HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_gws), dimGrid, dimBlock, params, dimBlock.x * sizeof(long), stream)); + HIPCHECK(hipLaunchCooperativeKernel(test_gws, dimGrid, dimBlock, params, dimBlock.x * sizeof(long), stream)); HIPCHECK(hipMemcpy(init, dC, sizeof(long), hipMemcpyDeviceToHost)); diff --git a/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp b/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp index 6f649708b7..cc976ced42 100644 --- a/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp +++ b/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp index dbf58209fa..840e9b6975 100644 --- a/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp +++ b/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM rocclr nvcc + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM rocclr * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp b/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp index 3ee2f4a050..07acc4a591 100644 --- a/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp +++ b/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/stream/hipStreamACb_MStrm_Mgpu.cpp b/tests/src/runtimeApi/stream/hipStreamACb_MStrm_Mgpu.cpp index 0d65a0f50b..3a25d3331c 100644 --- a/tests/src/runtimeApi/stream/hipStreamACb_MStrm_Mgpu.cpp +++ b/tests/src/runtimeApi/stream/hipStreamACb_MStrm_Mgpu.cpp @@ -21,7 +21,7 @@ // kernel. Verify that all the kernels queued are executed before the callback. /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp b/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp index a98fbb87c7..a182c85010 100644 --- a/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp +++ b/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp @@ -21,7 +21,7 @@ // when hipStreamAddCallback() is called back to back multiple calls /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp b/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp index fb93268176..d21ea5da54 100644 --- a/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp +++ b/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp @@ -22,7 +22,7 @@ // by hipStreamAddCallback() api. /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/stream/hipStreamACb_ThrdBehaviour.cpp b/tests/src/runtimeApi/stream/hipStreamACb_ThrdBehaviour.cpp index 5e9b75adee..2eef534ea4 100644 --- a/tests/src/runtimeApi/stream/hipStreamACb_ThrdBehaviour.cpp +++ b/tests/src/runtimeApi/stream/hipStreamACb_ThrdBehaviour.cpp @@ -22,7 +22,7 @@ // finish. Ideally Host thread should not wait for callback to finish. /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/stream/hipStreamACb_order.cpp b/tests/src/runtimeApi/stream/hipStreamACb_order.cpp index f7d8a866f2..7b66441fa6 100644 --- a/tests/src/runtimeApi/stream/hipStreamACb_order.cpp +++ b/tests/src/runtimeApi/stream/hipStreamACb_order.cpp @@ -18,7 +18,7 @@ * */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp b/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp index 49991eec20..8da2c2f8a5 100644 --- a/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp +++ b/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp @@ -19,7 +19,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 * TEST: %t * HIT_END */ diff --git a/tests/src/test_common.h b/tests/src/test_common.h old mode 100755 new mode 100644 index 21a4c45ac8..8897dc938e --- a/tests/src/test_common.h +++ b/tests/src/test_common.h @@ -41,6 +41,7 @@ THE SOFTWARE. #define HC __attribute__((hc)) + #define KNRM "\x1B[0m" #define KRED "\x1B[31m" #define KGRN "\x1B[32m" @@ -50,19 +51,6 @@ THE SOFTWARE. #define KCYN "\x1B[36m" #define KWHT "\x1B[37m" - // HIP Skip Return code set at cmake -#define HIP_SKIP_RETURN_CODE 127 -#define HIP_ENABLE_SKIP_TESTS 0 - -inline bool hip_skip_tests_enabled() { - return HIP_ENABLE_SKIP_TESTS; -} - -inline int hip_skip_retcode() { - // HIP Skip Return code set at cmake - return HIP_SKIP_RETURN_CODE; -} - #define passed() \ printf("%sPASSED!%s\n", KGRN, KNRM); \ exit(0); diff --git a/tests/unit/test_common.h b/tests/unit/test_common.h old mode 100755 new mode 100644 index ae6f1cba04..4b55c70164 --- a/tests/unit/test_common.h +++ b/tests/unit/test_common.h @@ -41,6 +41,7 @@ THE SOFTWARE. #define HC __attribute__((hc)) + #define KNRM "\x1B[0m" #define KRED "\x1B[31m" #define KGRN "\x1B[32m"