Revert "Revert "Merge branch 'amd-master-next' into amd-npi-next""
This reverts commit 28b17d3dbd. Reason for revert: <INSERT REASONING HERE> Change-Id: I92ceb171e31026ed1864704cef2fc1497b883ef9 [ROCm/hip commit: ad2d55c144]
Этот коммит содержится в:
@@ -8,10 +8,15 @@ set(BUILD_SHARED_LIBS ON CACHE BOOL "Build shared library (.so) or static lib (
|
||||
|
||||
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
|
||||
|
||||
if(NOT ${BUILD_SHARED_LIBS} AND NOT DEFINED ENABLE_HIP_PCH)
|
||||
set(ENABLE_HIP_PCH ON CACHE BOOL "enable/disable pre-compiled hip headers")
|
||||
if(NOT DEFINED __HIP_ENABLE_PCH)
|
||||
set(__HIP_ENABLE_PCH ON CACHE BOOL "enable/disable pre-compiled hip headers")
|
||||
endif()
|
||||
|
||||
if(${__HIP_ENABLE_PCH})
|
||||
set(_pchStatus 1)
|
||||
else()
|
||||
set(_pchStatus 0)
|
||||
endif()
|
||||
#############################
|
||||
# Options
|
||||
#############################
|
||||
@@ -80,8 +85,8 @@ if(GIT_FOUND)
|
||||
|
||||
set(HIP_VERSION_PATCH ${HIP_VERSION_GITDATE}-${HIP_VERSION_GITHASH})
|
||||
|
||||
if(DEFINED ENV{ROCM_BUILD_ID})
|
||||
set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}-$ENV{ROCM_BUILD_ID}-${HIP_VERSION_GITHASH})
|
||||
if(DEFINED ENV{ROCM_LIBPATCH_VERSION})
|
||||
set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}.$ENV{ROCM_LIBPATCH_VERSION})
|
||||
else()
|
||||
set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}-${HIP_VERSION_GITHASH})
|
||||
endif()
|
||||
@@ -90,6 +95,36 @@ else()
|
||||
set(HIP_PACKAGING_VERSION_PATCH "0")
|
||||
endif()
|
||||
|
||||
## Debian package specific variables
|
||||
if ( DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE} )
|
||||
set ( CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE} )
|
||||
else()
|
||||
set ( CPACK_DEBIAN_PACKAGE_RELEASE "local" )
|
||||
endif()
|
||||
message ( "Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}" )
|
||||
|
||||
## RPM package specific variables
|
||||
if ( DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE} )
|
||||
set ( CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE} )
|
||||
else()
|
||||
set ( CPACK_RPM_PACKAGE_RELEASE "local" )
|
||||
endif()
|
||||
|
||||
## 'dist' breaks manual builds on debian systems due to empty Provides
|
||||
execute_process( COMMAND rpm --eval %{?dist}
|
||||
RESULT_VARIABLE PROC_RESULT
|
||||
OUTPUT_VARIABLE EVAL_RESULT
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE )
|
||||
|
||||
if ( PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "" )
|
||||
string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" )
|
||||
endif()
|
||||
message("CPACK_RPM_PACKAGE_RELEASE: ${CPACK_RPM_PACKAGE_RELEASE}")
|
||||
|
||||
add_to_config(_versionInfo HIP_PACKAGING_VERSION_PATCH)
|
||||
add_to_config(_versionInfo CPACK_DEBIAN_PACKAGE_RELEASE)
|
||||
add_to_config(_versionInfo CPACK_RPM_PACKAGE_RELEASE)
|
||||
|
||||
add_to_config(_versionInfo HIP_VERSION_MAJOR)
|
||||
add_to_config(_versionInfo HIP_VERSION_MINOR)
|
||||
add_to_config(_versionInfo HIP_VERSION_PATCH)
|
||||
@@ -102,7 +137,6 @@ else ()
|
||||
set (HIP_LIB_VERSION_PATCH ${HIP_VERSION_PATCH})
|
||||
endif ()
|
||||
set (HIP_LIB_VERSION_STRING "${HIP_LIB_VERSION_MAJOR}.${HIP_LIB_VERSION_MINOR}.${HIP_LIB_VERSION_PATCH}")
|
||||
|
||||
if (DEFINED ENV{ROCM_RPATH})
|
||||
set (CMAKE_INSTALL_RPATH "$ENV{ROCM_RPATH}")
|
||||
set (CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
|
||||
@@ -456,6 +490,7 @@ set(_versionInfoHeader
|
||||
#define HIP_VERSION_MINOR ${HIP_VERSION_MINOR}
|
||||
#define HIP_VERSION_PATCH ${HIP_VERSION_GITDATE}
|
||||
#define HIP_VERSION (HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR)\n
|
||||
#define __HIP_HAS_GET_PCH ${_pchStatus}\n
|
||||
#endif\n
|
||||
")
|
||||
file(WRITE "${PROJECT_BINARY_DIR}/include/hip/hip_version.h" ${_versionInfoHeader})
|
||||
@@ -669,8 +704,11 @@ endif()
|
||||
# Testing steps
|
||||
#############################
|
||||
# Target: test
|
||||
set(HIP_ROOT_DIR ${CMAKE_INSTALL_PREFIX})
|
||||
set(HIP_ROOT_DIR ${CMAKE_CURRENT_BINARY_DIR})
|
||||
set(HIP_SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
if(HIP_PLATFORM STREQUAL "nvcc")
|
||||
execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/include" "${CMAKE_CURRENT_BINARY_DIR}/include" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
|
||||
endif()
|
||||
execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/cmake" "${HIP_ROOT_DIR}/cmake" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
|
||||
if(${RUN_HIT} EQUAL 0)
|
||||
execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/bin" "${HIP_ROOT_DIR}/bin" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
|
||||
@@ -713,7 +751,7 @@ endif()
|
||||
#############################
|
||||
# Target: clang
|
||||
if(HIP_HIPCC_EXECUTABLE)
|
||||
add_custom_target(analyze
|
||||
add_custom_target(analyze
|
||||
COMMAND ${HIP_HIPCC_EXECUTABLE} -fvisibility=hidden -fvisibility-inlines-hidden --analyze --analyzer-outputtext -isystem /opt/rocm/include ${HIP_HCC_BUILD_FLAGS} -Wno-unused-command-line-argument -I/opt/rocm/include -c src/*.cpp -Iinclude/ -I./
|
||||
WORKING_DIRECTORY ${HIP_SRC_PATH})
|
||||
if(CPPCHECK_EXE)
|
||||
|
||||
@@ -1,15 +1,15 @@
|
||||
# Contributor Guidelines
|
||||
# Contributor Guidelines
|
||||
|
||||
## Make Tips
|
||||
When building HIP, you will likely want to build and install to a local user-accessible directory (rather than /opt/rocm).
|
||||
This can easily be done by setting the -DCMAKE_INSTALL_PREFIX variable when running cmake. Typical use case is to
|
||||
When building HIP, you will likely want to build and install to a local user-accessible directory (rather than /opt/rocm).
|
||||
This can easily be done by setting the -DCMAKE_INSTALL_PREFIX variable when running cmake. Typical use case is to
|
||||
set CMAKE_INSTALL_PREFIX to your HIP git root, and then ensure HIP_PATH points to this directory. For example
|
||||
|
||||
```
|
||||
cmake .. -DCMAKE_INSTALL_PREFIX=..
|
||||
make install
|
||||
|
||||
export HIP_PATH=
|
||||
export HIP_PATH=
|
||||
```
|
||||
|
||||
After making HIP, don't forget the "make install" step !
|
||||
@@ -21,118 +21,110 @@ After making HIP, don't forget the "make install" step !
|
||||
- Add a translation to the hipify-clang tool ; many examples abound.
|
||||
- For stat tracking purposes, place the API into an appropriate stat category ("dev", "mem", "stream", etc).
|
||||
- Add an inlined NVCC implementation for the function in include/hip/nvcc_detail/hip_runtime_api.h.
|
||||
- These are typically headers
|
||||
- Add an HCC definition and Doxygen comments for the function in include/hcc_detail/hip_runtime_api.h
|
||||
- Source implementation typically go in src/hcc_detail/hip_hcc.cpp. The implementation may involve
|
||||
calls to HCC runtime or HSA runtime, or interact with other pieces of the HIP runtime (ie for
|
||||
hipStream_t).
|
||||
- These are typically headers
|
||||
- Add an HIP_ROCclr definition and Doxygen comments for the function in include/hcc_detail/hip_runtime_api.h
|
||||
- Source implementations typically go in hip/rocclr/hip_*.cpp. The implementation involves calls to the HIP runtime (e.g. for hipStream_t).
|
||||
|
||||
#### Testing HCC version
|
||||
In some cases new HIP features are tied to specified releases of HCC, and it can be useful to determine at compile-time
|
||||
if the current HCC compiler is sufficiently new enough to support the desired feature. The `__hcc_workweek__` compiler
|
||||
define is a monotonically increasing integer value that combines the year + workweek + day-of-week (0-6, Sunday is 0)
|
||||
(ie 15403, 16014, etc).
|
||||
The granularity is one day, so __hcc_workweek__ can only be used to distinguish compiler builds that are at least one day apart.
|
||||
## Check HIP-Clang version
|
||||
In some cases new HIP-Clang features are tied to specific releases, and it can be useful to check that the current version is new enough to support the desired feature.
|
||||
|
||||
HIP runtime version
|
||||
|
||||
```
|
||||
#ifdef __hcc_workweek_ > 16014
|
||||
// use cool new HCC feature here
|
||||
#endif
|
||||
> cat /opt/rocm/hip/bin/.hipVersion
|
||||
# Auto-generated by cmake
|
||||
HIP_VERSION_MAJOR=3
|
||||
HIP_VERSION_MINOR=9
|
||||
HIP_VERSION_PATCH=20345-519ef3f2
|
||||
```
|
||||
|
||||
Additionally, the hcc binary can print the work-week to stdout ("16014" in the version info below):
|
||||
HIP-Clang compiler version
|
||||
|
||||
```
|
||||
> /opt/rocm/hcc/bin/hcc -v
|
||||
HCC clang version 3.5.0 (based on HCC 0.8.16014-81f8a3f-f155163-5a1009a LLVM 3.5.0svn)
|
||||
$ /opt/rocm/llvm/bin/clang -v
|
||||
clang version 11.0.0 (/src/external/llvm-project/clang 075fedd3fd2f4d9d8cca79d0cd51f64c5ef21432)
|
||||
Target: x86_64-unknown-linux-gnu
|
||||
Thread model: posix
|
||||
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8
|
||||
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8.4
|
||||
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.9
|
||||
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.9.1
|
||||
Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8
|
||||
InstalledDir: /opt/rocm/llvm/bin
|
||||
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7
|
||||
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7.5.0
|
||||
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/8
|
||||
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/9
|
||||
Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/9
|
||||
Candidate multilib: .;@m64
|
||||
Candidate multilib: 32;@m32
|
||||
Candidate multilib: x32;@mx32
|
||||
Selected multilib: .;@m64
|
||||
```
|
||||
|
||||
The unix `date` command can print the HCC-format work-week for a specific date , ie:
|
||||
```
|
||||
> date --utc +%y%U%w -d 2015-11-09
|
||||
15451
|
||||
```
|
||||
|
||||
## Unit Testing Environment
|
||||
|
||||
HIP includes unit tests in the tests/src directory.
|
||||
HIP includes unit tests in the tests/src directory.
|
||||
When adding a new HIP feature, add a new unit test as well.
|
||||
See [tests/README.md](README.md) for more information.
|
||||
|
||||
## Development Flow
|
||||
It is recommended that developers set the flag HIP_BUILD_LOCAL=1 so that the unit testing environment automatically rebuilds libhip_hcc.a and the tests when a change is made to the HIP source.
|
||||
Directed tests provide a great place to develop new features alongside the associated test.
|
||||
|
||||
Directed tests provide a great place to develop new features alongside the associated test.
|
||||
|
||||
For applications and benchmarks outside the directed test environment, developers should use a two-step development flow:
|
||||
- #1. Compile, link, and install HCC. See [Installation](README.md#Installation) notes.
|
||||
- #2. Relink the target application to include changes in the libhip_hcc.a file.
|
||||
- #1. Compile, link, and install HIP/ROCclr. See [Installation](README.md#Installation) notes.
|
||||
- #2. Relink the target application to include changes in HIP runtime file.
|
||||
|
||||
## Environment Variables
|
||||
- **HIP_PATH** : Location of HIP include, src, bin, lib directories.
|
||||
- **HCC_HOME** : Path to HCC compiler. Default /opt/rocm/hcc.
|
||||
- **HIP_PATH** : Location of HIP include, src, bin, lib directories.
|
||||
- **HCC_ROCCLR_HOME** : Path to HIP/ROCclr directory, used on AMD platforms. Default /opt/rocm/rocclr.
|
||||
- **HSA_PATH** : Path to HSA include, lib. Default /opt/rocm/hsa.
|
||||
- **CUDA_PATH** : On nvcc systems, this points to the root of the CUDA installation.
|
||||
|
||||
### Contribution guidelines ###
|
||||
## Contribution guidelines ##
|
||||
|
||||
Features (ie functions, classes, types) defined in hip*.h should resemble CUDA APIs.
|
||||
The HIP interface is designed to be very familiar for CUDA programmers.
|
||||
|
||||
Differences or limitations of HIP APIs as compared to CUDA APIs should be clearly documented and described.
|
||||
Differences or limitations of HIP APIs as compared to CUDA APIs should be clearly documented and described.
|
||||
|
||||
## Coding Guidelines (in brief)
|
||||
### Coding Guidelines (in brief)
|
||||
- Code Indentation:
|
||||
- Tabs should be expanded to spaces.
|
||||
- Use 4 spaces indentation.
|
||||
- Capitalization and Naming
|
||||
- Prefer camelCase for HIP interfaces and internal symbols. Note HCC uses _ for separator.
|
||||
- Prefer camelCase for HIP interfaces and internal symbols. Note HCC uses _ for separator.
|
||||
This guideline is not yet consistently followed in HIP code - eventual compliance is aspirational.
|
||||
- Member variables should begin with a leading "_". This allows them to be easily distinguished from other variables or functions.
|
||||
|
||||
|
||||
- {} placement
|
||||
- For functions, the opening { should be placed on a new line.
|
||||
- For if/else blocks, the opening { is placed on the same line as the if/else. Use a space to separate the { from the if/else. Example:
|
||||
```
|
||||
if (foo) {
|
||||
doFoo()
|
||||
} else {
|
||||
doFoo()
|
||||
} else {
|
||||
doFooElse();
|
||||
}
|
||||
```
|
||||
- namespace should be on same line as { and separated by a space.
|
||||
- Single-line if statement should still use {/} pair (even though C++ does not require).
|
||||
- Miscellaneous
|
||||
- All references in function parameter lists should be const.
|
||||
- All references in function parameter lists should be const.
|
||||
- "ihip" = internal hip structures. These should not be exposed through the HIP API.
|
||||
- Keyword TODO refers to a note that should be addressed in long-term. Could be style issue, software architecture, or known bugs.
|
||||
- FIXME refers to a short-term bug that needs to be addressed.
|
||||
|
||||
- HIP_INIT_API() should be placed at the start of each top-level HIP API. This function will make sure the HIP runtime is initialized,
|
||||
and also constructs an appropriate API string for tracing and CodeXL marker tracing. The arguments to HIP_INIT_API should match
|
||||
those of the parent function.
|
||||
- ihipLogStatus should only be called from top-level HIP APIs, and should be called to log and return the error code. The error code
|
||||
those of the parent function.
|
||||
- ihipLogStatus should only be called from top-level HIP APIs, and should be called to log and return the error code. The error code
|
||||
is used by the GetLastError and PeekLastError functions - if a HIP API simply returns, then the error will not be logged correctly.
|
||||
|
||||
- All HIP environment variables should begin with the keyword HIP_
|
||||
Environment variables should be long enough to describe their purpose but short enough so they can be remembered - perhaps 10-20 characters, with 3-4 parts separated by underscores.
|
||||
To see the list of current environment variables, along with their values, set HIP_PRINT_ENV and run any HIP application on the ROCm platform.
|
||||
HIPCC or other tools may support additional environment variables which should follow the above convention.
|
||||
HIPCC or other tools may support additional environment variables which should follow the above convention.
|
||||
|
||||
|
||||
|
||||
#### Presubmit Testing:
|
||||
Before checking in or submitting a pull request, run all directed tests (see tests/README.md) and all Rodinia tests.
|
||||
### Presubmit Testing:
|
||||
Before checking in or submitting a pull request, run all directed tests (see tests/README.md) and all Rodinia tests.
|
||||
Ensure pass results match starting point:
|
||||
|
||||
```shell
|
||||
@@ -141,13 +133,13 @@ Ensure pass results match starting point:
|
||||
```
|
||||
|
||||
|
||||
#### Checkin messages
|
||||
### Checkin messages
|
||||
Follow existing best practice for writing a good Git commit message. Some tips:
|
||||
http://chris.beams.io/posts/git-commit/
|
||||
https://robots.thoughtbot.com/5-useful-tips-for-a-better-commit-message
|
||||
|
||||
In particular :
|
||||
- Use imperative voice, ie "Fix this bug", "Refactor the XYZ routine", "Update the doc".
|
||||
In particular :
|
||||
- Use imperative voice, ie "Fix this bug", "Refactor the XYZ routine", "Update the doc".
|
||||
Not : "Fixing the bug", "Fixed the bug", "Bug fix", etc.
|
||||
- Subject should summarize the commit. Do not end subject with a period. Use a blank line
|
||||
after the subject.
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
#!/bin/bash
|
||||
|
||||
#set -x
|
||||
|
||||
ROCM_PATH=${ROCM_PATH:-/opt/rocm}
|
||||
LLVM_DIR="$1/../../../"
|
||||
tmp=/tmp/hip_pch.$$
|
||||
mkdir -p $tmp
|
||||
|
||||
@@ -47,12 +46,12 @@ __hip_pch_size:
|
||||
.long __hip_pch_size - __hip_pch
|
||||
EOF
|
||||
|
||||
$ROCM_PATH/llvm/bin/clang -O3 -c -std=c++17 -isystem /opt/rocm/llvm/lib/clang/11.0.0/include/.. -isystem /opt/rocm/include -nogpulib --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch.cui
|
||||
$LLVM_DIR/bin/clang -O3 -c -std=c++17 -isystem $LLVM_DIR/lib/clang/11.0.0/include/.. -isystem /opt/rocm/include -nogpulib --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch.cui
|
||||
|
||||
cat $tmp/hip_macros.h >> $tmp/pch.cui
|
||||
|
||||
$ROCM_PATH/llvm/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip.pch -x hip-cpp-output - <$tmp/pch.cui
|
||||
$LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip.pch -x hip-cpp-output - <$tmp/pch.cui
|
||||
|
||||
$ROCM_PATH/llvm/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj
|
||||
$LLVM_DIR/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj
|
||||
|
||||
rm -rf $tmp
|
||||
|
||||
@@ -1,36 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
#set -x
|
||||
|
||||
cat >/tmp/hip_macros.h <<EOF
|
||||
#define __device__ __attribute__((device))
|
||||
#define __host__ __attribute__((host))
|
||||
#define __global__ __attribute__((global))
|
||||
#define __constant__ __attribute__((constant))
|
||||
#define __shared__ __attribute__((shared))
|
||||
|
||||
#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
|
||||
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
|
||||
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
|
||||
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
|
||||
amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
|
||||
#define select_impl_(_1, _2, impl_, ...) impl_
|
||||
#define __launch_bounds__(...) \
|
||||
select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
|
||||
|
||||
// Macro to replace extern __shared__ declarations
|
||||
// to local variable definitions
|
||||
#define HIP_DYNAMIC_SHARED(type, var) \
|
||||
type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();
|
||||
EOF
|
||||
|
||||
cat >/tmp/hip_pch.h <<EOF
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip/hip_fp16.h"
|
||||
EOF
|
||||
|
||||
/opt/rocm/llvm/bin/clang -O3 -c -std=c++17 -isystem /opt/rocm/llvm/lib/clang/11.0.0/include/.. -isystem /opt/rocm/include -nogpulib --cuda-device-only -x hip /tmp/hip_pch.h -E >/tmp/pch.cui
|
||||
|
||||
cat /tmp/hip_macros.h >> /tmp/pch.cui
|
||||
|
||||
/opt/rocm/llvm/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o /tmp/hip.pch -x hip-cpp-output - </tmp/pch.cui
|
||||
@@ -803,7 +803,8 @@ if ($needHipHcc) {
|
||||
if ($linkType eq 0) {
|
||||
substr($HIPLDFLAGS,0,0) = " $HIP_LIB_PATH/libamdhip64.a " ;
|
||||
} else {
|
||||
substr($HIPLDFLAGS,0,0) = " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib $HIP_LIB_PATH/libamdhip64.so ";
|
||||
# Currently in ROCm on CentOS, some libraries are installed in lib64 and the rest in lib, so both directories are added to the rpath.
|
||||
substr($HIPLDFLAGS,0,0) = " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib:$ROCM_PATH/lib64 $HIP_LIB_PATH/libamdhip64.so ";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -247,4 +247,4 @@ The workaround is to explicitly add the keyword of "static" before any functions
|
||||
Product of block.x, block.y, and block.z should be less than 1024.
|
||||
|
||||
### Are __shfl_*_sync functions supported on HIP platform?
|
||||
__shfl_*_sync is not supported on HIP, but on the nvcc path with CUDA 9.0 and above all shuffle calls are redirected to their sync versions.
|
||||
__shfl_*_sync is not supported on HIP, but on the nvcc path with CUDA 9.0 and above all shuffle calls are redirected to their sync versions.
|
||||
@@ -54,7 +54,18 @@ set_and_check( hip_BIN_INSTALL_DIR "@PACKAGE_BIN_INSTALL_DIR@" )
|
||||
set_and_check(hip_HIPCC_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipcc")
|
||||
set_and_check(hip_HIPCONFIG_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipconfig")
|
||||
|
||||
# set a default path for ROCM_PATH
|
||||
if(NOT DEFINED ROCM_PATH)
|
||||
set(ROCM_PATH /opt/rocm)
|
||||
endif()
|
||||
|
||||
# If HIP is not installed under ROCm, this is needed to find HSA, assuming HSA is under ROCm
|
||||
if(DEFINED ENV{ROCM_PATH})
|
||||
set(ROCM_PATH "$ENV{ROCM_PATH}")
|
||||
endif()
|
||||
|
||||
if(HIP_COMPILER STREQUAL "clang")
|
||||
set(HIP_CLANG_ROOT "${ROCM_PATH}/llvm")
|
||||
if(NOT HIP_CXX_COMPILER)
|
||||
set(HIP_CXX_COMPILER ${CMAKE_CXX_COMPILER})
|
||||
endif()
|
||||
@@ -62,16 +73,12 @@ if(HIP_COMPILER STREQUAL "clang")
|
||||
execute_process(COMMAND ${HIP_CXX_COMPILER} --version
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
OUTPUT_VARIABLE HIP_CLANG_CXX_COMPILER_VERSION_OUTPUT)
|
||||
if(HIP_CLANG_CXX_COMPILER_VERSION_OUTPUT MATCHES "InstalledDir:[\t\r\n][\t\r\n]*([^\t\r\n])")
|
||||
set(HIP_CLANG_ROOT ${CMAKE_MATCH_1})
|
||||
else()
|
||||
set(HIP_CLANG_ROOT /opt/rocm/llvm)
|
||||
if(HIP_CLANG_CXX_COMPILER_VERSION_OUTPUT MATCHES "InstalledDir:[ \t]*([^\n]*)")
|
||||
get_filename_component(HIP_CLANG_ROOT "${CMAKE_MATCH_1}" DIRECTORY)
|
||||
endif()
|
||||
elseif (HIP_CXX_COMPILER MATCHES ".*clang\\+\\+")
|
||||
get_filename_component(HIP_CLANG_ROOT "${HIP_CXX_COMPILER}" PATH)
|
||||
get_filename_component(HIP_CLANG_ROOT "${HIP_CLANG_ROOT}" PATH)
|
||||
else()
|
||||
set(HIP_CLANG_ROOT /opt/rocm/llvm)
|
||||
get_filename_component(HIP_CLANG_ROOT "${HIP_CXX_COMPILER}" DIRECTORY)
|
||||
get_filename_component(HIP_CLANG_ROOT "${HIP_CLANG_ROOT}" DIRECTORY)
|
||||
endif()
|
||||
file(GLOB HIP_CLANG_INCLUDE_SEARCH_PATHS ${HIP_CLANG_ROOT}/lib/clang/*/include)
|
||||
find_path(HIP_CLANG_INCLUDE_PATH stddef.h
|
||||
@@ -89,11 +96,6 @@ find_dependency(amd_comgr)
|
||||
|
||||
include( "${CMAKE_CURRENT_LIST_DIR}/hip-targets.cmake" )
|
||||
|
||||
# If HIP is not installed under ROCm, this is needed to find HSA, assuming HSA is under ROCm
|
||||
if( DEFINED ENV{ROCM_PATH} )
|
||||
set(ROCM_PATH "$ENV{ROCM_PATH}")
|
||||
endif()
|
||||
|
||||
# Use find_dependency to locate the dependencies of the packages
|
||||
#This makes the cmake generated file xxxx-targets to supply the linker libraries
|
||||
# without worrying other transitive dependencies
|
||||
|
||||
@@ -365,6 +365,25 @@ long __shfl(long var, int src_lane, int width = warpSize)
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
|
||||
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl(tmp[0], src_lane, width);
|
||||
tmp[1] = __shfl(tmp[1], src_lane, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
|
||||
return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
|
||||
#endif
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
long long __shfl(long long var, int src_lane, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(long long) == 2 * sizeof(int), "");
|
||||
@@ -378,8 +397,22 @@ long long __shfl(long long var, int src_lane, int width = warpSize)
|
||||
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
|
||||
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
|
||||
|
||||
__device__
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl(tmp[0], src_lane, width);
|
||||
tmp[1] = __shfl(tmp[1], src_lane, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
|
||||
int self = __lane_id();
|
||||
@@ -435,6 +468,28 @@ long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
|
||||
return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
|
||||
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
|
||||
return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
|
||||
@@ -449,6 +504,20 @@ long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize
|
||||
return tmp1;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
|
||||
@@ -507,6 +576,26 @@ long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
|
||||
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
|
||||
return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
|
||||
#endif
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(long long) == 2 * sizeof(int), "");
|
||||
@@ -518,6 +607,19 @@ long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSi
|
||||
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
@@ -577,6 +679,26 @@ long __shfl_xor(long var, int lane_mask, int width = warpSize)
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
|
||||
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
|
||||
return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
|
||||
#endif
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(long long) == 2 * sizeof(int), "");
|
||||
@@ -588,7 +710,19 @@ long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
|
||||
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
#define MASK1 0x00ff00ff
|
||||
#define MASK2 0xff00ff00
|
||||
|
||||
|
||||
@@ -487,6 +487,22 @@ struct __HIP_Coordinates {
|
||||
#endif
|
||||
|
||||
};
|
||||
template <typename F>
|
||||
#if !defined(_MSC_VER)
|
||||
__attribute__((weak))
|
||||
#endif
|
||||
constexpr typename __HIP_Coordinates<F>::X __HIP_Coordinates<F>::x;
|
||||
template <typename F>
|
||||
#if !defined(_MSC_VER)
|
||||
__attribute__((weak))
|
||||
#endif
|
||||
constexpr typename __HIP_Coordinates<F>::Y __HIP_Coordinates<F>::y;
|
||||
template <typename F>
|
||||
#if !defined(_MSC_VER)
|
||||
__attribute__((weak))
|
||||
#endif
|
||||
constexpr typename __HIP_Coordinates<F>::Z __HIP_Coordinates<F>::z;
|
||||
|
||||
extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_size(uint);
|
||||
inline
|
||||
__device__
|
||||
|
||||
@@ -345,13 +345,16 @@ typedef struct hipLaunchParams_t {
|
||||
hipStream_t stream; ///< Stream identifier
|
||||
} hipLaunchParams;
|
||||
|
||||
// Pre-Compiled header for online compilation
|
||||
#ifdef ENABLE_HIP_PCH
|
||||
extern const char* __hip_pch;
|
||||
extern unsigned __hip_pch_size;
|
||||
void __hipGetPCH(const char** pch, unsigned int*size);
|
||||
#if __HIP_HAS_GET_PCH
|
||||
/**
|
||||
* Internal use only. This API may change in the future
|
||||
* Pre-Compiled header for online compilation
|
||||
*
|
||||
*/
|
||||
void __hipGetPCH(const char** pch, unsigned int*size);
|
||||
#endif
|
||||
|
||||
|
||||
// Doxygen end group GlobalDefs
|
||||
/** @} */
|
||||
|
||||
|
||||
@@ -28,14 +28,17 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H
|
||||
#define HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H
|
||||
#define HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H
|
||||
|
||||
#include <hip/hip_version.h>
|
||||
#include <hip/hip_common.h>
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
|
||||
#if __cplusplus
|
||||
#if __cplusplus && defined(__clang__) && defined(__HIP__)
|
||||
#include <hip/hcc_detail/hip_cooperative_groups.h>
|
||||
#endif
|
||||
#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
|
||||
#include <cooperative_groups.h>
|
||||
#include <hip/nvcc_detail/hip_cooperative_groups.h>
|
||||
#else
|
||||
#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
|
||||
#endif
|
||||
|
||||
@@ -32,6 +32,7 @@ THE SOFTWARE.
|
||||
|
||||
|
||||
#include <string.h> // for getDeviceProp
|
||||
#include <hip/hip_version.h>
|
||||
#include <hip/hip_common.h>
|
||||
|
||||
enum {
|
||||
|
||||
@@ -0,0 +1,12 @@
|
||||
#ifndef HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
|
||||
#define HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
|
||||
|
||||
// Include CUDA headers
|
||||
#include <cuda_runtime.h>
|
||||
#include <cooperative_groups.h>
|
||||
|
||||
// Include HIP wrapper headers around CUDA
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
|
||||
#endif // HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
|
||||
@@ -104,13 +104,13 @@ typedef int hipLaunchParm;
|
||||
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
|
||||
|
||||
#ifdef __HIP_DEVICE_COMPILE__
|
||||
#define abort() \
|
||||
#define abort_() \
|
||||
{ asm("trap;"); }
|
||||
#undef assert
|
||||
#define assert(COND) \
|
||||
{ \
|
||||
if (!COND) { \
|
||||
abort(); \
|
||||
abort_(); \
|
||||
} \
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -26,6 +26,7 @@ THE SOFTWARE.
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <cuda.h>
|
||||
#include <cuda_profiler_api.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
||||
@@ -20,6 +20,7 @@ target_include_directories(lpl
|
||||
|
||||
target_compile_options(lpl PUBLIC -Wall)
|
||||
target_link_libraries(lpl PUBLIC pthread)
|
||||
add_custom_command(TARGET lpl POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/lpl ${PROJECT_BINARY_DIR}/bin/lpl)
|
||||
|
||||
install(TARGETS lpl RUNTIME DESTINATION bin)
|
||||
#-------------------------------------LPL--------------------------------------#
|
||||
@@ -43,6 +44,7 @@ find_package(hsa-runtime64 REQUIRED CONFIG
|
||||
|
||||
target_link_libraries(ca PUBLIC hsa-runtime64::hsa-runtime64 )
|
||||
target_compile_options(ca PUBLIC -DDISABLE_REDUCED_GPU_BLOB_COPY -Wall)
|
||||
add_custom_command(TARGET ca POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/ca ${PROJECT_BINARY_DIR}/bin/ca)
|
||||
|
||||
install(TARGETS ca RUNTIME DESTINATION bin)
|
||||
#-------------------------------------CA---------------------------------------#
|
||||
|
||||
@@ -21,22 +21,23 @@ set(CPACK_PACKAGE_NAME "hip-base")
|
||||
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [BASE]")
|
||||
set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
|
||||
set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
|
||||
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
|
||||
set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
|
||||
set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
|
||||
set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
|
||||
set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
|
||||
set(CPACK_PACKAGE_VERSION_PATCH @HIP_PACKAGING_VERSION_PATCH@)
|
||||
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
|
||||
set(CPACK_GENERATOR "TGZ;DEB;RPM")
|
||||
|
||||
set(CPACK_BINARY_DEB "ON")
|
||||
set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
|
||||
set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
|
||||
set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
|
||||
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "perl (>= 5.0),libfile-which-perl")
|
||||
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-base")
|
||||
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_base")
|
||||
|
||||
set(CPACK_BINARY_RPM "ON")
|
||||
set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
|
||||
set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
|
||||
set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
|
||||
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
|
||||
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
|
||||
set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
|
||||
|
||||
@@ -24,25 +24,26 @@ set(CPACK_PACKAGE_NAME "hip-doc")
|
||||
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [DOCUMENTATION]")
|
||||
set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
|
||||
set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
|
||||
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
|
||||
set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
|
||||
set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
|
||||
set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
|
||||
set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
|
||||
set(CPACK_PACKAGE_VERSION_PATCH @HIP_PACKAGING_VERSION_PATCH@)
|
||||
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
|
||||
set(CPACK_GENERATOR "TGZ;DEB;RPM")
|
||||
|
||||
set(CPACK_BINARY_DEB "ON")
|
||||
set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})")
|
||||
set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
|
||||
set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE})")
|
||||
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-doc")
|
||||
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_doc")
|
||||
|
||||
set(CPACK_BINARY_RPM "ON")
|
||||
set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
|
||||
set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
|
||||
set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
|
||||
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
|
||||
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
|
||||
string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}")
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}")
|
||||
set(CPACK_RPM_PACKAGE_OBSOLETES "hip_doc")
|
||||
set(CPACK_RPM_PACKAGE_CONFLICTS "hip_doc")
|
||||
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
|
||||
|
||||
@@ -28,24 +28,29 @@ endif()
|
||||
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [HCC]")
|
||||
set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
|
||||
set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
|
||||
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
|
||||
set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
|
||||
set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
|
||||
set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
|
||||
set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
|
||||
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
|
||||
set(CPACK_GENERATOR "TGZ;DEB;RPM")
|
||||
|
||||
set(CPACK_BINARY_DEB "ON")
|
||||
set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
|
||||
set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
|
||||
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)")
|
||||
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc")
|
||||
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_hcc")
|
||||
|
||||
set(CPACK_BINARY_RPM "ON")
|
||||
set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
|
||||
set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
|
||||
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
|
||||
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
|
||||
set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
|
||||
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
|
||||
string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, comgr >= 1.1")
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, comgr >= 1.1")
|
||||
set(CPACK_RPM_PACKAGE_OBSOLETES "hip_hcc")
|
||||
set(CPACK_RPM_PACKAGE_CONFLICTS "hip_hcc")
|
||||
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
|
||||
|
||||
@@ -10,28 +10,29 @@ set(CPACK_PACKAGE_NAME "hip-nvcc")
|
||||
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [NVCC]")
|
||||
set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
|
||||
set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
|
||||
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
|
||||
set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
|
||||
set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
|
||||
set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
|
||||
set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
|
||||
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
|
||||
set(CPACK_GENERATOR "TGZ;DEB;RPM")
|
||||
|
||||
set(CPACK_BINARY_DEB "ON")
|
||||
set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
|
||||
set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
|
||||
set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
|
||||
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), cuda (>= 7.5)")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), cuda (>= 7.5)")
|
||||
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-nvcc")
|
||||
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_nvcc")
|
||||
|
||||
set(CPACK_BINARY_RPM "ON")
|
||||
set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
|
||||
set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
|
||||
set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
|
||||
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
|
||||
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
|
||||
set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
|
||||
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
|
||||
string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, cuda >= 7.5")
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, cuda >= 7.5")
|
||||
set(CPACK_RPM_PACKAGE_OBSOLETES "hip_nvcc")
|
||||
set(CPACK_RPM_PACKAGE_CONFLICTS "hip_nvcc")
|
||||
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
|
||||
|
||||
@@ -33,27 +33,28 @@ set(HCC_PACKAGE_NAME "rocclr")
|
||||
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [ROCClr]")
|
||||
set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
|
||||
set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
|
||||
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
|
||||
set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
|
||||
set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
|
||||
set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
|
||||
set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
|
||||
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
|
||||
set(CPACK_GENERATOR "TGZ;DEB;RPM")
|
||||
|
||||
set(CPACK_BINARY_DEB "ON")
|
||||
set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
|
||||
set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
|
||||
set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
|
||||
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocminfo, hip-base (= ${CPACK_PACKAGE_VERSION}), comgr (>= 1.1), llvm-amdgpu")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocminfo, hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), comgr (>= 1.1), llvm-amdgpu")
|
||||
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc (= ${CPACK_PACKAGE_VERSION})")
|
||||
|
||||
set(CPACK_BINARY_RPM "ON")
|
||||
set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
|
||||
set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
|
||||
set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
|
||||
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
|
||||
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
|
||||
set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
|
||||
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
|
||||
string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocminfo, hip-base = ${HIP_BASE_VERSION}, comgr >= 1.1, llvm-amdgpu")
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocminfo, hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, comgr >= 1.1, llvm-amdgpu")
|
||||
set(CPACK_RPM_PACKAGE_PROVIDES "hip-hcc = ${HIP_BASE_VERSION}")
|
||||
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
|
||||
set(CPACK_SOURCE_GENERATOR "TGZ")
|
||||
|
||||
@@ -12,25 +12,26 @@ set(CPACK_PACKAGE_NAME "hip-samples")
|
||||
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [SAMPLES]")
|
||||
set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
|
||||
set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
|
||||
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
|
||||
set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
|
||||
set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
|
||||
set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
|
||||
set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
|
||||
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
|
||||
set(CPACK_GENERATOR "TGZ;DEB;RPM")
|
||||
|
||||
set(CPACK_BINARY_DEB "ON")
|
||||
set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})")
|
||||
set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
|
||||
set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE})")
|
||||
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-samples")
|
||||
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_samples")
|
||||
|
||||
set(CPACK_BINARY_RPM "ON")
|
||||
set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
|
||||
set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
|
||||
set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
|
||||
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
|
||||
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
|
||||
string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "hip-rocclr = ${HIP_BASE_VERSION}")
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "hip-rocclr = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}")
|
||||
set(CPACK_RPM_PACKAGE_OBSOLETES "hip_samples")
|
||||
set(CPACK_RPM_PACKAGE_CONFLICTS "hip_samples")
|
||||
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
|
||||
|
||||
@@ -96,6 +96,14 @@ find_package(amd_comgr REQUIRED CONFIG
|
||||
|
||||
message(STATUS "Code Object Manager found at ${amd_comgr_DIR}.")
|
||||
|
||||
find_package(LLVM REQUIRED CONFIG
|
||||
PATHS
|
||||
/opt/rocm/llvm
|
||||
PATH_SUFFIXES
|
||||
lib/cmake/llvm)
|
||||
|
||||
message(STATUS "llvm found at ${LLVM_DIR}.")
|
||||
|
||||
add_library(hip64 OBJECT
|
||||
hip_context.cpp
|
||||
hip_code_object.cpp
|
||||
@@ -148,10 +156,9 @@ endif()
|
||||
|
||||
# Short-Term solution for pre-compiled headers for online compilation
|
||||
# Enable pre compiled header
|
||||
if(${ENABLE_HIP_PCH})
|
||||
execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_gen_pch.sh")
|
||||
execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_embed_pch.sh")
|
||||
add_definitions(-DENABLE_HIP_PCH)
|
||||
if(${__HIP_ENABLE_PCH})
|
||||
execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_embed_pch.sh ${LLVM_DIR}")
|
||||
add_definitions(-D__HIP_ENABLE_PCH)
|
||||
endif()
|
||||
|
||||
# Enable profiling API
|
||||
@@ -216,7 +223,7 @@ add_library(device INTERFACE)
|
||||
target_link_libraries(device INTERFACE host)
|
||||
|
||||
# Short-Term solution for pre-compiled headers for online compilation
|
||||
if(${ENABLE_HIP_PCH})
|
||||
if(${__HIP_ENABLE_PCH})
|
||||
target_link_libraries(amdhip64 PRIVATE ${CMAKE_BINARY_DIR}/hip_pch.o)
|
||||
endif()
|
||||
|
||||
@@ -227,6 +234,18 @@ endif()
|
||||
# filename.
|
||||
if(${BUILD_SHARED_LIBS})
|
||||
target_link_libraries(amdhip64 PRIVATE amdrocclr_static Threads::Threads dl hsa-runtime64::hsa-runtime64)
|
||||
|
||||
add_custom_command(TARGET amdhip64 POST_BUILD COMMAND
|
||||
${CMAKE_COMMAND} -E create_symlink ${PROJECT_BINARY_DIR}/lib/libamdhip64.so.${HIP_LIB_VERSION_STRING}
|
||||
${PROJECT_BINARY_DIR}/lib/libhip_hcc.so.${HIP_LIB_VERSION_MAJOR})
|
||||
add_custom_command(TARGET amdhip64 POST_BUILD COMMAND
|
||||
${CMAKE_COMMAND} -E create_symlink ${PROJECT_BINARY_DIR}/lib/libhip_hcc.so.${HIP_LIB_VERSION_MAJOR}
|
||||
${PROJECT_BINARY_DIR}/lib/libhip_hcc.so)
|
||||
add_custom_command(TARGET amdhip64 POST_BUILD COMMAND
|
||||
${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/.hipInfo ${PROJECT_BINARY_DIR}/lib/.hipInfo)
|
||||
add_custom_command(TARGET amdhip64 POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_directory
|
||||
${PROJECT_SOURCE_DIR}/include ${PROJECT_BINARY_DIR}/include)
|
||||
|
||||
INSTALL(PROGRAMS $<TARGET_FILE:amdhip64> DESTINATION lib COMPONENT MAIN)
|
||||
else()
|
||||
target_link_libraries(amdhip64 PRIVATE Threads::Threads dl hsa-runtime64::hsa-runtime64 amd_comgr)
|
||||
@@ -244,6 +263,7 @@ else()
|
||||
INSTALL(PROGRAMS $<TARGET_FILE:amdhip64> DESTINATION lib COMPONENT MAIN)
|
||||
endif()
|
||||
|
||||
|
||||
INSTALL(TARGETS amdhip64 host device EXPORT hip-targets DESTINATION ${LIB_INSTALL_DIR})
|
||||
INSTALL(EXPORT hip-targets DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} NAMESPACE hip::)
|
||||
|
||||
|
||||
@@ -202,19 +202,10 @@ hipError_t DynCO::populateDynGlobalVars() {
|
||||
return hipErrorSharedObjectSymbolNotFound;
|
||||
}
|
||||
|
||||
if (!dev_program->getUndefinedVarFromCodeObj(&undef_var_names)) {
|
||||
DevLogPrintfError("Could not get undefined Variables for Module: 0x%x \n", module());
|
||||
return hipErrorSharedObjectSymbolNotFound;
|
||||
}
|
||||
|
||||
for (auto& elem : var_names) {
|
||||
vars_.insert(std::make_pair(elem, new Var(elem, Var::DeviceVarKind::DVK_Variable, 0, 0, 0, nullptr)));
|
||||
}
|
||||
|
||||
for (auto& elem : undef_var_names) {
|
||||
vars_.insert(std::make_pair(elem, new Var(elem, Var::DeviceVarKind::DVK_Texture, 0, 0, 0, nullptr)));
|
||||
}
|
||||
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
@@ -377,20 +368,4 @@ hipError_t StatCO::getStatGlobalVar(const void* hostVar, int deviceId, hipDevice
|
||||
*size_ptr = dvar->size();
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
hipError_t StatCO::getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod,
|
||||
hipDeviceptr_t* dev_ptr, size_t* size_ptr) {
|
||||
amd::ScopedLock lock(sclock_);
|
||||
|
||||
for (auto& elem : vars_) {
|
||||
if ((elem.second->name() == hostVar)
|
||||
&& (elem.second->module(deviceId) == hmod)) {
|
||||
*dev_ptr = elem.second->device_ptr(deviceId);
|
||||
*size_ptr = elem.second->device_size(deviceId);
|
||||
return hipSuccess;
|
||||
}
|
||||
}
|
||||
|
||||
return hipErrorNotFound;
|
||||
}
|
||||
}; //namespace: hip
|
||||
|
||||
@@ -118,8 +118,6 @@ public:
|
||||
hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId);
|
||||
hipError_t getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr,
|
||||
size_t* size_ptr);
|
||||
hipError_t getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod,
|
||||
hipDeviceptr_t* dev_ptr, size_t* size_ptr);
|
||||
|
||||
private:
|
||||
friend class ::PlatformState;
|
||||
|
||||
@@ -155,7 +155,7 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device )
|
||||
::strncpy(deviceProps.name, info.boardName_, 128);
|
||||
deviceProps.totalGlobalMem = info.globalMemSize_;
|
||||
deviceProps.sharedMemPerBlock = info.localMemSizePerCU_;
|
||||
deviceProps.regsPerBlock = info.availableSGPRs_;
|
||||
deviceProps.regsPerBlock = info.availableRegistersPerCU_;
|
||||
deviceProps.warpSize = info.wavefrontWidth_;
|
||||
deviceProps.maxThreadsPerBlock = info.maxWorkGroupSize_;
|
||||
deviceProps.maxThreadsDim[0] = info.maxWorkItemSizes_[0];
|
||||
|
||||
@@ -12,7 +12,7 @@ FatBinaryDeviceInfo::~FatBinaryDeviceInfo() {
|
||||
}
|
||||
|
||||
FatBinaryInfo::FatBinaryInfo(const char* fname, const void* image)
|
||||
: fdesc_(-1), fsize_(0), image_(image), uri_(std::string()) {
|
||||
: fdesc_(amd::Os::FDescInit()), fsize_(0), image_(image), uri_(std::string()) {
|
||||
guarantee(fname || image);
|
||||
|
||||
if (fname != nullptr) {
|
||||
@@ -41,7 +41,7 @@ FatBinaryInfo::~FatBinaryInfo() {
|
||||
}
|
||||
|
||||
fname_ = std::string();
|
||||
fdesc_ = -1;
|
||||
fdesc_ = amd::Os::FDescInit();
|
||||
fsize_ = 0;
|
||||
image_ = nullptr;
|
||||
uri_ = std::string();
|
||||
@@ -64,6 +64,9 @@ hipError_t FatBinaryInfo::ExtractFatBinary(const std::vector<hip::Device*>& devi
|
||||
if (!amd::Os::GetFileHandle(fname_.c_str(), &fdesc_, &fsize_)) {
|
||||
return hipErrorFileNotFound;
|
||||
}
|
||||
if (fsize_ == 0) {
|
||||
return hipErrorInvalidKernelFile;
|
||||
}
|
||||
|
||||
// Extract the code object from file
|
||||
hip_error = CodeObject::ExtractCodeObjectFromFile(fdesc_, fsize_,
|
||||
|
||||
@@ -5,7 +5,9 @@
|
||||
#include "hip_code_object.hpp"
|
||||
#include "platform/program.hpp"
|
||||
|
||||
#ifdef ENABLE_HIP_PCH
|
||||
#ifdef __HIP_ENABLE_PCH
|
||||
extern const char __hip_pch[];
|
||||
extern unsigned __hip_pch_size;
|
||||
void __hipGetPCH(const char** pch, unsigned int *size) {
|
||||
*pch = __hip_pch;
|
||||
*size = __hip_pch_size;
|
||||
|
||||
@@ -95,11 +95,6 @@ public:
|
||||
hipError_t getStatDeviceVar(DeviceVar** dvar, int deviceId);
|
||||
void resize_dVar(size_t size) { dVar_.resize(size); }
|
||||
|
||||
//Accessor for device_ptrs.
|
||||
std::string name() const { return name_; }
|
||||
hipModule_t module(int deviceId) const { return nullptr; }
|
||||
hipDeviceptr_t device_ptr(int deviceId) const { return dVar_[deviceId]->device_ptr(); }
|
||||
size_t device_size(int deviceId) const { return dVar_[deviceId]->size(); }
|
||||
FatBinaryInfo** moduleInfo() { return modules_; };
|
||||
|
||||
private:
|
||||
|
||||
@@ -252,8 +252,6 @@ extern int ihipGetDevice();
|
||||
extern hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags);
|
||||
extern amd::Memory* getMemoryObject(const void* ptr, size_t& offset);
|
||||
extern amd::Memory* getMemoryObjectWithOffset(const void* ptr, const size_t size);
|
||||
extern bool CL_CALLBACK getSvarInfo(cl_program program, std::string var_name, void** var_addr,
|
||||
size_t* var_size);
|
||||
|
||||
constexpr bool kOptionChangeable = true;
|
||||
constexpr bool kNewDevProg = false;
|
||||
|
||||
@@ -124,7 +124,7 @@ hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
|
||||
if (*ptr == nullptr) {
|
||||
size_t free = 0, total =0;
|
||||
hipMemGetInfo(&free, &total);
|
||||
LogPrintfError("Allocation failed : Device memory : required :%u | free :%u | total :%u \n", sizeBytes, free, total);
|
||||
LogPrintfError("Allocation failed : Device memory : required :%zu | free :%zu | total :%zu \n", sizeBytes, free, total);
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
|
||||
@@ -202,14 +202,14 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
|
||||
}
|
||||
} else {
|
||||
amd::HostQueue* pQueue = &queue;
|
||||
if (queueDevice != srcMemory->getContext().devices()[0]) {
|
||||
if ((srcMemory->getContext().devices()[0] == dstMemory->getContext().devices()[0]) &&
|
||||
(queueDevice != srcMemory->getContext().devices()[0])) {
|
||||
pQueue = hip::getNullStream(srcMemory->getContext());
|
||||
amd::Command* cmd = queue.getLastQueuedCommand(true);
|
||||
if (cmd != nullptr) {
|
||||
waitList.push_back(cmd);
|
||||
}
|
||||
}
|
||||
|
||||
command = new amd::CopyMemoryCommand(*pQueue, CL_COMMAND_COPY_BUFFER, waitList,
|
||||
*srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes);
|
||||
}
|
||||
@@ -1850,18 +1850,27 @@ hipError_t ihipMemset3D(hipPitchedPtr pitchedDevPtr,
|
||||
hipExtent extent,
|
||||
hipStream_t stream,
|
||||
bool isAsync = false) {
|
||||
if (pitchedDevPtr.pitch == extent.width) {
|
||||
return ihipMemset(pitchedDevPtr.ptr, value, sizeof(int8_t), extent.width * extent.height * extent.depth, stream, isAsync);
|
||||
}
|
||||
|
||||
// Workaround for cases when pitch > row untill fill kernel will be updated to support pitch.
|
||||
// Fallback to filling one row at a time.
|
||||
|
||||
amd::HostQueue* queue = hip::getQueue(stream);
|
||||
|
||||
size_t offset = 0;
|
||||
amd::Memory* memory = getMemoryObject(pitchedDevPtr.ptr, offset);
|
||||
|
||||
auto sizeBytes = extent.width * extent.height * extent.depth;
|
||||
|
||||
if (memory == nullptr) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
if (sizeBytes > memory->getSize()) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
if (pitchedDevPtr.pitch == extent.width) {
|
||||
return ihipMemset(pitchedDevPtr.ptr, value, sizeof(int8_t), static_cast<size_t>(sizeBytes), stream, isAsync);
|
||||
}
|
||||
|
||||
// Workaround for cases when pitch > row until fill kernel will be updated to support pitch.
|
||||
// Fall back to filling one row at a time.
|
||||
|
||||
amd::HostQueue* queue = hip::getQueue(stream);
|
||||
|
||||
amd::Coord3D origin(offset);
|
||||
amd::Coord3D region(pitchedDevPtr.xsize, pitchedDevPtr.ysize, extent.depth);
|
||||
amd::BufferRect rect;
|
||||
@@ -1870,34 +1879,26 @@ hipError_t ihipMemset3D(hipPitchedPtr pitchedDevPtr,
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
if (memory != nullptr) {
|
||||
std::vector<amd::FillMemoryCommand*> commands;
|
||||
std::vector<amd::FillMemoryCommand*> commands;
|
||||
|
||||
for (size_t slice = 0; slice < extent.depth; slice++) {
|
||||
for (size_t row = 0; row < extent.height; row++) {
|
||||
const size_t rowOffset = rect.offset(0, row, slice);
|
||||
amd::FillMemoryCommand* command = new amd::FillMemoryCommand(*queue,
|
||||
CL_COMMAND_FILL_BUFFER,
|
||||
amd::Command::EventWaitList{},
|
||||
*memory->asBuffer(),
|
||||
&value,
|
||||
sizeof(int8_t),
|
||||
amd::Coord3D{rowOffset, 0, 0},
|
||||
amd::Coord3D{extent.width, 1, 1});
|
||||
for (size_t slice = 0; slice < extent.depth; slice++) {
|
||||
for (size_t row = 0; row < extent.height; row++) {
|
||||
const size_t rowOffset = rect.offset(0, row, slice);
|
||||
amd::FillMemoryCommand *command = new amd::FillMemoryCommand(*queue,
|
||||
CL_COMMAND_FILL_BUFFER, amd::Command::EventWaitList { },
|
||||
*memory->asBuffer(), &value, sizeof(int8_t), amd::Coord3D { rowOffset,
|
||||
0, 0 }, amd::Coord3D { extent.width, 1, 1 });
|
||||
|
||||
command->enqueue();
|
||||
commands.push_back(command);
|
||||
}
|
||||
command->enqueue();
|
||||
commands.push_back(command);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto &command: commands) {
|
||||
if (!isAsync) {
|
||||
command->awaitCompletion();
|
||||
}
|
||||
command->release();
|
||||
for (auto &command : commands) {
|
||||
if (!isAsync) {
|
||||
command->awaitCompletion();
|
||||
}
|
||||
} else {
|
||||
return hipErrorInvalidValue;
|
||||
command->release();
|
||||
}
|
||||
|
||||
return hipSuccess;
|
||||
@@ -2038,7 +2039,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void
|
||||
memset(attributes, 0, sizeof(hipPointerAttribute_t));
|
||||
|
||||
if (memObj != nullptr) {
|
||||
attributes->memoryType = (CL_MEM_SVM_FINE_GRAIN_BUFFER & memObj->getMemFlags())? hipMemoryTypeHost : hipMemoryTypeDevice;
|
||||
attributes->memoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & memObj->getMemFlags())? hipMemoryTypeHost : hipMemoryTypeDevice;
|
||||
if (attributes->memoryType == hipMemoryTypeHost) {
|
||||
attributes->hostPointer = static_cast<char*>(memObj->getSvmPtr()) + offset;
|
||||
}
|
||||
|
||||
@@ -537,7 +537,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
if (result != hipSuccess) {
|
||||
break;
|
||||
}
|
||||
prevGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z;
|
||||
prevGridSize += globalWorkSizeX * globalWorkSizeY * globalWorkSizeZ;
|
||||
}
|
||||
|
||||
// Sync the execution streams on all devices
|
||||
|
||||
@@ -97,6 +97,10 @@ hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2,
|
||||
uint32_t* linktype, uint32_t* hopcount) {
|
||||
HIP_INIT_API(hipExtGetLinkTypeAndHopCount, device1, device2, linktype, hopcount);
|
||||
|
||||
if (linktype == nullptr || hopcount == nullptr ||
|
||||
device1 == device2 || device1 < 0 || device2 < 0) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
// Fill out the list of LinkAttributes
|
||||
std::vector<amd::Device::LinkAttrType> link_attrs;
|
||||
link_attrs.push_back(std::make_pair(amd::Device::LinkAttribute::kLinkLinkType, 0));
|
||||
|
||||
@@ -80,27 +80,6 @@ extern "C" hip::FatBinaryInfo** __hipRegisterFatBinary(const void* data)
|
||||
return PlatformState::instance().addFatBinary(fbwrapper->binary);
|
||||
}
|
||||
|
||||
bool PlatformState::getShadowVarInfo(std::string var_name, hipModule_t hmod,
|
||||
void** var_addr, size_t* var_size) {
|
||||
|
||||
amd::ScopedLock lock(lock_);
|
||||
if (hipSuccess == getDynGlobalVar(var_name.c_str(), ihipGetDevice(), hmod, var_addr, var_size)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (hipSuccess == getStatGlobalVarByName(var_name, ihipGetDevice(), hmod, var_addr, var_size)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool CL_CALLBACK getSvarInfo(cl_program program, std::string var_name, void** var_addr,
|
||||
size_t* var_size) {
|
||||
return PlatformState::instance().getShadowVarInfo(var_name, reinterpret_cast<hipModule_t>(program),
|
||||
var_addr, var_size);
|
||||
}
|
||||
|
||||
extern "C" void __hipRegisterFunction(
|
||||
hip::FatBinaryInfo** modules,
|
||||
const void* hostFunction,
|
||||
@@ -686,11 +665,19 @@ static inline std::uint32_t __convert_float_to_half(float a) noexcept {
|
||||
return s | v;
|
||||
}
|
||||
|
||||
extern "C" __attribute__((weak)) float __gnu_h2f_ieee(unsigned short h){
|
||||
extern "C"
|
||||
#if !defined(_MSC_VER)
|
||||
__attribute__((weak))
|
||||
#endif
|
||||
float __gnu_h2f_ieee(unsigned short h){
|
||||
return __convert_half_to_float((std::uint32_t) h);
|
||||
}
|
||||
|
||||
extern "C" __attribute__((weak)) unsigned short __gnu_f2h_ieee(float f){
|
||||
extern "C"
|
||||
#if !defined(_MSC_VER)
|
||||
__attribute__((weak))
|
||||
#endif
|
||||
unsigned short __gnu_f2h_ieee(float f){
|
||||
return (unsigned short)__convert_float_to_half(f);
|
||||
}
|
||||
|
||||
@@ -765,6 +752,9 @@ hipError_t PlatformState::getDynFunc(hipFunction_t* hfunc, hipModule_t hmod,
|
||||
DevLogPrintfError("Cannot find the module: 0x%x", hmod);
|
||||
return hipErrorNotFound;
|
||||
}
|
||||
if (0 == strlen(func_name)) {
|
||||
return hipErrorNotFound;
|
||||
}
|
||||
|
||||
return it->second->getDynFunc(hfunc, func_name);
|
||||
}
|
||||
@@ -868,11 +858,6 @@ hipError_t PlatformState::getStatGlobalVar(const void* hostVar, int deviceId, hi
|
||||
return statCO_.getStatGlobalVar(hostVar, deviceId, dev_ptr, size_ptr);
|
||||
}
|
||||
|
||||
hipError_t PlatformState::getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod,
|
||||
hipDeviceptr_t* dev_ptr, size_t* size_ptr) {
|
||||
return statCO_.getStatGlobalVarByName(hostVar, deviceId, hmod, dev_ptr, size_ptr);
|
||||
}
|
||||
|
||||
void PlatformState::setupArgument(const void *arg, size_t size, size_t offset) {
|
||||
auto& arguments = execStack_.top().arguments_;
|
||||
|
||||
|
||||
@@ -77,11 +77,6 @@ public:
|
||||
hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId);
|
||||
hipError_t getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr,
|
||||
size_t* size_ptr);
|
||||
hipError_t getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod,
|
||||
hipDeviceptr_t* dev_ptr, size_t* size_ptr);
|
||||
|
||||
bool getShadowVarInfo(std::string var_name, hipModule_t hmod,
|
||||
void** var_addr, size_t* var_size);
|
||||
|
||||
//Exec Functions
|
||||
void setupArgument(const void *arg, size_t size, size_t offset);
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
# Build script for the bit_extract HIP sample.
#
# cmake_minimum_required() is called before project() so that policy defaults
# are established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)

project(bit_extract)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip. REQUIRED stops configuration here when the package is missing,
# instead of continuing with an empty HIP_HIPCC_EXECUTABLE and an unknown
# hip::host target.
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
# NOTE(review): CMAKE_CXX_LINKER does not appear to be a documented CMake
# variable; kept unchanged for compatibility - confirm whether it is needed.
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Create the executable
add_executable(bit_extract bit_extract.cpp)

# Link with HIP (PRIVATE: nothing consumes this executable's link interface)
target_link_libraries(bit_extract PRIVATE hip::host)
|
||||
@@ -9,19 +9,15 @@ HIPCC=$(HIP_PATH)/bin/hipcc
|
||||
|
||||
# Show how to use PLATFORM to specify different options for each compiler:
|
||||
ifeq (${HIP_PLATFORM}, nvcc)
|
||||
HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20
|
||||
HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20
|
||||
endif
|
||||
|
||||
EXE=bit_extract
|
||||
EXE_STATIC=bit_extract_static
|
||||
|
||||
$(EXE): bit_extract.cpp
|
||||
$(HIPCC) $(HIPCC_FLAGS) $< -o $@
|
||||
|
||||
$(EXE_STATIC): bit_extract.cpp
|
||||
$(HIPCC) -use-staticlib $(HIPCC_FLAGS) $< -o $@
|
||||
|
||||
all: $(EXE) $(EXE_STATIC)
|
||||
all: $(EXE)
|
||||
|
||||
clean:
|
||||
rm -f *.o $(EXE) $(EXE_STATIC)
|
||||
rm -f *.o $(EXE)
|
||||
|
||||
@@ -0,0 +1,36 @@
|
||||
# Build script for the module_api HIP sample: three host executables plus a
# device code object (vcpy_kernel.code) that is generated with hipcc --genco.
#
# cmake_minimum_required() is called before project() so that policy defaults
# are established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)

project(module_api)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip. REQUIRED stops configuration here when the package is missing,
# instead of continuing with an empty HIP_HIPCC_EXECUTABLE.
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
# NOTE(review): CMAKE_CXX_LINKER does not appear to be a documented CMake
# variable; kept unchanged for compatibility - confirm whether it is needed.
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Create the executables
add_executable(runKernel.hip.out runKernel.cpp)
add_executable(launchKernelHcc.hip.out launchKernelHcc.cpp)
add_executable(defaultDriver.hip.out defaultDriver.cpp)

# Generate code object.
# The kernel source is addressed through CMAKE_CURRENT_SOURCE_DIR so the rule
# works for any build directory, not only one that is a direct child of the
# source directory. Declaring OUTPUT/DEPENDS lets the build tool skip the step
# when the kernel source has not changed.
add_custom_command(
  OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/vcpy_kernel.code"
  COMMAND ${HIP_HIPCC_EXECUTABLE} --genco
          "${CMAKE_CURRENT_SOURCE_DIR}/vcpy_kernel.cpp"
          -o "${CMAKE_CURRENT_BINARY_DIR}/vcpy_kernel.code"
  DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/vcpy_kernel.cpp"
  COMMENT "codeobj generated"
  VERBATIM
)

# Named target that drives the rule above; part of the default build (ALL).
add_custom_target(
  codeobj
  ALL
  DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/vcpy_kernel.code"
)

# Make sure the code object exists before the executables are built.
add_dependencies(runKernel.hip.out codeobj)
add_dependencies(launchKernelHcc.hip.out codeobj)
add_dependencies(defaultDriver.hip.out codeobj)

# Link with HIP (PRIVATE: nothing consumes these executables' link interface)
target_link_libraries(runKernel.hip.out PRIVATE hip::host)
target_link_libraries(launchKernelHcc.hip.out PRIVATE hip::host)
target_link_libraries(defaultDriver.hip.out PRIVATE hip::host)
|
||||
@@ -0,0 +1,30 @@
|
||||
# Build script for the module_api_global HIP sample: one host executable plus
# a device code object (vcpy_kernel.code) generated with hipcc --genco.
#
# cmake_minimum_required() is called before project() so that policy defaults
# are established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)

# NOTE(review): "modile" looks like a typo for "module"; the project name is
# kept unchanged so anything keyed on PROJECT_NAME keeps working - confirm.
project(modile_api_global)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip. REQUIRED stops configuration here when the package is missing,
# instead of continuing with an empty HIP_HIPCC_EXECUTABLE.
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
# NOTE(review): CMAKE_CXX_LINKER does not appear to be a documented CMake
# variable; kept unchanged for compatibility - confirm whether it is needed.
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Create the executable
add_executable(runKernel.hip.out runKernel.cpp)

# Generate code object.
# The kernel source is addressed through CMAKE_CURRENT_SOURCE_DIR so the rule
# works for any build directory, not only one that is a direct child of the
# source directory. Declaring OUTPUT/DEPENDS lets the build tool skip the step
# when the kernel source has not changed.
add_custom_command(
  OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/vcpy_kernel.code"
  COMMAND ${HIP_HIPCC_EXECUTABLE} --genco
          "${CMAKE_CURRENT_SOURCE_DIR}/vcpy_kernel.cpp"
          -o "${CMAKE_CURRENT_BINARY_DIR}/vcpy_kernel.code"
  DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/vcpy_kernel.cpp"
  COMMENT "codeobj generated"
  VERBATIM
)

# Named target that drives the rule above; part of the default build (ALL).
add_custom_target(
  codeobj
  ALL
  DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/vcpy_kernel.code"
)

# Make sure the code object exists before the executable is built.
add_dependencies(runKernel.hip.out codeobj)

# Link with HIP (PRIVATE: nothing consumes this executable's link interface)
target_link_libraries(runKernel.hip.out PRIVATE hip::host)
|
||||
@@ -0,0 +1,21 @@
|
||||
# Build script for the square HIP sample.
# Follow "README.md" to generate square.cpp if it's missing.
#
# cmake_minimum_required() is called before project() so that policy defaults
# are established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)

project(square)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip. REQUIRED stops configuration here when the package is missing,
# instead of continuing with an empty HIP_HIPCC_EXECUTABLE.
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
# NOTE(review): CMAKE_CXX_LINKER does not appear to be a documented CMake
# variable; kept unchanged for compatibility - confirm whether it is needed.
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Create the executable
add_executable(square square.cpp)

# Link with HIP (PRIVATE: nothing consumes this executable's link interface)
target_link_libraries(square PRIVATE hip::host)
|
||||
@@ -11,7 +11,7 @@ else
|
||||
SOURCES=square.cpp
|
||||
endif
|
||||
|
||||
all: square.out square.out.static
|
||||
all: square.out
|
||||
|
||||
# Step
|
||||
square.cpp: square.cu
|
||||
@@ -20,8 +20,5 @@ square.cpp: square.cu
|
||||
square.out: $(SOURCES)
|
||||
$(HIPCC) $(CXXFLAGS) $(SOURCES) -o $@
|
||||
|
||||
square.out.static: $(SOURCES)
|
||||
$(HIPCC) -use-staticlib $(CXXFLAGS) $(SOURCES) -o $@
|
||||
|
||||
clean:
|
||||
rm -f *.o *.out *.out.static square.cpp
|
||||
rm -f *.o *.out square.cpp
|
||||
|
||||
@@ -1,13 +1,39 @@
|
||||
# Square.md
|
||||
|
||||
Simple test which shows how to use hipify-perl to port CUDA code to HIP.
|
||||
See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example.
|
||||
Simple test which shows how to use hipify-perl to port CUDA code to HIP.
|
||||
See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example.
|
||||
Now it is even simpler and requires no manual modification to the hipified source code - just hipify and compile:
|
||||
|
||||
1. Add hip/bin path to the PATH :
|
||||
<code>export PATH=$PATH:[MYHIP]/bin</code>
|
||||
- Add hip/bin path to the PATH
|
||||
|
||||
2. <code>$ make </code>
|
||||
Make runs these steps. This can be performed on either CUDA or AMD platform:
|
||||
<code>hipify-perl square.cu > square.cpp </code> # convert cuda code to hip code
|
||||
<code>hipcc square.cpp</code> # compile into executable
|
||||
```
|
||||
$ export PATH=$PATH:[MYHIP]/bin
|
||||
```
|
||||
|
||||
- Define environment variable
|
||||
|
||||
```
|
||||
$ export HIP_PATH=[MYHIP]
|
||||
```
|
||||
|
||||
- Build executable file
|
||||
|
||||
```
|
||||
$ cd ~/hip/samples/0_Intro/square
|
||||
$ make
|
||||
/home/user/hip/bin/hipify-perl square.cu > square.cpp
|
||||
/home/user/hip/bin/hipcc square.cpp -o square.out
|
||||
/home/user/hip/bin/hipcc -use-staticlib square.cpp -o square.out.static
|
||||
```
|
||||
- Execute file
|
||||
```
|
||||
$ ./square.out
|
||||
info: running on device Navi 14 [Radeon Pro W5500]
|
||||
info: allocate host mem ( 7.63 MB)
|
||||
info: allocate device mem ( 7.63 MB)
|
||||
info: copy Host2Device
|
||||
info: launch 'vector_square' kernel
|
||||
info: copy Device2Host
|
||||
info: check result
|
||||
PASSED!
|
||||
```
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
# Build script for the hipBusBandwidth HIP utility.
#
# cmake_minimum_required() is called before project() so that policy defaults
# are established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)

project(hipBusBandwidth)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip. REQUIRED stops configuration here when the package is missing,
# instead of continuing with an empty HIP_HIPCC_EXECUTABLE.
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
# NOTE(review): CMAKE_CXX_LINKER does not appear to be a documented CMake
# variable; kept unchanged for compatibility - confirm whether it is needed.
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Default to a Release build, but do not override a build type the user
# selected explicitly (e.g. -DCMAKE_BUILD_TYPE=Debug).
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(hipBusBandwidth hipBusBandwidth.cpp ResultDatabase.cpp)

# Link with HIP (PRIVATE: nothing consumes this executable's link interface)
target_link_libraries(hipBusBandwidth PRIVATE hip::host)
|
||||
@@ -12,7 +12,7 @@ enum MallocMode { MallocPinned, MallocUnpinned, MallocRegistered };
|
||||
bool p_verbose = false;
|
||||
MallocMode p_malloc_mode = MallocPinned;
|
||||
int p_numa_ctl = -1;
|
||||
int p_iterations = 10;
|
||||
int p_iterations = 0;
|
||||
int p_beatsperiteration = 1;
|
||||
int p_device = 0;
|
||||
int p_detailed = 0;
|
||||
@@ -89,7 +89,9 @@ hipError_t memcopy(void* dst, const void* src, size_t sizeBytes, enum hipMemcpyK
|
||||
int sizes[] = {-64, -256, -512, 1, 2, 4, 8, 16, 32, 64, 128, 256,
|
||||
512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288};
|
||||
int nSizes = sizeof(sizes) / sizeof(int);
|
||||
|
||||
// iterations to be run for the corresponding sizes, less number as the size increases
|
||||
int iterations[] = {1000, 1000, 1000, 1000, 500, 500, 500, 500, 500, 200, 200, 200,
|
||||
200, 200, 100, 100, 100, 100, 50, 50, 50, 20, 20};
|
||||
|
||||
// ****************************************************************************
|
||||
// Function: RunBenchmark_H2D
|
||||
@@ -174,53 +176,48 @@ void RunBenchmark_H2D(ResultDatabase& resultDB) {
|
||||
hipEventCreate(&stop);
|
||||
CHECK_HIP_ERROR();
|
||||
|
||||
// Three passes, forward and backward both
|
||||
for (int pass = 0; pass < p_iterations; pass++) {
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
// Step through sizes forward on even passes and backward on odd
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex;
|
||||
if ((pass % 2) == 0)
|
||||
sizeIndex = i;
|
||||
else
|
||||
sizeIndex = (nSizes - 1) - i;
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex, iterIndex;
|
||||
sizeIndex = i;
|
||||
iterIndex = i;
|
||||
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
|
||||
for (int pass = 0; pass < niter; pass++) {
|
||||
|
||||
hipEventRecord(start, 0);
|
||||
for (int j = 0; j < p_beatsperiteration; j++) {
|
||||
memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice);
|
||||
}
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
// times[sizeIndex] = t;
|
||||
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
|
||||
double speed =
|
||||
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
|
||||
char sizeStr[256];
|
||||
if (p_beatsperiteration > 1) {
|
||||
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
|
||||
sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr,
|
||||
"ms", t);
|
||||
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
hipEventRecord(start, 0);
|
||||
for (int j = 0; j < p_beatsperiteration; j++) {
|
||||
memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice);
|
||||
}
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
// times[sizeIndex] = t;
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
|
||||
double speed =
|
||||
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
|
||||
char sizeStr[256];
|
||||
if (p_beatsperiteration > 1) {
|
||||
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
|
||||
sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr, "ms", t);
|
||||
|
||||
}
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (p_onesize) {
|
||||
@@ -347,53 +344,50 @@ void RunBenchmark_D2H(ResultDatabase& resultDB) {
|
||||
hipEventCreate(&stop);
|
||||
CHECK_HIP_ERROR();
|
||||
|
||||
// Three passes, forward and backward both
|
||||
for (int pass = 0; pass < p_iterations; pass++) {
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
// Step through sizes forward on even passes and backward on odd
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex;
|
||||
if ((pass % 2) == 0)
|
||||
sizeIndex = i;
|
||||
else
|
||||
sizeIndex = (nSizes - 1) - i;
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex, iterIndex;
|
||||
sizeIndex = i;
|
||||
iterIndex = i;
|
||||
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
|
||||
for (int pass = 0; pass < niter; pass++) {
|
||||
|
||||
hipEventRecord(start, 0);
|
||||
for (int j = 0; j < p_beatsperiteration; j++) {
|
||||
memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost);
|
||||
}
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
// times[sizeIndex] = t;
|
||||
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
|
||||
double speed =
|
||||
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
|
||||
char sizeStr[256];
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
if (p_beatsperiteration > 1) {
|
||||
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
|
||||
sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode),
|
||||
sizeStr, "ms", t);
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
hipEventRecord(start, 0);
|
||||
for (int j = 0; j < p_beatsperiteration; j++) {
|
||||
memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost);
|
||||
}
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
// times[sizeIndex] = t;
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
|
||||
double speed =
|
||||
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
|
||||
char sizeStr[256];
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
if (p_beatsperiteration > 1) {
|
||||
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
|
||||
sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode),
|
||||
sizeStr, "ms", t);
|
||||
|
||||
}
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (p_onesize) {
|
||||
@@ -522,43 +516,43 @@ void RunBenchmark_Bidir(ResultDatabase& resultDB) {
|
||||
hipStreamCreate(&stream[0]);
|
||||
hipStreamCreate(&stream[1]);
|
||||
|
||||
// Three passes, forward and backward both
|
||||
for (int pass = 0; pass < p_iterations; pass++) {
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
// Step through sizes forward on even passes and backward on odd
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex;
|
||||
if ((pass % 2) == 0)
|
||||
sizeIndex = i;
|
||||
else
|
||||
sizeIndex = (nSizes - 1) - i;
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex, iterIndex;
|
||||
sizeIndex = i;
|
||||
iterIndex = i;
|
||||
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
|
||||
for (int pass = 0; pass < niter; pass++) {
|
||||
|
||||
hipEventRecord(start, 0);
|
||||
hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]);
|
||||
hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]);
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
hipEventRecord(start, 0);
|
||||
hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]);
|
||||
hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]);
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
|
||||
double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t;
|
||||
char sizeStr[256];
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
resultDB.AddResult(
|
||||
std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr,
|
||||
"GB/sec", speed);
|
||||
resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode),
|
||||
sizeStr, "ms", t);
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
|
||||
double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t;
|
||||
char sizeStr[256];
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
resultDB.AddResult(
|
||||
std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr,
|
||||
"GB/sec", speed);
|
||||
resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode),
|
||||
sizeStr, "ms", t);
|
||||
}
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
@@ -708,66 +702,63 @@ void RunBenchmark_P2P_Unidir(ResultDatabase& resultDB) {
|
||||
hipEventCreate(&stop);
|
||||
CHECK_HIP_ERROR();
|
||||
|
||||
// Three passes, forward and backward both
|
||||
for (int pass = 0; pass < p_iterations; pass++) {
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
// Step through sizes forward on even passes and backward on odd
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex;
|
||||
if ((pass % 2) == 0)
|
||||
sizeIndex = i;
|
||||
else
|
||||
sizeIndex = (nSizes - 1) - i;
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex, iterIndex;
|
||||
sizeIndex = i;
|
||||
iterIndex = i;
|
||||
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
|
||||
for (int pass = 0; pass < niter; pass++) {
|
||||
|
||||
hipDeviceSynchronize();
|
||||
hipDeviceSynchronize();
|
||||
|
||||
hipEventRecord(start, 0);
|
||||
hipEventRecord(start, 0);
|
||||
|
||||
for (int j = 0; j < p_beatsperiteration; j++) {
|
||||
hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice);
|
||||
}
|
||||
for (int j = 0; j < p_beatsperiteration; j++) {
|
||||
hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice);
|
||||
}
|
||||
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventRecord(stop, 0);
|
||||
|
||||
hipEventSynchronize(stop);
|
||||
hipEventSynchronize(stop);
|
||||
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
// times[sizeIndex] = t;
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
// times[sizeIndex] = t;
|
||||
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
|
||||
double speed =
|
||||
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
|
||||
char sizeStr[256];
|
||||
if (p_beatsperiteration > 1) {
|
||||
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
|
||||
p_beatsperiteration);
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
double speed =
|
||||
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
|
||||
char sizeStr[256];
|
||||
if (p_beatsperiteration > 1) {
|
||||
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
|
||||
p_beatsperiteration);
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
|
||||
string cGpu, pGpu;
|
||||
cGpu = gpuIDToString(currentGpu);
|
||||
pGpu = gpuIDToString(peerGpu);
|
||||
string cGpu, pGpu;
|
||||
cGpu = gpuIDToString(currentGpu);
|
||||
pGpu = gpuIDToString(peerGpu);
|
||||
|
||||
resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) +
|
||||
"_gpu" + std::string(pGpu),
|
||||
resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) +
|
||||
"_gpu" + std::string(pGpu),
|
||||
sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) +
|
||||
"_gpu" + std::string(pGpu),
|
||||
resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) +
|
||||
"_gpu" + std::string(pGpu),
|
||||
sizeStr, "ms", t);
|
||||
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -829,71 +820,68 @@ void RunBenchmark_P2P_Bidir(ResultDatabase& resultDB) {
|
||||
hipStreamCreate(&stream[0]);
|
||||
hipStreamCreate(&stream[1]);
|
||||
|
||||
// Three passes, forward and backward both
|
||||
for (int pass = 0; pass < p_iterations; pass++) {
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
// Step through sizes forward on even passes and backward on odd
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex;
|
||||
if ((pass % 2) == 0)
|
||||
sizeIndex = i;
|
||||
else
|
||||
sizeIndex = (nSizes - 1) - i;
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex, iterIndex;
|
||||
sizeIndex = i;
|
||||
iterIndex = i;
|
||||
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
|
||||
for (int pass = 0; pass < niter; pass++) {
|
||||
|
||||
hipDeviceSynchronize();
|
||||
hipDeviceSynchronize();
|
||||
|
||||
hipEventRecord(start, 0);
|
||||
hipEventRecord(start, 0);
|
||||
|
||||
for (int j = 0; j < p_beatsperiteration; j++) {
|
||||
hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes,
|
||||
hipMemcpyDeviceToDevice, stream[0]);
|
||||
hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes,
|
||||
hipMemcpyDeviceToDevice, stream[1]);
|
||||
}
|
||||
|
||||
hipEventRecord(stop, 0);
|
||||
|
||||
hipEventSynchronize(stop);
|
||||
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
// times[sizeIndex] = t;
|
||||
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
|
||||
double speed =
|
||||
(double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) /
|
||||
t;
|
||||
char sizeStr[256];
|
||||
if (p_beatsperiteration > 1) {
|
||||
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
|
||||
p_beatsperiteration);
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
|
||||
string cGpu, pGpu;
|
||||
cGpu = gpuIDToString(currentGpu);
|
||||
pGpu = gpuIDToString(peerGpu);
|
||||
|
||||
resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
|
||||
std::string(pGpu),
|
||||
sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
|
||||
std::string(pGpu),
|
||||
sizeStr, "ms", t);
|
||||
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
for (int j = 0; j < p_beatsperiteration; j++) {
|
||||
hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes,
|
||||
hipMemcpyDeviceToDevice, stream[0]);
|
||||
hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes,
|
||||
hipMemcpyDeviceToDevice, stream[1]);
|
||||
}
|
||||
|
||||
hipEventRecord(stop, 0);
|
||||
|
||||
hipEventSynchronize(stop);
|
||||
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
// times[sizeIndex] = t;
|
||||
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
|
||||
double speed =
|
||||
(double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) /
|
||||
t;
|
||||
char sizeStr[256];
|
||||
if (p_beatsperiteration > 1) {
|
||||
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
|
||||
p_beatsperiteration);
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
|
||||
string cGpu, pGpu;
|
||||
cGpu = gpuIDToString(currentGpu);
|
||||
pGpu = gpuIDToString(peerGpu);
|
||||
|
||||
resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
|
||||
std::string(pGpu),
|
||||
sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
|
||||
std::string(pGpu),
|
||||
sizeStr, "ms", t);
|
||||
|
||||
}
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (p_onesize) {
|
||||
|
||||
@@ -0,0 +1,31 @@
|
||||
# Build script for the hipCommander HIP utility: one host executable plus a
# device code object (nullkernel.hsaco) generated with hipcc --genco.
#
# cmake_minimum_required() is called before project() so that policy defaults
# are established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)

project(hipCommander)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip. REQUIRED stops configuration here when the package is missing,
# instead of continuing with an empty HIP_HIPCC_EXECUTABLE.
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
# NOTE(review): CMAKE_CXX_LINKER does not appear to be a documented CMake
# variable; kept unchanged for compatibility - confirm whether it is needed.
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Default to a Release build, but do not override a build type the user
# selected explicitly (e.g. -DCMAKE_BUILD_TYPE=Debug).
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(hipCommander hipCommander.cpp)

# Generate code object.
# The kernel source is addressed through CMAKE_CURRENT_SOURCE_DIR so the rule
# works for any build directory, not only one that is a direct child of the
# source directory. Declaring OUTPUT/DEPENDS lets the build tool skip the step
# when the kernel source has not changed.
add_custom_command(
  OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/nullkernel.hsaco"
  COMMAND ${HIP_HIPCC_EXECUTABLE} --genco
          "${CMAKE_CURRENT_SOURCE_DIR}/nullkernel.hip.cpp"
          -o "${CMAKE_CURRENT_BINARY_DIR}/nullkernel.hsaco"
  DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/nullkernel.hip.cpp"
  COMMENT "codeobj generated"
  VERBATIM
)

# Named target that drives the rule above; part of the default build (ALL).
add_custom_target(
  codeobj
  ALL
  DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/nullkernel.hsaco"
)

# Make sure the code object exists before the executable is built.
add_dependencies(hipCommander codeobj)

# Link with HIP (PRIVATE: nothing consumes this executable's link interface)
target_link_libraries(hipCommander PRIVATE hip::host)
set_property(TARGET hipCommander PROPERTY CXX_STANDARD 11)
|
||||
@@ -0,0 +1,35 @@
|
||||
# Builds the hipDispatchLatency and hipDispatchEnqueueRateMT samples against
# an installed HIP package and generates the test kernel code object.
cmake_minimum_required(VERSION 3.10)
project(hipDispatchLatency)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops the configure step right here when HIP is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Default to Release, but keep a build type given on the command line
# (for example -DCMAKE_BUILD_TYPE=Debug)
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executables
add_executable(hipDispatchLatency hipDispatchLatency.cpp)
add_executable(hipDispatchEnqueueRateMT hipDispatchEnqueueRateMT.cpp)

# Generate code object; the kernel source is addressed through the source
# directory so the command does not depend on where the build tree lives
add_custom_target(
  codeobj
  ALL
  COMMAND ${HIP_HIPCC_EXECUTABLE} --genco "${CMAKE_CURRENT_SOURCE_DIR}/test_kernel.cpp" -o test_kernel.code
  COMMENT "codeobj generated"
  VERBATIM
)

add_dependencies(hipDispatchLatency codeobj)
add_dependencies(hipDispatchEnqueueRateMT codeobj)

# Link with HIP
target_link_libraries(hipDispatchLatency PRIVATE hip::host)
target_link_libraries(hipDispatchEnqueueRateMT PRIVATE hip::host)
set_property(TARGET hipDispatchLatency PROPERTY CXX_STANDARD 11)
set_property(TARGET hipDispatchEnqueueRateMT PROPERTY CXX_STANDARD 11)
|
||||
@@ -0,0 +1,20 @@
|
||||
# Builds the hipInfo sample against an installed HIP package.
cmake_minimum_required(VERSION 3.10)
project(hipInfo)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops the configure step right here when HIP is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Default to Release, but keep a build type given on the command line
# (for example -DCMAKE_BUILD_TYPE=Debug)
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(hipInfo hipInfo.cpp)

# Link with HIP
target_link_libraries(hipInfo PRIVATE hip::host)
|
||||
@@ -0,0 +1,20 @@
|
||||
# Builds the MatrixTranspose sample against an installed HIP package.
cmake_minimum_required(VERSION 3.10)
project(MatrixTranspose)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops the configure step right here when HIP is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Default to Release, but keep a build type given on the command line
# (for example -DCMAKE_BUILD_TYPE=Debug)
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(MatrixTranspose MatrixTranspose.cpp)

# Link with HIP
target_link_libraries(MatrixTranspose PRIVATE hip::host)
|
||||
@@ -0,0 +1,20 @@
|
||||
# Builds the inline_asm sample against an installed HIP package.
cmake_minimum_required(VERSION 3.10)
project(inline_asm)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops the configure step right here when HIP is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Default to Release, but keep a build type given on the command line
# (for example -DCMAKE_BUILD_TYPE=Debug)
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(inline_asm inline_asm.cpp)

# Link with HIP
target_link_libraries(inline_asm PRIVATE hip::host)
|
||||
@@ -0,0 +1,30 @@
|
||||
# Builds the texture2dDrv sample against an installed HIP package and
# generates the tex2dKernel code object next to the executable.
cmake_minimum_required(VERSION 3.10)
project(texture2dDrv)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops the configure step right here when HIP is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Default to Release, but keep a build type given on the command line
# (for example -DCMAKE_BUILD_TYPE=Debug)
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(texture2dDrv texture2dDrv.cpp)

# Generate code object; the kernel source is addressed through the source
# directory so the command does not depend on where the build tree lives
add_custom_target(
  codeobj
  ALL
  COMMAND ${HIP_HIPCC_EXECUTABLE} --genco "${CMAKE_CURRENT_SOURCE_DIR}/tex2dKernel.cpp" -o tex2dKernel.code
  COMMENT "codeobj generated"
  VERBATIM
)

add_dependencies(texture2dDrv codeobj)

# Link with HIP
target_link_libraries(texture2dDrv PRIVATE hip::host)
|
||||
@@ -0,0 +1,20 @@
|
||||
# Builds the occupancy sample against an installed HIP package.
cmake_minimum_required(VERSION 3.10)
project(occupancy)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops the configure step right here when HIP is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Default to Release, but keep a build type given on the command line
# (for example -DCMAKE_BUILD_TYPE=Debug)
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(occupancy occupancy.cpp)

# Link with HIP
target_link_libraries(occupancy PRIVATE hip::host)
|
||||
@@ -0,0 +1,20 @@
|
||||
# Builds the hipEvent sample against an installed HIP package.
cmake_minimum_required(VERSION 3.10)
project(hipEvent)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops the configure step right here when HIP is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Default to Release, but keep a build type given on the command line
# (for example -DCMAKE_BUILD_TYPE=Debug)
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(hipEvent hipEvent.cpp)

# Link with HIP
target_link_libraries(hipEvent PRIVATE hip::host)
|
||||
@@ -0,0 +1,20 @@
|
||||
# Builds the sharedMemory sample against an installed HIP package.
cmake_minimum_required(VERSION 3.10)
project(sharedMemory)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops the configure step right here when HIP is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Default to Release, but keep a build type given on the command line
# (for example -DCMAKE_BUILD_TYPE=Debug)
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(sharedMemory sharedMemory.cpp)

# Link with HIP
target_link_libraries(sharedMemory PRIVATE hip::host)
|
||||
@@ -0,0 +1,20 @@
|
||||
# Builds the shfl sample against an installed HIP package.
cmake_minimum_required(VERSION 3.10)
project(shfl)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops the configure step right here when HIP is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Default to Release, but keep a build type given on the command line
# (for example -DCMAKE_BUILD_TYPE=Debug)
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(shfl shfl.cpp)

# Link with HIP
target_link_libraries(shfl PRIVATE hip::host)
|
||||
@@ -0,0 +1,19 @@
|
||||
# Builds the 2dshfl sample against an installed HIP package.
cmake_minimum_required(VERSION 3.10)
project(2dshfl)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops the configure step right here when HIP is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Create the executable
add_executable(2dshfl 2dshfl.cpp)

# Link with HIP
target_link_libraries(2dshfl PRIVATE hip::host)
|
||||
@@ -0,0 +1,19 @@
|
||||
# Builds the dynamic_shared sample against an installed HIP package.
cmake_minimum_required(VERSION 3.10)
project(dynamic_shared)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops the configure step right here when HIP is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Create the executable
add_executable(dynamic_shared dynamic_shared.cpp)

# Link with HIP
target_link_libraries(dynamic_shared PRIVATE hip::host)
|
||||
@@ -0,0 +1,19 @@
|
||||
# Builds the stream sample against an installed HIP package.
cmake_minimum_required(VERSION 3.10)
project(stream)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops the configure step right here when HIP is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Create the executable
add_executable(stream stream.cpp)

# Link with HIP
target_link_libraries(stream PRIVATE hip::host)
|
||||
@@ -0,0 +1,19 @@
|
||||
# Builds the peer2peer sample against an installed HIP package.
cmake_minimum_required(VERSION 3.10)
project(peer2peer)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops the configure step right here when HIP is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Create the executable
add_executable(peer2peer peer2peer.cpp)

# Link with HIP
target_link_libraries(peer2peer PRIVATE hip::host)
|
||||
@@ -0,0 +1,19 @@
|
||||
# Builds the unroll sample against an installed HIP package.
cmake_minimum_required(VERSION 3.10)
project(unroll)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops the configure step right here when HIP is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

# Create the executable
add_executable(unroll unroll.cpp)

# Link with HIP
target_link_libraries(unroll PRIVATE hip::host)
|
||||
@@ -0,0 +1,27 @@
|
||||
Build procedure
|
||||
|
||||
We provide a Makefile and a CMakeLists.txt so that each sample can be built separately.
|
||||
|
||||
1.Makefile supports shared lib of hip-rocclr runtime and nvcc.
|
||||
|
||||
To build a sample, just type in sample folder,
|
||||
|
||||
make
|
||||
|
||||
|
||||
|
||||
2.CMakeLists.txt can support shared and static libs of hip-rocclr runtime.
|
||||
|
||||
To build a sample, type in sample folder,
|
||||
|
||||
mkdir build (if build folder is missing)
|
||||
|
||||
cd build
|
||||
|
||||
cmake ..
|
||||
|
||||
make
|
||||
|
||||
To build a debug version instead, run:
|
||||
|
||||
cmake -DCMAKE_BUILD_TYPE=Debug ..
|
||||
Обычный файл → Исполняемый файл
@@ -303,6 +303,7 @@ macro(MAKE_TEST _config exe)
|
||||
add_test(NAME ${testname} CONFIGURATIONS ${_config} COMMAND ${PROJECT_BINARY_DIR}/${exe} ${ARGN})
|
||||
endif()
|
||||
set_tests_properties(${testname} PROPERTIES PASS_REGULAR_EXPRESSION "PASSED" ENVIRONMENT HIP_PATH=${HIP_ROOT_DIR})
|
||||
set_tests_properties(${testname} PROPERTIES SKIP_RETURN_CODE 127 ENVIRONMENT HIP_PATH=${HIP_ROOT_DIR})
|
||||
endmacro()
|
||||
|
||||
macro(MAKE_NAMED_TEST _config exe testname)
|
||||
|
||||
@@ -0,0 +1,747 @@
|
||||
/*
|
||||
Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include "test_common.h"
|
||||
#include <hip/hip_vector_types.h>
|
||||
#include <hip/math_functions.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <map>
|
||||
|
||||
// One view of the Mandelbrot set: the point (x, y) the view is built around
// and the width of the region that is rendered.
struct coordRec {
  double x;
  double y;
  double width;
};

// Views exercised by the test.
coordRec coords[] = {
    {0.0, 0.0, 4.0},                                     // Whole set
    {0.0, 0.0, 0.00001},                                 // All black
    {-0.0180789661868, 0.6424294066162, 0.00003824140},  // Hit detail
};

// Number of entries in coords[].
static unsigned int numCoords = sizeof(coords) / sizeof(coords[0]);
|
||||
|
||||
template <typename T>
|
||||
__global__ void float_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep,
|
||||
uint maxIter) {
|
||||
|
||||
#pragma FP_CONTRACT ON
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
float x0 = (float)(xPos + xStep*i);
|
||||
float y0 = (float)(yPos + yStep*j);
|
||||
|
||||
float x = x0;
|
||||
float y = y0;
|
||||
|
||||
uint iter = 0;
|
||||
float tmp;
|
||||
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
|
||||
tmp = x;
|
||||
x = fma(-y,y,fma(x,x,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
}
|
||||
|
||||
out[tid] = iter;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
|
||||
T yPos, T xStep, T yStep, uint maxIter) {
|
||||
|
||||
#pragma FP_CONTRACT ON
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
float x0 = (float)(xPos + xStep*(float)i);
|
||||
float y0 = (float)(yPos + yStep*(float)j);
|
||||
|
||||
float x = x0;
|
||||
float y = y0;
|
||||
|
||||
#define FAST
|
||||
uint iter = 0;
|
||||
float tmp;
|
||||
int stay;
|
||||
int ccount = 0;
|
||||
stay = (x*x+y*y) <= 4.0;
|
||||
float savx = x;
|
||||
float savy = y;
|
||||
#ifdef FAST
|
||||
for (iter = 0; (iter < maxIter); iter+=16) {
|
||||
#else
|
||||
for (iter = 0; stay && (iter < maxIter); iter+=16) {
|
||||
#endif
|
||||
x = savx;
|
||||
y = savy;
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
stay = (x*x+y*y) <= 4.0;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
ccount += stay*16;
|
||||
#ifdef FAST
|
||||
if (!stay)
|
||||
break;
|
||||
#endif
|
||||
}
|
||||
// Handle remainder
|
||||
if (!stay) {
|
||||
iter = 16;
|
||||
do {
|
||||
x = savx;
|
||||
y = savy;
|
||||
stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
|
||||
tmp = x;
|
||||
x = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
ccount += stay;
|
||||
iter--;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
} while (stay && iter);
|
||||
}
|
||||
|
||||
|
||||
out[tid] = (uint)ccount;
|
||||
|
||||
};
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void double_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep,
|
||||
uint maxIter) {
|
||||
|
||||
#pragma FP_CONTRACT ON
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
double x0 = (double)(xPos + xStep*i);
|
||||
double y0 = (double)(yPos + yStep*j);
|
||||
|
||||
double x = x0;
|
||||
double y = y0;
|
||||
|
||||
uint iter = 0;
|
||||
double tmp;
|
||||
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
|
||||
tmp = x;
|
||||
x = fma(-y,y,fma(x,x,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
}
|
||||
out[tid] = iter;
|
||||
};
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
|
||||
T yPos, T xStep, T yStep, uint maxIter) {
|
||||
|
||||
#pragma FP_CONTRACT ON
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
double x0 = (double)(xPos + xStep*(double)i);
|
||||
double y0 = (double)(yPos + yStep*(double)j);
|
||||
|
||||
double x = x0;
|
||||
double y = y0;
|
||||
|
||||
#define FAST
|
||||
uint iter = 0;
|
||||
double tmp;
|
||||
int stay;
|
||||
int ccount = 0;
|
||||
stay = (x*x+y*y) <= 4.0;
|
||||
double savx = x;
|
||||
double savy = y;
|
||||
#ifdef FAST
|
||||
for (iter = 0; (iter < maxIter); iter+=16)
|
||||
#else
|
||||
for (iter = 0; stay && (iter < maxIter); iter+=16)
|
||||
#endif
|
||||
{
|
||||
x = savx;
|
||||
y = savy;
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
stay = (x*x+y*y) <= 4.0;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
ccount += stay*16;
|
||||
#ifdef FAST
|
||||
if (!stay)
|
||||
break;
|
||||
#endif
|
||||
}
|
||||
// Handle remainder
|
||||
if (!stay) {
|
||||
iter = 16;
|
||||
do {
|
||||
x = savx;
|
||||
y = savy;
|
||||
stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
|
||||
tmp = x;
|
||||
x = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
ccount += stay;
|
||||
iter--;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
}
|
||||
while (stay && iter);
|
||||
|
||||
}
|
||||
out[tid] = (uint)ccount;
|
||||
};
|
||||
|
||||
// Offset into expectedIters[] at which the second half of the table starts.
static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15;

// Expected total iteration counts for each kernel run at each coord.
// Laid out one triple per row; the middle value of every triple is the same.
unsigned long long expectedIters[] = {
    203277748ull, 2147483648ull, 120254651ull,
    203277748ull, 2147483648ull, 120254651ull,
    203277748ull, 2147483648ull, 120254651ull,
    203315114ull, 2147483648ull, 120042599ull,
    203315114ull, 2147483648ull, 120042599ull,
    203280620ull, 2147483648ull, 120485704ull,
    203280620ull, 2147483648ull, 120485704ull,
    203280620ull, 2147483648ull, 120485704ull,
    203315114ull, 2147483648ull, 120042599ull,
    203315114ull, 2147483648ull, 120042599ull};
|
||||
|
||||
class hipPerfMandelBrot {
|
||||
public:
|
||||
hipPerfMandelBrot();
|
||||
~hipPerfMandelBrot();
|
||||
|
||||
void setNumKernels(unsigned int num) {
|
||||
numKernels = num;
|
||||
}
|
||||
|
||||
unsigned int getNumKernels() {
|
||||
return numKernels;
|
||||
}
|
||||
|
||||
void setNumStreams(unsigned int num) {
|
||||
numStreams = num;
|
||||
}
|
||||
unsigned int getNumStreams() {
|
||||
return numStreams;
|
||||
}
|
||||
|
||||
void open(int deviceID);
|
||||
void run(unsigned int testCase, unsigned int deviceId);
|
||||
void printResults(void);
|
||||
|
||||
// array of funtion pointers
|
||||
typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t* streams, int blocks,
|
||||
int threads_per_block, int kernelCnt);
|
||||
|
||||
// Wrappers
|
||||
void float_mad(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t* streams,
|
||||
int blocks, int threads_per_block, int kernelCnt);
|
||||
|
||||
void float_mandel_unroll(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t* streams,
|
||||
int blocks, int threads_per_block, int kernelCnt);
|
||||
|
||||
void double_mad(uint *out, uint width, float xPos, float yPos, float xStep,
|
||||
float yStep, uint maxIter, hipStream_t* streams, int blocks,
|
||||
int threads_per_block, int kernelCnt);
|
||||
|
||||
void double_mandel_unroll(uint *out, uint width, float xPos, float yPos, float xStep,
|
||||
float yStep, uint maxIter, hipStream_t* streams, int blocks,
|
||||
int threads_per_block, int kernelCnt);
|
||||
|
||||
hipStream_t streams[2];
|
||||
|
||||
private:
|
||||
void setData(void *ptr, unsigned int value);
|
||||
void checkData(uint *ptr);
|
||||
|
||||
unsigned int numKernels;
|
||||
unsigned int numStreams;
|
||||
|
||||
std::map<std::string, std::vector<double>> results;
|
||||
unsigned int width_;
|
||||
unsigned int bufSize;
|
||||
unsigned int maxIter;
|
||||
unsigned int coordIdx;
|
||||
volatile unsigned long long totalIters = 0;
|
||||
int numCUs;
|
||||
static const unsigned int numLoops = 10;
|
||||
};
|
||||
|
||||
|
||||
// Nothing to set up at construction; run() assigns the per-test state itself.
hipPerfMandelBrot::hipPerfMandelBrot() {}
|
||||
|
||||
// Nothing to release here; run() destroys its streams and frees its buffers.
hipPerfMandelBrot::~hipPerfMandelBrot() {}
|
||||
|
||||
void hipPerfMandelBrot::open(int deviceId) {
|
||||
|
||||
|
||||
int nGpu = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&nGpu));
|
||||
if (nGpu < 1) {
|
||||
std::cout << "info: didn't find any GPU! skipping the test!\n";
|
||||
passed();
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
HIPCHECK(hipSetDevice(deviceId));
|
||||
hipDeviceProp_t props = {0};
|
||||
HIPCHECK(hipGetDeviceProperties(&props, deviceId));
|
||||
std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
|
||||
<< " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
|
||||
<< std::endl;
|
||||
|
||||
numCUs = props.multiProcessorCount;
|
||||
}
|
||||
|
||||
|
||||
void hipPerfMandelBrot::printResults() {
|
||||
|
||||
int numkernels = getNumKernels();
|
||||
int numStreams = getNumStreams();
|
||||
|
||||
std::cout << "\n" <<"Measured perf for kernels in GFLOPS on "
|
||||
<< numStreams << " streams (s)" << std::endl;
|
||||
|
||||
std::map<std::string, std::vector<double>>:: iterator itr;
|
||||
for (itr = results.begin(); itr != results.end(); itr++) {
|
||||
std::cout << "\n" << std::setw(20) << itr->first << " ";
|
||||
for(auto i : results[itr->first]) {
|
||||
std::cout << std::setw(10) << i << " ";
|
||||
}
|
||||
}
|
||||
results.clear();
|
||||
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
|
||||
// Wrappers for the kernel launches
|
||||
// Launches float_mad_kernel<float> on the stream chosen by kernelCnt.
// Note that the member width_ is passed to the kernel, not the `width` argument.
void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos, float yPos, float xStep,
                                  float yStep, uint maxIter, hipStream_t* streams,
                                  int blocks, int threads_per_block, int kernelCnt) {
  // Spread successive launches over the configured streams.
  const int streamCount = getNumStreams();
  hipStream_t target = streams[kernelCnt % streamCount];

  hipLaunchKernelGGL(float_mad_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
                     target, out, width_, xPos, yPos, xStep, yStep, maxIter);
}
|
||||
|
||||
|
||||
// Launches float_mandel_unroll_kernel<float> on the stream chosen by kernelCnt.
// Note that the member width_ is passed to the kernel, not the `width` argument.
void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos, float yPos,
                                            float xStep, float yStep, uint maxIter,
                                            hipStream_t * streams, int blocks,
                                            int threads_per_block, int kernelCnt) {
  // Spread successive launches over the configured streams.
  const int streamCount = getNumStreams();
  hipStream_t target = streams[kernelCnt % streamCount];

  hipLaunchKernelGGL(float_mandel_unroll_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
                     target, out, width_, xPos, yPos, xStep, yStep, maxIter);
}
|
||||
|
||||
|
||||
// Launches double_mad_kernel<double> on the stream chosen by kernelCnt.
// Note that the member width_ is passed to the kernel, not the `width` argument.
void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos, float yPos,
                                   float xStep, float yStep, uint maxIter,
                                   hipStream_t * streams, int blocks,
                                   int threads_per_block, int kernelCnt) {
  // Spread successive launches over the configured streams.
  const int streamCount = getNumStreams();
  hipStream_t target = streams[kernelCnt % streamCount];

  hipLaunchKernelGGL(double_mad_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
                     target, out, width_, xPos, yPos, xStep, yStep, maxIter);
}
|
||||
|
||||
|
||||
// Launches double_mandel_unroll_kernel<double> on the stream chosen by
// kernelCnt. The previous code launched the float unroll kernel here, so the
// "double_unroll" measurement never ran double-precision math.
// Note that the member width_ is passed to the kernel, not the `width` argument.
void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos, float yPos,
                                             float xStep, float yStep, uint maxIter,
                                             hipStream_t * streams, int blocks,
                                             int threads_per_block, int kernelCnt) {
  // Spread successive launches over the configured streams.
  int streamCnt = getNumStreams();
  hipLaunchKernelGGL(double_mandel_unroll_kernel<double>, dim3(blocks), dim3(threads_per_block),
                     0, streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep,
                     maxIter);
}
|
||||
|
||||
|
||||
void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
|
||||
|
||||
unsigned int numStreams = getNumStreams();
|
||||
|
||||
funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
|
||||
&hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
|
||||
|
||||
// Maximum iteration count
|
||||
maxIter = 32768;
|
||||
|
||||
uint * hPtr[numKernels];
|
||||
uint * dPtr[numKernels];
|
||||
|
||||
// Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
|
||||
width_ = 256;
|
||||
|
||||
bufSize = width_ * width_ * sizeof(uint);
|
||||
|
||||
// Create streams for concurrency
|
||||
for (uint i = 0; i < numStreams; i++) {
|
||||
HIPCHECK(hipStreamCreate(&streams[i]));
|
||||
}
|
||||
|
||||
|
||||
// Allocate memory on the host and device
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
|
||||
setData(hPtr[i], 0xdeadbeef);
|
||||
HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
|
||||
}
|
||||
|
||||
|
||||
// Prepare kernel launch parameters
|
||||
int threads = (bufSize/sizeof(uint));
|
||||
int threads_per_block = 64;
|
||||
int blocks = (threads/threads_per_block) + (threads % threads_per_block);
|
||||
|
||||
float xStep = (float)(coords[coordIdx].width / (double)width_);
|
||||
float yStep = (float)(-coords[coordIdx].width / (double)width_);
|
||||
float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
|
||||
float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
|
||||
|
||||
// Copy memory asynchronously and concurrently from host to device
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
// Synchronize to make sure all the copies are completed
|
||||
HIPCHECK(hipStreamSynchronize(0));
|
||||
|
||||
int kernelIdx;
|
||||
if(testCase == 0 || testCase == 5 || testCase == 10) {
|
||||
kernelIdx = 0;
|
||||
}
|
||||
|
||||
else if(testCase == 1 || testCase == 6 || testCase == 11) {
|
||||
kernelIdx = 1;
|
||||
}
|
||||
else if(testCase == 2 || testCase == 7 || testCase == 12) {
|
||||
kernelIdx = 2;
|
||||
}
|
||||
else if(testCase == 3 || testCase == 8 || testCase == 13){
|
||||
kernelIdx = 3;
|
||||
}
|
||||
|
||||
|
||||
double totalTime = 0.0;
|
||||
|
||||
for (unsigned int k = 0; k < numLoops; k++) {
|
||||
|
||||
coordIdx = testCase % numCoords;
|
||||
|
||||
if ((testCase == 0 || testCase == 1 || testCase == 2 ||
|
||||
testCase == 5 || testCase == 6 || testCase == 7 ||
|
||||
testCase == 10 || testCase == 11 || testCase == 12)) {
|
||||
float xStep = (float)(coords[coordIdx].width / (double)width_);
|
||||
float yStep = (float)(-coords[coordIdx].width / (double)width_);
|
||||
float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
|
||||
float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
|
||||
|
||||
// Time the kernel execution
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
|
||||
threads_per_block, i);
|
||||
}
|
||||
|
||||
|
||||
// Synchronize all the concurrent streams to have completed execution
|
||||
HIPCHECK(hipStreamSynchronize(0));
|
||||
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
totalTime += all_kernel_time.count();
|
||||
|
||||
}
|
||||
|
||||
|
||||
else {
|
||||
double xStep = coords[coordIdx].width / (double)width_;
|
||||
double yStep = -coords[coordIdx].width / (double)width_;
|
||||
double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
|
||||
double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
|
||||
|
||||
// Time the kernel execution
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
|
||||
threads_per_block, i);
|
||||
}
|
||||
|
||||
|
||||
// Synchronize all the concurrent streams to have completed execution
|
||||
HIPCHECK(hipStreamSynchronize(0));
|
||||
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
totalTime += all_kernel_time.count();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
// Copy data back from device to the host
|
||||
for(uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
|
||||
for(uint i = 0; i < numKernels; i++) {
|
||||
checkData(hPtr[i]);
|
||||
|
||||
int j =0;
|
||||
while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) {
|
||||
j++;
|
||||
}
|
||||
|
||||
if(j==30) {
|
||||
std::cout << "Incorrect iteration count detected. ";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Compute GFLOPS. There are 7 FLOPs per iteration
|
||||
double perf = ((double)(totalIters*numKernels) * 7 * (double)(1e-09)) /
|
||||
(totalTime / (double)numLoops);
|
||||
|
||||
|
||||
std::vector<std::string> kernelName = {"float", "float_unroll",
|
||||
"double", "double_unroll"};
|
||||
|
||||
// Print results except for Warm-up kernel
|
||||
if(testCase!=100) {
|
||||
results[kernelName[testCase % 4]].push_back(perf);
|
||||
}
|
||||
|
||||
|
||||
for(uint i = 0 ; i < numStreams; i++) {
|
||||
HIPCHECK(hipStreamDestroy(streams[i]));
|
||||
}
|
||||
|
||||
|
||||
// Free host and device memory
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipFree(hPtr[i]));
|
||||
HIPCHECK(hipFree(dPtr[i]));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Fills the width_ * width_ unsigned ints starting at ptr with `value`.
void hipPerfMandelBrot::setData(void *ptr, unsigned int value) {
  unsigned int *words = (unsigned int *)ptr;
  const unsigned int count = width_ * width_;
  for (unsigned int idx = 0; idx < count; ++idx) {
    words[idx] = value;
  }
}
|
||||
|
||||
|
||||
// Sums the first width_ * width_ entries of `ptr` into the member totalIters
// (the member is reset to zero first).
void hipPerfMandelBrot::checkData(uint *ptr) {
  totalIters = 0;
  unsigned int idx = 0;
  while (idx < width_ * width_) {
    totalIters += ptr[idx];
    ++idx;
  }
}
|
||||
|
||||
|
||||
// Driver: one warm-up run, then twelve runs with a single stream/kernel,
// then twelve runs with two streams/kernels. Results are printed after each
// batch of twelve.
int main(int argc, char* argv[]) {
  hipPerfMandelBrot mandelbrotCompute;
  int deviceId = 0;

  mandelbrotCompute.open(deviceId);

  // Phase 0: warm-up kernel - default stream executes serially.
  mandelbrotCompute.setNumStreams(1);
  mandelbrotCompute.setNumKernels(1);
  mandelbrotCompute.run(100 /*Random number*/, deviceId);

  // Phase 1: run all - sync.
  for (int caseIdx = 0; caseIdx < 12; ++caseIdx) {
    mandelbrotCompute.setNumStreams(1);
    mandelbrotCompute.setNumKernels(1);
    mandelbrotCompute.run(caseIdx, deviceId);
  }
  mandelbrotCompute.printResults();

  // Phase 2: run all - async.
  for (int caseIdx = 0; caseIdx < 12; ++caseIdx) {
    mandelbrotCompute.setNumStreams(2);
    mandelbrotCompute.setNumKernels(2);
    mandelbrotCompute.run(caseIdx, deviceId);
  }
  mandelbrotCompute.printResults();

  passed();
}
|
||||
@@ -0,0 +1,289 @@
|
||||
/*
|
||||
Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include "test_common.h"
|
||||
|
||||
// One square viewport of the Mandelbrot plane: (x, y) is its centre and
// `width` its edge length (run() derives the corner as centre -/+ width / 2).
struct coordRec {
  double x;
  double y;
  double width;
};

// Viewports the test cycles through (indexed by testCase % numCoords).
static coordRec coords[] = {
    {0.0, 0.0, 0.00001},  // All black
};

// Number of entries in coords[].
static unsigned int numCoords = sizeof(coords) / sizeof(coords[0]);
|
||||
|
||||
// Mandelbrot escape-time kernel: one thread per pixel of a width x width image.
//
//   out           per-pixel iteration counts, indexed by the flat thread id
//   width         image edge length in pixels
//   xPos, yPos    coordinate of pixel (0, 0)
//   xStep, yStep  coordinate increment per pixel column / row
//   maxIter       iteration cap for points that never escape
//
// Threads whose flat id falls outside the width x width image return without
// writing, so a grid that was rounded up past the image size cannot write out
// of bounds (the caller in this file allocates exactly width * width entries).
__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos, float xStep,
                           float yStep, uint maxIter) {
  const uint tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= width * width) {
    return;
  }

  const uint i = tid % width;  // pixel column
  const uint j = tid / width;  // pixel row
  const float x0 = xPos + xStep * i;
  const float y0 = yPos + yStep * j;

  float x = x0;
  float y = y0;

  uint iter = 0;
  float tmp;
  // z = z^2 + c, expressed with fused multiply-adds:
  //   x' = x*x - y*y + x0,  y' = 2*x*y + y0
  for (iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) {
    tmp = x;
    x = fma(-y, y, fma(x, x, x0));
    y = fma(2.0f * tmp, y, y0);
  }

  out[tid] = iter;
}
|
||||
|
||||
class hipPerfDeviceConcurrency {
|
||||
public:
|
||||
hipPerfDeviceConcurrency();
|
||||
~hipPerfDeviceConcurrency();
|
||||
|
||||
void setNumGpus(unsigned int num) {
|
||||
numDevices = num;
|
||||
}
|
||||
unsigned int getNumGpus() {
|
||||
return numDevices;
|
||||
}
|
||||
|
||||
void open(void);
|
||||
void close(void);
|
||||
void run(unsigned int testCase, int numGpus);
|
||||
|
||||
private:
|
||||
void setData(void *ptr, unsigned int value);
|
||||
void checkData(uint *ptr);
|
||||
|
||||
unsigned int numDevices;
|
||||
unsigned int width_;
|
||||
unsigned int bufSize;
|
||||
unsigned int coordIdx;
|
||||
unsigned long long totalIters = 0;
|
||||
};
|
||||
|
||||
|
||||
// Construction and destruction perform no work of their own.
hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() = default;

hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() = default;
|
||||
|
||||
void hipPerfDeviceConcurrency::open(void) {
|
||||
|
||||
|
||||
int nGpu = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&nGpu));
|
||||
setNumGpus(nGpu);
|
||||
if (nGpu < 1) {
|
||||
std::cout << "info: didn't find any GPU! skipping the test!\n";
|
||||
passed();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Counterpart of open(); intentionally empty because run() releases every
// resource it creates.
void hipPerfDeviceConcurrency::close() {
  return;
}
|
||||
|
||||
void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
|
||||
|
||||
|
||||
static int deviceId;
|
||||
uint * hPtr[numGpus];
|
||||
uint * dPtr[numGpus];
|
||||
hipStream_t streams[numGpus];
|
||||
int numCUs[numGpus];
|
||||
unsigned int maxIter[numGpus];
|
||||
unsigned long long expectedIters[numGpus];
|
||||
|
||||
int threads, threads_per_block, blocks;
|
||||
float xStep, yStep, xPos, yPos;
|
||||
|
||||
for(int i = 0; i < numGpus; i++) {
|
||||
|
||||
if(testCase != 0) {
|
||||
deviceId = i;
|
||||
}
|
||||
|
||||
HIPCHECK(hipSetDevice(deviceId));
|
||||
|
||||
hipDeviceProp_t props = {0};
|
||||
HIPCHECK(hipGetDeviceProperties(&props, i));
|
||||
|
||||
if (testCase != 0) {
|
||||
std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
|
||||
<< " with " << props.multiProcessorCount << " CUs" << " and device ID: "
|
||||
<< i << std::endl;
|
||||
}
|
||||
|
||||
numCUs[i] = props.multiProcessorCount;
|
||||
int clkFrequency = 0;
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i));
|
||||
|
||||
clkFrequency =(unsigned int)clkFrequency/1000;
|
||||
|
||||
// Maximum iteration count
|
||||
// maxIter = 8388608 * (engine_clock / 1000).serial execution
|
||||
maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128);
|
||||
maxIter[i] = (maxIter[i] + 15) & ~15;
|
||||
|
||||
// Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
|
||||
width_ = 256;
|
||||
|
||||
bufSize = width_ * width_ * sizeof(uint);
|
||||
|
||||
// Create streams for concurrency
|
||||
HIPCHECK(hipStreamCreate(&streams[i]));
|
||||
|
||||
// Allocate memory on the host and device
|
||||
HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
|
||||
setData(hPtr[i], 0xdeadbeef);
|
||||
HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
|
||||
|
||||
// Prepare kernel launch parameters
|
||||
threads = (bufSize/sizeof(uint));
|
||||
threads_per_block = 64;
|
||||
blocks = (threads/threads_per_block) + (threads % threads_per_block);
|
||||
|
||||
coordIdx = testCase % numCoords;
|
||||
xStep = (float)(coords[coordIdx].width / (double)width_);
|
||||
yStep = (float)(-coords[coordIdx].width / (double)width_);
|
||||
xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
|
||||
yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
|
||||
|
||||
// Copy memory from host to device
|
||||
HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
|
||||
|
||||
}
|
||||
|
||||
// Time the kernel execution
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
|
||||
for(int i = 0; i < numGpus; i++) {
|
||||
|
||||
if(testCase != 0) {
|
||||
deviceId = i;
|
||||
}
|
||||
|
||||
HIPCHECK(hipSetDevice(deviceId));
|
||||
|
||||
hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i],
|
||||
dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]);
|
||||
|
||||
}
|
||||
|
||||
for(int i = 0; i < numGpus; i++) {
|
||||
HIPCHECK(hipStreamSynchronize(0));
|
||||
}
|
||||
|
||||
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
|
||||
for(int i = 0; i < numGpus; i++) {
|
||||
|
||||
if(testCase != 0) {
|
||||
deviceId = i;
|
||||
}
|
||||
HIPCHECK(hipSetDevice(deviceId));
|
||||
|
||||
// Copy data back from device to the host
|
||||
HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
|
||||
|
||||
checkData(hPtr[i]);
|
||||
expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i];
|
||||
|
||||
if (testCase != 0) {
|
||||
checkData(hPtr[i]);
|
||||
if(totalIters != expectedIters[i]) {
|
||||
std::cout << "Incorrect iteration count detected" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
HIPCHECK(hipStreamDestroy(streams[i]));
|
||||
|
||||
// Free host and device memory
|
||||
HIPCHECK(hipFree(hPtr[i]));
|
||||
HIPCHECK(hipFree(dPtr[i]));
|
||||
}
|
||||
|
||||
if (testCase != 0) {
|
||||
std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): "
|
||||
<< all_kernel_time.count() << " (s) " << '\n' << std::endl;
|
||||
}
|
||||
|
||||
if(testCase == 0) {
|
||||
deviceId++;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Writes `value` into each of the first width_ * width_ 32-bit words of the
// buffer at `ptr`.
void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) {
  unsigned int *words = (unsigned int *)ptr;
  unsigned int idx = 0;
  while (idx < width_ * width_) {
    words[idx] = value;
    ++idx;
  }
}
|
||||
|
||||
|
||||
// Resets totalIters and accumulates into it the first width_ * width_
// entries of `ptr`.
void hipPerfDeviceConcurrency::checkData(uint *ptr) {
  totalIters = 0;
  unsigned int idx = 0;
  while (idx < width_ * width_) {
    totalIters += ptr[idx];
    ++idx;
  }
}
|
||||
|
||||
|
||||
// Driver: warm up every device, time the kernel on one device, then time it
// on all devices at once.
int main(int argc, char* argv[]) {
  hipPerfDeviceConcurrency deviceConcurrency;
  deviceConcurrency.open();

  const int nGpu = deviceConcurrency.getNumGpus();

  // Test case 0 is the warm-up run; it is issued once per device.
  int testCase = 0;
  int warmed = 0;
  while (warmed < nGpu) {
    deviceConcurrency.run(testCase, 1);
    ++warmed;
  }

  // Test case 1: time for kernel on 1 device.
  testCase += 1;
  deviceConcurrency.run(testCase, 1);

  // Test case 2: time for kernel on all available devices.
  testCase += 1;
  deviceConcurrency.run(testCase, nGpu);

  passed();
}
|
||||
@@ -57,6 +57,15 @@ void matrixTransposeCPUReference(T* output, T* input, const unsigned int width)
|
||||
}
|
||||
}
|
||||
|
||||
// Per-type offset added to the test data. Signed types get a small constant;
// each unsigned type gets one more than the maximum of its signed counterpart,
// so the data exercises the upper half of the unsigned range.
void getFactor(int& fact) {
  fact = 101;
}

void getFactor(unsigned int& fact) {
  const unsigned int signedMax = static_cast<unsigned int>(INT32_MAX);
  fact = signedMax + 1;
}

void getFactor(float& fact) {
  fact = 2.5;
}

void getFactor(double& fact) {
  fact = 2.5;
}

void getFactor(long& fact) {
  fact = 202;
}

void getFactor(unsigned long& fact) {
  const unsigned long signedMax = static_cast<unsigned long>(__LONG_MAX__);
  fact = signedMax + 1;
}

void getFactor(long long& fact) {
  fact = 303;
}

void getFactor(unsigned long long& fact) {
  const unsigned long long signedMax = static_cast<unsigned long long>(__LONG_LONG_MAX__);
  fact = signedMax + 1;
}
|
||||
|
||||
template<typename T>
|
||||
void runTest() {
|
||||
T* Matrix;
|
||||
@@ -77,8 +86,10 @@ void runTest() {
|
||||
cpuTransposeMatrix = (T*)malloc(NUM * sizeof(T));
|
||||
|
||||
// initialize the input data
|
||||
T factor;
|
||||
getFactor(factor);
|
||||
for (i = 0; i < NUM; i++) {
|
||||
Matrix[i] = (T)i * 10l;
|
||||
Matrix[i] = (T)i + factor;
|
||||
}
|
||||
|
||||
// allocate the memory on the device side
|
||||
@@ -124,7 +135,11 @@ void runTest() {
|
||||
// Runs the test once per element type.
int main() {
  // Original coverage: int and the floating-point types.
  runTest<int>();
  runTest<float>();
  runTest<double>();

  // Wider signed integers.
  runTest<long>();
  runTest<long long>();

  // Unsigned integers.
  runTest<unsigned int>();
  runTest<unsigned long>();
  runTest<unsigned long long>();

  passed();
}
|
||||
|
||||
@@ -47,13 +47,31 @@ __global__ void shflUpSum(T* a, int size) {
|
||||
a[threadIdx.x] = val;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void shflXorSum(T* a, int size) {
|
||||
T val = a[threadIdx.x];
|
||||
for (int i = size/2; i > 0; i /= 2)
|
||||
val += __shfl_xor(val, i, size);
|
||||
a[threadIdx.x] = val;
|
||||
}
|
||||
|
||||
// Per-type offset added to the shuffle test data. Signed types get a small
// constant; each unsigned type gets one more than the maximum of its signed
// counterpart, so the data exercises the upper half of the unsigned range.
void getFactor(int& fact) {
  fact = 101;
}

void getFactor(unsigned int& fact) {
  const unsigned int signedMax = static_cast<unsigned int>(INT32_MAX);
  fact = signedMax + 1;
}

void getFactor(float& fact) {
  fact = 2.5;
}

void getFactor(double& fact) {
  fact = 2.5;
}

void getFactor(long& fact) {
  fact = 202;
}

void getFactor(unsigned long& fact) {
  const unsigned long signedMax = static_cast<unsigned long>(__LONG_MAX__);
  fact = signedMax + 1;
}

void getFactor(long long& fact) {
  fact = 303;
}

void getFactor(unsigned long long& fact) {
  const unsigned long long signedMax = static_cast<unsigned long long>(__LONG_LONG_MAX__);
  fact = signedMax + 1;
}
|
||||
|
||||
template <typename T>
|
||||
void runTestShflUp() {
|
||||
const int size = 32;
|
||||
T a[size];
|
||||
T cpuSum = 0;
|
||||
T factor; getFactor(factor);
|
||||
for (int i = 0; i < size; i++) {
|
||||
a[i] = i;
|
||||
a[i] = i + factor;
|
||||
cpuSum += a[i];
|
||||
}
|
||||
T* d_a;
|
||||
@@ -73,8 +91,9 @@ void runTestShflDown() {
|
||||
const int size = 32;
|
||||
T a[size];
|
||||
T cpuSum = 0;
|
||||
T factor; getFactor(factor);
|
||||
for (int i = 0; i < size; i++) {
|
||||
a[i] = i;
|
||||
a[i] = i + factor;
|
||||
cpuSum += a[i];
|
||||
}
|
||||
T* d_a;
|
||||
@@ -84,19 +103,58 @@ void runTestShflDown() {
|
||||
hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault);
|
||||
if (a[0] != cpuSum) {
|
||||
hipFree(d_a);
|
||||
failed("Shfl Up Sum did not match.");
|
||||
failed("Shfl Down Sum did not match.");
|
||||
}
|
||||
hipFree(d_a);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void runTestShflXor() {
|
||||
const int size = 32;
|
||||
T a[size];
|
||||
T cpuSum = 0;
|
||||
T factor; getFactor(factor);
|
||||
for (int i = 0; i < size; i++) {
|
||||
a[i] = i + factor;
|
||||
cpuSum += a[i];
|
||||
}
|
||||
T* d_a;
|
||||
hipMalloc(&d_a, sizeof(T) * size);
|
||||
hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault);
|
||||
hipLaunchKernelGGL(shflXorSum<T>, 1, size, 0, 0, d_a, size);
|
||||
hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault);
|
||||
if (a[0] != cpuSum) {
|
||||
hipFree(d_a);
|
||||
failed("Shfl Xor Sum did not match.");
|
||||
}
|
||||
hipFree(d_a);
|
||||
}
|
||||
int main() {
|
||||
runTestShflUp<int>();
|
||||
runTestShflUp<float>();
|
||||
runTestShflUp<double>();
|
||||
runTestShflUp<long>();
|
||||
runTestShflUp<long long>();
|
||||
runTestShflUp<unsigned int>();
|
||||
runTestShflUp<unsigned long>();
|
||||
runTestShflUp<unsigned long long>();
|
||||
|
||||
runTestShflDown<int>();
|
||||
runTestShflDown<float>();
|
||||
runTestShflDown<double>();
|
||||
runTestShflDown<long>();
|
||||
runTestShflDown<long long>();
|
||||
runTestShflDown<unsigned int>();
|
||||
runTestShflDown<unsigned long>();
|
||||
runTestShflDown<unsigned long long>();
|
||||
|
||||
runTestShflXor<int>();
|
||||
runTestShflXor<float>();
|
||||
runTestShflXor<double>();
|
||||
runTestShflXor<long>();
|
||||
runTestShflXor<long long>();
|
||||
runTestShflXor<unsigned int>();
|
||||
runTestShflXor<unsigned long>();
|
||||
runTestShflXor<unsigned long long>();
|
||||
passed();
|
||||
}
|
||||
|
||||
Обычный файл → Исполняемый файл
@@ -395,6 +395,9 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
if (gpuCount < 2) {
|
||||
printf("P2P application requires atleast 2 gpu devices\n");
|
||||
if (hip_skip_tests_enabled()) {
|
||||
return hip_skip_retcode();
|
||||
}
|
||||
} else {
|
||||
if (p_tests & 0x100) {
|
||||
testPeerHostToDevice(false /*useAsyncCopy*/);
|
||||
|
||||
@@ -0,0 +1,280 @@
|
||||
/*
|
||||
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
// Test Description:
|
||||
/*The general idea of the application is to test how Cooperative Groups kernel
|
||||
launches work when launching too many warps to the target device. This test
|
||||
first queries the nominal warp size of the target device. It then walks through
|
||||
block sizes from 1 thread, 1 warp, 2 warps, ... `maximum_warps_in_a_block`. For
|
||||
each of these, it queries the maximum number of blocks that can fit in each SM.
|
||||
It then queries the number of SMs on the target device. This will yield a
|
||||
calculation for the maximum number of blocks that can be co-scheduled on this
|
||||
device.
|
||||
|
||||
The Cooperative Groups API says that users should not launch more than this
|
||||
many warps (or blocks, etc.) to the target device. This test first tries to
|
||||
launch 2x as many blocks, to confirm that the runtime prevents such a launch
|
||||
by returning a proper error value (`hipErrorCooperativeLaunchTooLarge`).
|
||||
|
||||
It then ensures that trying to launch too large of a kernel invocation does
|
||||
not break the GPU by launching a kernel with exactly the maximum number of
|
||||
blocks.
|
||||
|
||||
Finally, we run the same test for a block size that is larger than the maximum
|
||||
allowed by the device, to ensure that this case is properly detected by the
|
||||
runtime and that nothing breaks.*/
|
||||
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include "test_common.h"
|
||||
|
||||
|
||||
// Fails the test when `errval` is not hipSuccess, or when it differs from the
// value hipGetLastError() reports. `file` and `line` identify the call site
// in the diagnostic output.
static inline void hipCheckAndFail(hipError_t errval,
                                   const char *file, int line) {
  // Fetched up front, before any other HIP call is made in this function.
  const hipError_t last_err = hipGetLastError();

  const bool call_failed = (errval != hipSuccess);
  if (call_failed) {
    std::cerr << "hip error: " << hipGetErrorString(errval) << std::endl;
    std::cerr << " Location: " << file << ":" << line << std::endl;
    failed("");
  }

  const bool mismatch = (last_err != errval);
  if (mismatch) {
    std::cerr << "Error: the return value of a function was not the same "
              << "as the value returned by hipGetLastError()" << std::endl;
    std::cerr << " Location: " << file << ":" << line << std::endl;
    std::cerr << " Function returned: " << hipGetErrorString(errval)
              << " (" << errval << ")" << std::endl;
    std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err)
              << " (" << last_err << ")" << std::endl;
    failed("");
  }
}

// Checks a HIP return value, tagging diagnostics with the caller's location.
#define hipCheckErr(errval) \
  do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)
|
||||
|
||||
// Returns true when `errval` equals `expected_err` and also equals the value
// reported by hipGetLastError(); otherwise prints a diagnostic tagged with
// `file`:`line` and returns false.
static inline bool hipCheckExpected(hipError_t errval,
                                    hipError_t expected_err, const char *file, int line) {
  // Fetched up front, before any other HIP call is made in this function.
  const hipError_t last_err = hipGetLastError();

  if (errval != expected_err) {
    std::cerr << "hip error: " << hipGetErrorString(errval) << std::endl;
    std::cerr << " Location: " << file << ":" << line << std::endl;
    return false;
  }

  if (last_err == errval) {
    return true;
  }

  std::cerr << "Error: the return value of a function was not the same "
            << "as the value returned by hipGetLastError()" << std::endl;
  std::cerr << " Location: " << file << ":" << line << std::endl;
  std::cerr << " Function returned: " << hipGetErrorString(errval)
            << " (" << errval << ")" << std::endl;
  std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err)
            << " (" << last_err << ")" << std::endl;
  return false;
}
|
||||
|
||||
// Returns true when device `device_id` reports cooperative-launch support
// both through hipDeviceGetAttribute and through its device properties.
// Prints the reason to stderr and returns false otherwise.
static bool cooperative_groups_support(int device_id) {
  int cooperative_attribute;
  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
                                 hipDeviceAttributeCooperativeLaunch, device_id));
  if (!cooperative_attribute) {
    std::cerr << "Cooperative launch support not available in ";
    std::cerr << "the device attribute for device " << device_id;
    std::cerr << std::endl;
    return false;
  }

  // Cross-check the same capability via the properties structure.
  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  if (device_properties.cooperativeLaunch == 0) {
    std::cerr << "Cooperative group support not available in ";
    std::cerr << "device properties." << std::endl;
    return false;
  }
  return true;
}
|
||||
|
||||
// Each thread adds the current clock64() reading to its own array slot,
// indexed by its flat global thread id.
__global__ void test_kernel(long long *array) {
  const unsigned int slot = blockIdx.x * blockDim.x + threadIdx.x;
  array[slot] = array[slot] + clock64();
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
hipError_t err;
|
||||
int device_num, FailFlag = 0;
|
||||
// Alocate the host input buffer, and two device-focused buffers that we
|
||||
// will use for our test.
|
||||
unsigned int *dev_array[2];
|
||||
HIPCHECK(hipGetDeviceCount(&device_num));
|
||||
for (int dev = 0; dev < device_num; ++dev) {
|
||||
/*************************************************************************/
|
||||
/* Test whether target device supports cooperative groups ****************/
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
if (!cooperative_groups_support(dev)) {
|
||||
std::cout << "Skipping the test with Pass result.\n";
|
||||
passed();
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* Create the streams we will use in this test. **************************/
|
||||
hipStream_t streams[2];
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipStreamCreate(&streams[i]));
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* We will try to launch more waves than the GPU can fit. ***************/
|
||||
hipDeviceProp_t device_properties;
|
||||
HIPCHECK(hipGetDeviceProperties(&device_properties, dev));
|
||||
int warp_size = device_properties.warpSize;
|
||||
int num_sms = device_properties.multiProcessorCount;
|
||||
int max_num_threads = device_properties.maxThreadsPerBlock;
|
||||
|
||||
// Check single-thread block, all numbers of warps, then too-large block
|
||||
for (int block_size = 0; block_size <= (max_num_threads + warp_size);
|
||||
block_size += warp_size) {
|
||||
if (block_size == 0) {
|
||||
block_size = 1;
|
||||
}
|
||||
int max_blocks_per_sm;
|
||||
// Calculate the device occupancy to know how many blocks can be run.
|
||||
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
||||
&max_blocks_per_sm, test_kernel, block_size, 0,
|
||||
hipOccupancyDefault));
|
||||
|
||||
if ((block_size > max_num_threads) && (max_blocks_per_sm != 0)) {
|
||||
std::cerr << "ERROR! Occupancy API indicated that we can have >0 ";
|
||||
std::cerr << "blocks in a kernel when the block size is too large ";
|
||||
std::cerr << "to work on the device." << std::endl;
|
||||
std::cerr << "This is incorrect, and could possibly lead users ";
|
||||
std::cerr << "to try to launch kernels that will fail." << std::endl;
|
||||
//failed("");
|
||||
FailFlag = 1;
|
||||
break;
|
||||
}
|
||||
|
||||
int desired_blocks = max_blocks_per_sm * num_sms;
|
||||
bool expect_fail = false;
|
||||
if (desired_blocks == 0) {
|
||||
desired_blocks = 1;
|
||||
expect_fail = true;
|
||||
}
|
||||
|
||||
/**********************************************************************/
|
||||
/* Set up data to pass into the kernel ********************************/
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
int test_size;
|
||||
// Case where we expect to fail at launch.
|
||||
if (i == 0) {
|
||||
test_size = 2 * desired_blocks;
|
||||
} else {
|
||||
test_size = desired_blocks;
|
||||
}
|
||||
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]),
|
||||
test_size * block_size * sizeof(long long)));
|
||||
HIPCHECK(hipMemsetAsync(dev_array[i], 0,
|
||||
test_size * block_size * sizeof(long long),
|
||||
streams[i]));
|
||||
}
|
||||
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
/***********************************************************************/
|
||||
/* Launch the kernels **************************************************/
|
||||
void *coop_params[2][1];
|
||||
for (int i = 0; i < 2; i++) {
|
||||
coop_params[i][0] = reinterpret_cast<void*>(&dev_array[i]);
|
||||
}
|
||||
|
||||
err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
|
||||
2 * desired_blocks, block_size,
|
||||
coop_params[0], 0, streams[0]);
|
||||
|
||||
hipError_t expect_to_see;
|
||||
if (expect_fail) {
|
||||
expect_to_see = hipErrorInvalidConfiguration;
|
||||
} else {
|
||||
expect_to_see = hipErrorCooperativeLaunchTooLarge;
|
||||
}
|
||||
if (!hipCheckExpected(err, expect_to_see, __FILE__, __LINE__)) {
|
||||
std::cerr << "ERROR! Tried to launch a cooperative kernel with ";
|
||||
std::cerr << "too many warps." << std::endl;
|
||||
std::cerr << "This SHOULD have failed with the error ";
|
||||
std::cerr << hipGetErrorString(expect_to_see);
|
||||
std::cerr << " (" << expect_to_see << ")." << std::endl;
|
||||
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
|
||||
std::cerr << " (" << err << ")" << std::endl;
|
||||
FailFlag = 1;
|
||||
break;
|
||||
}
|
||||
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
|
||||
desired_blocks, block_size,
|
||||
coop_params[1], 0, streams[1]);
|
||||
|
||||
if (expect_fail) {
|
||||
expect_to_see = hipErrorInvalidConfiguration;
|
||||
} else {
|
||||
expect_to_see = hipSuccess;
|
||||
}
|
||||
if (!hipCheckExpected(err, expect_to_see, __FILE__, __LINE__)) {
|
||||
std::cerr << "ERROR! Tried to launch a cooperative kernel ";
|
||||
std::cerr << "with a normal size, but a block size of ";
|
||||
std::cerr << desired_blocks << std::endl;
|
||||
std::cerr << "This SHOULD have returned ";
|
||||
std::cerr << hipGetErrorString(expect_to_see);
|
||||
std::cerr << " (" << expect_to_see << ")." << std::endl;
|
||||
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
|
||||
std::cerr << " (" << err << ")" << std::endl;
|
||||
FailFlag = 1;
|
||||
break;
|
||||
}
|
||||
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
if (block_size == 1) {
|
||||
block_size = 0;
|
||||
}
|
||||
for (int m = 0; m < 2; ++m) {
|
||||
HIPCHECK(hipFree(dev_array[m]));
|
||||
}
|
||||
}
|
||||
for (int m = 0; m < 2; ++m) {
|
||||
HIPCHECK(hipStreamDestroy(streams[m]));
|
||||
}
|
||||
if (FailFlag == 1) {
|
||||
for (int m = 0; m < 2; ++m) {
|
||||
HIPCHECK(hipFree(dev_array[m]));
|
||||
}
|
||||
failed("");
|
||||
}
|
||||
}
|
||||
passed();
|
||||
}
|
||||
@@ -0,0 +1,283 @@
|
||||
/*
|
||||
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
// Test Description:
|
||||
/*
|
||||
The general idea of the application is to test how Cooperative Groups kernel
|
||||
launches to a stream interact with other kernels being launched to different
|
||||
streams.
|
||||
|
||||
For example: the HIP runtime will force cooperative kernel launches to run
|
||||
serially, even if they are launched to different streams. However,
|
||||
cooperative kernel launches can run in parallel with regular kernels that
|
||||
are launched to other streams. This limitation is so that the cooperative
|
||||
kernels do not conflict with one another for resources and potentially
|
||||
deadlock the system.
|
||||
|
||||
As such, this benchmark tests three situations:
|
||||
|
||||
1. Launching a cooperative kernel by itself to stream[0]
|
||||
2. Launching two cooperative kernels in parallel to stream[0] and stream[1]
|
||||
3. Launching two cooperative kernels in parallel to stream[0] and stream[1]
|
||||
and launching a third non-cooperative kernel to stream[2]
|
||||
|
||||
We time how long it takes to run each of these benchmarks and print it as
|
||||
the output of the benchmark. The kernels themselves are just useless time-
|
||||
wasting code so that the kernel takes a meaningful amount of time on the
|
||||
GPU before it exits. We only launch a single wavefront for each kernel, so
|
||||
any serialization should not be because of GPU occupancy concerns.
|
||||
|
||||
If test #2 takes roughly twice as long as #1, that implies that cooperative
|
||||
kernels are properly serialized with each other by the runtime.
|
||||
|
||||
If test #3 takes the same amount of time as test #2, that implies that
|
||||
regular kernels can properly run in parallel with cooperative kernels.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <chrono>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include "test_common.h"
|
||||
|
||||
// Fails the test when `errval` is not hipSuccess, or when it differs from the
// value hipGetLastError() reports. `file` and `line` identify the call site
// in the diagnostic output.
static inline void hipCheckAndFail(hipError_t errval,
                                   const char *file, int line) {
  // Fetched up front, before any other HIP call is made in this function.
  const hipError_t last_err = hipGetLastError();

  const bool call_failed = (errval != hipSuccess);
  if (call_failed) {
    std::cerr << "hip error: " << hipGetErrorString(errval) << std::endl;
    std::cerr << "Location: " << file << ":" << line << std::endl;
    failed("");
  }

  const bool mismatch = (last_err != errval);
  if (mismatch) {
    std::cerr << "Error: the return value of a function was not the same "
              << "as the value returned by hipGetLastError()" << std::endl;
    std::cerr << "Location: " << file << ":" << line << std::endl;
    std::cerr << "Function returned: " << hipGetErrorString(errval)
              << " (" << errval << ")" << std::endl;
    std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err)
              << " (" << last_err << ")" << std::endl;
    failed("");
  }
}

// Checks a HIP return value, tagging diagnostics with the caller's location.
#define hipCheckErr(errval) \
  do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)
|
||||
|
||||
static int cooperative_groups_support(int device_id) {
|
||||
hipError_t err;
|
||||
int cooperative_attribute;
|
||||
HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
|
||||
hipDeviceAttributeCooperativeLaunch, device_id));
|
||||
if (!cooperative_attribute) {
|
||||
std::cerr << "Cooperative launch support not available in ";
|
||||
std::cerr << "the device attribute for device " << device_id;
|
||||
std::cerr << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
hipDeviceProp_t device_properties;
|
||||
HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
|
||||
if (device_properties.cooperativeLaunch == 0) {
|
||||
std::cerr << "Cooperative group support not available in ";
|
||||
std::cerr << "device properties." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Time-wasting kernel used to give each launch a measurable duration.
//
//   loops - number of delay iterations each thread performs
//   array - device buffer with one element per launched thread; each thread
//           accumulates clock64() readings into its own slot
//
// Every iteration busy-waits until clock64() has advanced by 1000000 ticks
// past the value read at the start of the iteration, then adds the current
// clock64() value into array[rank].
__global__ void test_kernel(uint32_t loops, unsigned long long *array) {
  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;

  // The counter has the same unsigned type as `loops`, so the comparison is
  // not mixed-sign and the counter cannot overflow for large loop counts.
  for (uint32_t i = 0; i < loops; i++) {
    long long start_clock = clock64();
    while (clock64() < (start_clock + 1000000)) {
      // Spin until the delay has elapsed.
    }
    array[rank] += clock64();
  }
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
hipError_t err;
|
||||
/*************************************************************************/
|
||||
int device_num = 0, loops = 1000, FailFlag = 0;
|
||||
/* Create the streams we will use in this test. **************************/
|
||||
hipStream_t streams[3];
|
||||
// Alocate the host input buffer, and two device-focused buffers that we
|
||||
// will use for our test.
|
||||
unsigned long long *dev_array[3];
|
||||
HIPCHECK(hipGetDeviceCount(&device_num));
|
||||
for (int dev = 0; dev < device_num; ++dev) {
|
||||
/*************************************************************************/
|
||||
/* Test whether target device supports cooperative groups ****************/
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
if (!cooperative_groups_support(dev)) {
|
||||
std::cout << "Skipping the test with Pass result.\n";
|
||||
passed();
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* We will launch enough waves to fill up all of the GPU *****************/
|
||||
hipDeviceProp_t device_properties;
|
||||
HIPCHECK(hipGetDeviceProperties(&device_properties, dev));
|
||||
int warp_size = device_properties.warpSize;
|
||||
int num_sms = device_properties.multiProcessorCount;
|
||||
int desired_blocks = 1;
|
||||
std::cout << "Device: " << dev << std::endl;
|
||||
std::cout << "Device name: " << device_properties.name << std::endl;
|
||||
|
||||
int max_blocks_per_sm;
|
||||
// Calculate the device occupancy to know how many blocks can be run.
|
||||
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
|
||||
test_kernel,
|
||||
warp_size, 0));
|
||||
|
||||
if (desired_blocks > max_blocks_per_sm * num_sms) {
|
||||
std::cerr << "The requested number of blocks will not fit on the GPU";
|
||||
std::cerr << std::endl;
|
||||
std::cerr << "You requested " << desired_blocks << " but we can only ";
|
||||
std::cerr << "fit " << (max_blocks_per_sm * num_sms) << std::endl;
|
||||
failed("");
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
for (int i = 0; i < 3; i++) {
|
||||
HIPCHECK(hipStreamCreate(&streams[i]));
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* Set up data to pass into the kernel ***********************************/
|
||||
|
||||
for (int i = 0; i < 3; i++) {
|
||||
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]),
|
||||
warp_size * sizeof(long long)));
|
||||
HIPCHECK(hipMemsetAsync(dev_array[i], 0, warp_size * sizeof(long long),
|
||||
streams[i]));
|
||||
}
|
||||
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
/*************************************************************************/
|
||||
/* Launch the kernels ****************************************************/
|
||||
void *coop_params[3][2];
|
||||
for (int i = 0; i < 3; i++) {
|
||||
coop_params[i][0] = reinterpret_cast<void*>(&loops);
|
||||
coop_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
|
||||
}
|
||||
|
||||
std::cout << "Launching a single cooperative kernel..." << std::endl;
|
||||
auto single_start = std::chrono::system_clock::now();
|
||||
HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
|
||||
desired_blocks, warp_size,
|
||||
coop_params[0], 0, streams[0]));
|
||||
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
auto single_end = std::chrono::system_clock::now();
|
||||
std::cout << "Launching 2 cooperative kernels to different streams...";
|
||||
std::cout << std::endl;
|
||||
|
||||
auto double_start = std::chrono::system_clock::now();
|
||||
HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
|
||||
desired_blocks, warp_size,
|
||||
coop_params[0], 0, streams[0]));
|
||||
HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
|
||||
desired_blocks, warp_size,
|
||||
coop_params[1], 0, streams[1]));
|
||||
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
auto double_end = std::chrono::system_clock::now();
|
||||
std::cout << "Launching 2 cooperative kernels and 1 normal kernel...";
|
||||
std::cout << std::endl;
|
||||
|
||||
auto triple_start = std::chrono::system_clock::now();
|
||||
HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
|
||||
desired_blocks, warp_size,
|
||||
coop_params[0], 0, streams[0]));
|
||||
HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
|
||||
desired_blocks, warp_size,
|
||||
coop_params[1], 0, streams[1]));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size),
|
||||
0, streams[2], loops, dev_array[2]);
|
||||
err = hipGetLastError();
|
||||
hipCheckErr(err);
|
||||
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
auto triple_end = std::chrono::system_clock::now();
|
||||
std::chrono::duration<double> single_kernel_time =
|
||||
(single_end - single_start);
|
||||
std::chrono::duration<double> double_kernel_time =
|
||||
(double_end - double_start);
|
||||
std::chrono::duration<double> triple_kernel_time =
|
||||
(triple_end - triple_start);
|
||||
|
||||
std::cout << "A single kernel took:" << std::endl;
|
||||
std::cout << " " << single_kernel_time.count();
|
||||
std::cout << " seconds" << std::endl;
|
||||
std::cout << std::endl;
|
||||
std::cout << "Two cooperative kernels that could run together took:";
|
||||
std::cout << std::endl;
|
||||
std::cout << " " << double_kernel_time.count();
|
||||
std::cout << " seconds" << std::endl;
|
||||
std::cout << std::endl;
|
||||
std::cout << "Two coop kernels and a third regular kernel took:";
|
||||
std::cout << std::endl << " ";
|
||||
std::cout << triple_kernel_time.count();
|
||||
std::cout << " seconds" << std::endl;
|
||||
|
||||
std::cout << "Testing whether these times make sense.." << std::endl;
|
||||
// Test that two cooperative kernels is roughly twice as long as one
|
||||
if (double_kernel_time < 1.8 * single_kernel_time) {
|
||||
std::cerr << "ERROR!" << std::endl;
|
||||
std::cerr << "Two cooperative kernels launched at the same ";
|
||||
std::cerr << "time did not take roughly twice as long as a single ";
|
||||
std::cerr << "cooperative kernel." << std::endl;
|
||||
std::cerr << "Were they truly serialized?" << std::endl;
|
||||
FailFlag = 1;
|
||||
break;
|
||||
}
|
||||
|
||||
// Test that the three kernels together took roughly as long as two
|
||||
// cooperative kernels.
|
||||
if (triple_kernel_time > 1.1 * double_kernel_time) {
|
||||
std::cerr << "ERROR!" << std::endl;
|
||||
std::cerr << "Launching a normal kernel in parallel with two ";
|
||||
std::cerr << "back-to-back cooperative kernels still ended up taking ";
|
||||
std::cerr << "more than 10% longer than the two cooperative kernels ";
|
||||
std::cerr << "alone." << std::endl;
|
||||
std::cerr << "Is the normal kernel being serialized with the ";
|
||||
std::cerr << "cooperative kernels on different streams?" << std::endl;
|
||||
FailFlag = 1;
|
||||
break;
|
||||
}
|
||||
for (int k = 0; k < 3; ++k) {
|
||||
HIPCHECK(hipFree(dev_array[k]));
|
||||
HIPCHECK(hipStreamDestroy(streams[k]));
|
||||
}
|
||||
}
|
||||
if (FailFlag == 1) {
|
||||
for (int k = 0; k < 3; ++k) {
|
||||
HIPCHECK(hipFree(dev_array[k]));
|
||||
HIPCHECK(hipStreamDestroy(streams[k]));
|
||||
}
|
||||
failed("");
|
||||
}
|
||||
passed();
|
||||
}
|
||||
@@ -0,0 +1,303 @@
|
||||
/*
|
||||
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
// Test Description:
|
||||
/*The general idea of the application is to create a buffer of width N. N is a
|
||||
command line parameter, and the user will need to make sure that we can fit
|
||||
two buffers of N unsigned integers onto the target GPU at the same time.
|
||||
|
||||
We then launch a fixed number of warps to the GPU. This number is calculated
|
||||
to fill the GPU with as many warps as can simultaneously run on the GPU.
|
||||
The threads in these warps then walk over two arrays. First, values from
|
||||
A[offset] are added into B[offset]. After all of A is added into all of B
|
||||
in this element-wise manner, all of the waves barrier with one another.
|
||||
|
||||
After the barrier, the waves start adding values from B[mirror_offset] into
|
||||
A[offset]. Mirror offset means that the wave that is writing into A[7] is
|
||||
reading from B[7 before the last value]. This was probably written by a
|
||||
different thread before the barrier.
|
||||
|
||||
After going through this loop a certain number of times, the kernel ends and
|
||||
we read the arrays back out and recalculate this algorithm serially on the
|
||||
CPU. We compare the serial version to the version that has inter-thread data
|
||||
sharing and barriers and ensure they result in the same answer.
|
||||
|
||||
If they do have the same answer, then we can pretty confidently say that
|
||||
writing from thread X and then hitting a barrier allows thread Y to see the
|
||||
values.*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include "test_common.h"
|
||||
|
||||
// Validates the result of a HIP call and reports the call site on failure.
//
//   errval - status returned by the HIP call under test
//   file   - source file of the call site (normally __FILE__)
//   line   - source line of the call site (normally __LINE__)
//
// If errval is not hipSuccess, the error is printed to std::cerr and the
// process exits with errval as its exit status. Otherwise, if errval differs
// from what hipGetLastError() reported on entry, a diagnostic is printed and
// failed("") (defined outside this file) is invoked.
static inline void hipCheckAndFail(hipError_t errval,
                                   const char *file, int line) {
  // Read the runtime's last recorded error first, before anything else runs.
  const hipError_t sticky_error = hipGetLastError();

  if (errval != hipSuccess) {
    std::cerr << "hip error: " << hipGetErrorString(errval) << std::endl
              << " Location: " << file << ":" << line << std::endl;
    exit(errval);
  }

  if (sticky_error != errval) {
    std::cerr << "Error: the return value of a function was not the same "
              << "as the value returned by hipGetLastError()" << std::endl
              << " Location: " << file << ":" << line << std::endl
              << " Function returned: " << hipGetErrorString(errval)
              << " (" << errval << ")" << std::endl
              << "hipGetLastError() returned: "
              << hipGetErrorString(sticky_error)
              << " (" << sticky_error << ")" << std::endl;
    failed("");
  }
}

// Convenience wrapper that forwards the caller's file and line number.
#define hipCheckErr(errval)\
  do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)
|
||||
|
||||
static int cooperative_groups_support(int device_id) {
|
||||
hipError_t err;
|
||||
|
||||
int cooperative_attribute;
|
||||
HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
|
||||
hipDeviceAttributeCooperativeLaunch, device_id));
|
||||
if (!cooperative_attribute) {
|
||||
std::cerr << "Cooperative launch support not available in ";
|
||||
std::cerr << "the device attribute for device " << device_id;
|
||||
std::cerr << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
hipDeviceProp_t device_properties;
|
||||
HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
|
||||
if (device_properties.cooperativeLaunch == 0) {
|
||||
std::cerr << "Cooperative group support not available in ";
|
||||
std::cerr << "device properties." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Recomputes the kernel's algorithm serially on the host and compares the
// result with the arrays read back from the GPU.
//
//   loops        - number of add/swizzle rounds to replay
//   host_input   - initial contents of the first array; UPDATED IN PLACE
//                  with the host-computed result
//   first_array  - first array as read back from the GPU
//   second_array - second array as read back from the GPU
//   array_len    - number of elements in each of the arrays above
//
// Each round adds host_first[k] into host_second[k] for every k, then adds
// host_second[array_len - k - 1] into host_first[k] for every k.
//
// Returns 0 when both arrays match element-for-element, -1 on the first
// mismatch (reported on std::cerr) or when the scratch buffer cannot be
// allocated. The scratch buffer is released on every path.
static int verify_coop_arrays(unsigned int loops, unsigned int *host_input,
                              unsigned int *first_array,
                              unsigned int *second_array,
                              unsigned int array_len) {
  unsigned int *host_first_array = host_input;
  unsigned int *host_second_array = static_cast<unsigned int*>(
      calloc(array_len, sizeof(unsigned int)));
  // calloc may legitimately return a null pointer for a zero-length request,
  // so only a failed non-empty allocation is treated as an error.
  if (host_second_array == nullptr && array_len != 0) {
    std::cerr << "Test failure!" << std::endl;
    std::cerr << " unable to allocate the host verification buffer";
    std::cerr << std::endl;
    return -1;
  }

  for (unsigned int i = 0; i < loops; i++) {
    for (unsigned int offset = 0; offset < array_len; offset++) {
      host_second_array[offset] += host_first_array[offset];
    }

    for (unsigned int offset = 0; offset < array_len; offset++) {
      unsigned int swizzle_offset = array_len - offset - 1;
      host_first_array[offset] += host_second_array[swizzle_offset];
    }
  }

  int status = 0;
  for (unsigned int i = 0; i < array_len; i++) {
    if (host_first_array[i] != first_array[i]) {
      std::cerr << "Test failure!" << std::endl;
      std::cerr << " host_first_array[" << i << "] contains the ";
      std::cerr << "value " << host_first_array[i] << std::endl;
      std::cerr << " GPU first_array[" << i << "] contains the ";
      std::cerr << "value " << first_array[i] << std::endl;
      status = -1;
      break;
    }
    if (host_second_array[i] != second_array[i]) {
      std::cerr << "Test failure!" << std::endl;
      std::cerr << " host_second_array[" << i << "] contains the ";
      std::cerr << "value " << host_second_array[i] << std::endl;
      std::cerr << " GPU second_array[" << i << "] contains the ";
      std::cerr << "value " << second_array[i] << std::endl;
      status = -1;
      break;
    }
  }

  if (status == 0) {
    std::cout << "Coop test appears to work properly!" << std::endl;
  }
  // Single release point, so the failure paths no longer leak the buffer.
  free(host_second_array);
  return status;
}
|
||||
|
||||
// Cooperative kernel that exchanges data between threads across grid-wide
// barriers.
//
//   first_array  - device array of array_len elements
//   second_array - device array of array_len elements
//   loops        - number of add/swizzle rounds to perform
//   array_len    - number of elements in each array
//
// Each thread starts at its grid-wide rank and strides by the grid size.
// Every round has two phases, each followed by grid.sync():
//   1. second_array[k] += first_array[k]
//   2. first_array[k]  += second_array[array_len - k - 1]
__global__ void
coop_kernel(unsigned int *first_array, unsigned int *second_array,
            unsigned int loops, unsigned int array_len) {
  cooperative_groups::grid_group grid = cooperative_groups::this_grid();
  unsigned int rank = grid.thread_rank();
  unsigned int grid_size = grid.size();

  // Counters and offsets are unsigned so they match the types of `loops`,
  // `array_len` and `grid_size` and avoid mixed-sign comparisons.
  for (unsigned int i = 0; i < loops; i++) {
    // The goal of this loop is to directly add in values from
    // array one into array two, on a per-wave basis.
    for (unsigned int offset = rank; offset < array_len;
         offset += grid_size) {
      second_array[offset] += first_array[offset];
    }

    grid.sync();

    // The goal of this loop is to pull data from the "mirror" lane in
    // array two and add it back into array one. This causes inter-
    // thread swizzling.
    for (unsigned int offset = rank; offset < array_len;
         offset += grid_size) {
      unsigned int swizzle_offset = array_len - offset - 1;
      first_array[offset] += second_array[swizzle_offset];
    }

    grid.sync();
  }
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
hipError_t err;
|
||||
/*************************************************************************/
|
||||
/* Parse the command line parameters *************************************/
|
||||
// Arguments to pull out of the command line.
|
||||
int device_num = 0, loops = 2, width = 4096, flag = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&device_num));
|
||||
for (int dev = 0; dev < device_num; ++dev) {
|
||||
std::cout << "Device number: " << dev << std::endl;
|
||||
std::cout << "Loops: " << loops << std::endl;
|
||||
std::cout << "Width: " << width << std::endl;
|
||||
|
||||
/*************************************************************************/
|
||||
/* Test whether target device supports cooperative groups ****************/
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
|
||||
if (!cooperative_groups_support(dev)) {
|
||||
std::cout << "Skipping the test with Pass result.\n";
|
||||
passed();
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* We will launch enough waves to fill up all of the GPU *****************/
|
||||
hipDeviceProp_t device_properties;
|
||||
HIPCHECK(hipGetDeviceProperties(&device_properties, dev));
|
||||
|
||||
int warp_size = device_properties.warpSize;
|
||||
int num_sms = device_properties.multiProcessorCount;
|
||||
|
||||
std::cout << "Device name: " << device_properties.name << std::endl;
|
||||
std::cout << std::endl;
|
||||
|
||||
// Calculate the device occupancy to know how many blocks can be run.
|
||||
int max_blocks_per_sm;
|
||||
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
|
||||
coop_kernel,
|
||||
warp_size, 0));
|
||||
|
||||
int total_blocks = max_blocks_per_sm * num_sms;
|
||||
|
||||
/*************************************************************************/
|
||||
/* Create the streams we will use in this test. **************************/
|
||||
hipStream_t streams[2];
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipStreamCreate(&streams[i]));
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* Set up data to pass into the kernel ***********************************/
|
||||
|
||||
// Alocate the host input buffer, and two device-focused buffers that we
|
||||
// will use for our test.
|
||||
unsigned int *input_buffer = (unsigned int*)calloc(width,
|
||||
sizeof(unsigned int));
|
||||
for (int i = 0; i < width; i++) {
|
||||
input_buffer[i] = i;
|
||||
}
|
||||
|
||||
unsigned int *first_dev_array;
|
||||
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&first_dev_array),
|
||||
width * sizeof(unsigned int)));
|
||||
|
||||
HIPCHECK(hipMemcpyAsync(first_dev_array, input_buffer,
|
||||
width * sizeof(unsigned int),
|
||||
hipMemcpyHostToDevice, streams[0]));
|
||||
|
||||
unsigned int *second_dev_array;
|
||||
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&second_dev_array),
|
||||
width * sizeof(unsigned int)));
|
||||
HIPCHECK(hipMemsetAsync(second_dev_array, 0, width * sizeof(unsigned int),
|
||||
streams[0]));
|
||||
|
||||
/*************************************************************************/
|
||||
/* Launch the kernels ****************************************************/
|
||||
std::cout << "Launching a cooperative kernel with " << total_blocks;
|
||||
std::cout << " thread blocks, each with " << warp_size << " threads";
|
||||
std::cout << std::endl;
|
||||
|
||||
void *coop_params[4];
|
||||
coop_params[0] = reinterpret_cast<void*>(&first_dev_array);
|
||||
coop_params[1] = reinterpret_cast<void*>(&second_dev_array);
|
||||
coop_params[2] = reinterpret_cast<void*>(&loops);
|
||||
coop_params[3] = reinterpret_cast<void*>(&width);
|
||||
HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(coop_kernel),
|
||||
total_blocks, warp_size, coop_params,
|
||||
0, streams[0]));
|
||||
|
||||
/*************************************************************************/
|
||||
/* Read back the buffers and print out their data ************************/
|
||||
unsigned int *first_array = (unsigned int*)calloc(width,
|
||||
sizeof(unsigned int));
|
||||
unsigned int *second_array = (unsigned int*)calloc(width,
|
||||
sizeof(unsigned int));
|
||||
HIPCHECK(hipMemcpyAsync(first_array, first_dev_array,
|
||||
width * sizeof(unsigned int),
|
||||
hipMemcpyDeviceToHost, streams[0]));
|
||||
|
||||
HIPCHECK(hipMemcpyAsync(second_array, second_dev_array,
|
||||
width * sizeof(unsigned int),
|
||||
hipMemcpyDeviceToHost, streams[0]));
|
||||
|
||||
std::cout << "Waiting for cooperative work to finish..." << std::endl;
|
||||
std::cout << std::flush;
|
||||
|
||||
HIPCHECK(hipStreamSynchronize(streams[0]));
|
||||
|
||||
|
||||
int ret_val = 0;
|
||||
|
||||
std::cout << "Attemping to verify buffers." << std::endl;
|
||||
std::cout << std::flush;
|
||||
ret_val = verify_coop_arrays(loops, input_buffer, first_array,
|
||||
second_array, width);
|
||||
if (!ret_val) {
|
||||
std::cout << "It appears that inter-thread data sharing at ";
|
||||
std::cout << "grid_group sync points works properly!" << std::endl;
|
||||
} else {
|
||||
flag = 1;
|
||||
}
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
HIPCHECK(hipStreamDestroy(streams[k]));
|
||||
}
|
||||
HIPCHECK(hipFree(first_dev_array));
|
||||
HIPCHECK(hipFree(second_dev_array));
|
||||
free(input_buffer);
|
||||
free(first_array);
|
||||
free(second_array);
|
||||
}
|
||||
if (!flag) {
|
||||
passed();
|
||||
} else {
|
||||
failed("");
|
||||
}
|
||||
}
|
||||
Обычный файл → Исполняемый файл
+6
-2
@@ -22,7 +22,7 @@ THE SOFTWARE.
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
@@ -139,7 +139,11 @@ int main()
|
||||
|
||||
if (!deviceProperties.cooperativeLaunch) {
|
||||
std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
|
||||
passed();
|
||||
if (hip_skip_tests_enabled()) {
|
||||
return hip_skip_retcode();
|
||||
} else {
|
||||
passed();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
Обычный файл → Исполняемый файл
+6
-2
@@ -22,7 +22,7 @@ THE SOFTWARE.
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
@@ -139,7 +139,11 @@ int main()
|
||||
|
||||
if (!deviceProperties.cooperativeLaunch) {
|
||||
std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
|
||||
passed();
|
||||
if (hip_skip_tests_enabled()) {
|
||||
return hip_skip_retcode();
|
||||
} else {
|
||||
passed();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
Обычный файл → Исполняемый файл
+6
-2
@@ -22,7 +22,7 @@ THE SOFTWARE.
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
@@ -139,7 +139,11 @@ int main()
|
||||
|
||||
if (!deviceProperties.cooperativeLaunch) {
|
||||
std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
|
||||
passed();
|
||||
if (hip_skip_tests_enabled()) {
|
||||
return hip_skip_retcode();
|
||||
} else {
|
||||
passed();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
Обычный файл → Исполняемый файл
+18
-4
@@ -22,7 +22,7 @@ THE SOFTWARE.
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
@@ -34,6 +34,8 @@ THE SOFTWARE.
|
||||
#include <climits>
|
||||
|
||||
#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
|
||||
#define ASSERT_LE(lhs, rhs) assert(lhs <= rhs)
|
||||
#define ASSERT_GE(lhs, rhs) assert(lhs >= rhs)
|
||||
|
||||
using namespace cooperative_groups;
|
||||
|
||||
@@ -193,15 +195,27 @@ static void test_cg_multi_grid_group_type(int blockSize)
|
||||
}
|
||||
|
||||
// Validate results
|
||||
int gridsSeen[MaxGPUs];
|
||||
for (int i = 0; i < nGpu; ++i) {
|
||||
for (int j = 0; j < 2 * blockSize; ++j) {
|
||||
//ASSERT_EQUAL(numGridsTestH[i][j], nGpu);
|
||||
//ASSERT_EQUAL(gridRankTestH[i][j], i);
|
||||
ASSERT_EQUAL(numGridsTestH[i][j], nGpu);
|
||||
ASSERT_GE(gridRankTestH[i][j], 0);
|
||||
ASSERT_LE(gridRankTestH[i][j], nGpu-1);
|
||||
ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
|
||||
ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
|
||||
ASSERT_EQUAL(thdRankTestH[i][j], (i * 2 * blockSize) + j);
|
||||
int gridRank = gridRankTestH[i][j];
|
||||
ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
|
||||
ASSERT_EQUAL(isValidTestH[i][j], 1);
|
||||
}
|
||||
ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize);
|
||||
|
||||
// Validate uniqueness property of grid rank
|
||||
gridsSeen[i] = gridRankTestH[i][0];
|
||||
for (int k = 0; k < i; ++k) {
|
||||
if (gridsSeen[k] == gridsSeen[i]) {
|
||||
assert (false && "Grid rank in multi-gpu setup should be unique");
|
||||
}
|
||||
}
|
||||
}
|
||||
ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
|
||||
|
||||
+32
-7
@@ -22,7 +22,7 @@ THE SOFTWARE.
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
@@ -34,11 +34,14 @@ THE SOFTWARE.
|
||||
#include <climits>
|
||||
|
||||
#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
|
||||
#define ASSERT_LE(lhs, rhs) assert(lhs <= rhs)
|
||||
#define ASSERT_GE(lhs, rhs) assert(lhs >= rhs)
|
||||
|
||||
using namespace cooperative_groups;
|
||||
|
||||
static __global__
|
||||
void kernel_cg_multi_grid_group_type_via_base_type(int *sizeTestD,
|
||||
int* gridRankTestD,
|
||||
int *thdRankTestD,
|
||||
int *isValidTestD,
|
||||
int *syncTestD,
|
||||
@@ -51,6 +54,7 @@ void kernel_cg_multi_grid_group_type_via_base_type(int *sizeTestD,
|
||||
sizeTestD[gIdx] = tg.size();
|
||||
|
||||
// Test thread_rank
|
||||
gridRankTestD[gIdx] = this_multi_grid().grid_rank();
|
||||
thdRankTestD[gIdx] = tg.thread_rank();
|
||||
|
||||
// Test is_valid
|
||||
@@ -110,6 +114,7 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
|
||||
// Allocate host and device memory
|
||||
int nBytes = sizeof(int) * 2 * blockSize;
|
||||
int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
|
||||
int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
|
||||
int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
|
||||
int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
|
||||
int *syncTestD[MaxGPUs], *syncResultD;
|
||||
@@ -117,11 +122,13 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
|
||||
ASSERT_EQUAL(hipSetDevice(i), hipSuccess);
|
||||
|
||||
ASSERT_EQUAL(hipMalloc(&sizeTestD[i], nBytes), hipSuccess);
|
||||
ASSERT_EQUAL(hipMalloc(&gridRankTestD[i], nBytes), hipSuccess);
|
||||
ASSERT_EQUAL(hipMalloc(&thdRankTestD[i], nBytes), hipSuccess);
|
||||
ASSERT_EQUAL(hipMalloc(&isValidTestD[i], nBytes), hipSuccess);
|
||||
ASSERT_EQUAL(hipMalloc(&syncTestD[i], nBytes), hipSuccess);
|
||||
|
||||
ASSERT_EQUAL(hipHostMalloc(&sizeTestH[i], nBytes), hipSuccess);
|
||||
ASSERT_EQUAL(hipHostMalloc(&gridRankTestH[i], nBytes), hipSuccess);
|
||||
ASSERT_EQUAL(hipHostMalloc(&thdRankTestH[i], nBytes), hipSuccess);
|
||||
ASSERT_EQUAL(hipHostMalloc(&isValidTestH[i], nBytes), hipSuccess);
|
||||
|
||||
@@ -135,17 +142,18 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
|
||||
}
|
||||
|
||||
// Launch Kernel
|
||||
constexpr int NumKernelArgs = 5;
|
||||
constexpr int NumKernelArgs = 6;
|
||||
hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
|
||||
void* args[MaxGPUs * NumKernelArgs];
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
ASSERT_EQUAL(hipSetDevice(i), hipSuccess);
|
||||
|
||||
args[i * NumKernelArgs ] = &sizeTestD[i];
|
||||
args[i * NumKernelArgs + 1] = &thdRankTestD[i];
|
||||
args[i * NumKernelArgs + 2] = &isValidTestD[i];
|
||||
args[i * NumKernelArgs + 3] = &syncTestD[i];
|
||||
args[i * NumKernelArgs + 4] = &syncResultD;
|
||||
args[i * NumKernelArgs + 1] = &gridRankTestD[i];
|
||||
args[i * NumKernelArgs + 2] = &thdRankTestD[i];
|
||||
args[i * NumKernelArgs + 3] = &isValidTestD[i];
|
||||
args[i * NumKernelArgs + 4] = &syncTestD[i];
|
||||
args[i * NumKernelArgs + 5] = &syncResultD;
|
||||
|
||||
launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_base_type);
|
||||
launchParamsList[i].gridDim = 2;
|
||||
@@ -164,6 +172,8 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
|
||||
|
||||
ASSERT_EQUAL(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost),
|
||||
hipSuccess);
|
||||
ASSERT_EQUAL(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost),
|
||||
hipSuccess);
|
||||
ASSERT_EQUAL(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost),
|
||||
hipSuccess);
|
||||
ASSERT_EQUAL(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost),
|
||||
@@ -173,13 +183,26 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
|
||||
}
|
||||
|
||||
// Validate results
|
||||
int gridsSeen[MaxGPUs];
|
||||
for (int i = 0; i < nGpu; ++i) {
|
||||
for (int j = 0; j < 2 * blockSize; ++j) {
|
||||
ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
|
||||
ASSERT_EQUAL(thdRankTestH[i][j], (i * 2 * blockSize) + j);
|
||||
ASSERT_GE(gridRankTestH[i][j], 0);
|
||||
ASSERT_LE(gridRankTestH[i][j], nGpu-1);
|
||||
ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
|
||||
int gridRank = gridRankTestH[i][j];
|
||||
ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
|
||||
ASSERT_EQUAL(isValidTestH[i][j], 1);
|
||||
}
|
||||
ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize);
|
||||
|
||||
// Validate uniqueness property of grid rank
|
||||
gridsSeen[i] = gridRankTestH[i][0];
|
||||
for (int k = 0; k < i; ++k) {
|
||||
if (gridsSeen[k] == gridsSeen[i]) {
|
||||
assert (false && "Grid rank in multi-gpu setup should be unique");
|
||||
}
|
||||
}
|
||||
}
|
||||
ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
|
||||
|
||||
@@ -189,6 +212,7 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
|
||||
ASSERT_EQUAL(hipSetDevice(i), hipSuccess);
|
||||
|
||||
ASSERT_EQUAL(hipFree(sizeTestD[i]), hipSuccess);
|
||||
ASSERT_EQUAL(hipFree(gridRankTestD[i]), hipSuccess);
|
||||
ASSERT_EQUAL(hipFree(thdRankTestD[i]), hipSuccess);
|
||||
ASSERT_EQUAL(hipFree(isValidTestD[i]), hipSuccess);
|
||||
ASSERT_EQUAL(hipFree(syncTestD[i]), hipSuccess);
|
||||
@@ -197,6 +221,7 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
|
||||
ASSERT_EQUAL(hipFree(syncResultD), hipSuccess);
|
||||
|
||||
ASSERT_EQUAL(hipHostFree(sizeTestH[i]), hipSuccess);
|
||||
ASSERT_EQUAL(hipHostFree(gridRankTestH[i]), hipSuccess);
|
||||
ASSERT_EQUAL(hipHostFree(thdRankTestH[i]), hipSuccess);
|
||||
ASSERT_EQUAL(hipHostFree(isValidTestH[i]), hipSuccess);
|
||||
|
||||
+32
-7
@@ -22,7 +22,7 @@ THE SOFTWARE.
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
@@ -34,11 +34,14 @@ THE SOFTWARE.
|
||||
#include <climits>
|
||||
|
||||
#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
|
||||
#define ASSERT_LE(lhs, rhs) assert(lhs <= rhs)
|
||||
#define ASSERT_GE(lhs, rhs) assert(lhs >= rhs)
|
||||
|
||||
using namespace cooperative_groups;
|
||||
|
||||
static __global__
|
||||
void kernel_cg_multi_grid_group_type_via_public_api(int *sizeTestD,
|
||||
int* gridRankTestD,
|
||||
int *thdRankTestD,
|
||||
int *isValidTestD,
|
||||
int *syncTestD,
|
||||
@@ -51,6 +54,7 @@ void kernel_cg_multi_grid_group_type_via_public_api(int *sizeTestD,
|
||||
sizeTestD[gIdx] = group_size(mg);
|
||||
|
||||
// Test thread_rank api
|
||||
gridRankTestD[gIdx] = this_multi_grid().grid_rank();
|
||||
thdRankTestD[gIdx] = thread_rank(mg);
|
||||
|
||||
// Test is_valid api
|
||||
@@ -110,6 +114,7 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
|
||||
// Allocate host and device memory
|
||||
int nBytes = sizeof(int) * 2 * blockSize;
|
||||
int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
|
||||
int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
|
||||
int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
|
||||
int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
|
||||
int *syncTestD[MaxGPUs], *syncResultD;
|
||||
@@ -117,11 +122,13 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
|
||||
ASSERT_EQUAL(hipSetDevice(i), hipSuccess);
|
||||
|
||||
ASSERT_EQUAL(hipMalloc(&sizeTestD[i], nBytes), hipSuccess);
|
||||
ASSERT_EQUAL(hipMalloc(&gridRankTestD[i], nBytes), hipSuccess);
|
||||
ASSERT_EQUAL(hipMalloc(&thdRankTestD[i], nBytes), hipSuccess);
|
||||
ASSERT_EQUAL(hipMalloc(&isValidTestD[i], nBytes), hipSuccess);
|
||||
ASSERT_EQUAL(hipMalloc(&syncTestD[i], nBytes), hipSuccess);
|
||||
|
||||
ASSERT_EQUAL(hipHostMalloc(&sizeTestH[i], nBytes), hipSuccess);
|
||||
ASSERT_EQUAL(hipHostMalloc(&gridRankTestH[i], nBytes), hipSuccess);
|
||||
ASSERT_EQUAL(hipHostMalloc(&thdRankTestH[i], nBytes), hipSuccess);
|
||||
ASSERT_EQUAL(hipHostMalloc(&isValidTestH[i], nBytes), hipSuccess);
|
||||
|
||||
@@ -135,17 +142,18 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
|
||||
}
|
||||
|
||||
// Launch Kernel
|
||||
constexpr int NumKernelArgs = 5;
|
||||
constexpr int NumKernelArgs = 6;
|
||||
hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
|
||||
void* args[MaxGPUs * NumKernelArgs];
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
ASSERT_EQUAL(hipSetDevice(i), hipSuccess);
|
||||
|
||||
args[i * NumKernelArgs ] = &sizeTestD[i];
|
||||
args[i * NumKernelArgs + 1] = &thdRankTestD[i];
|
||||
args[i * NumKernelArgs + 2] = &isValidTestD[i];
|
||||
args[i * NumKernelArgs + 3] = &syncTestD[i];
|
||||
args[i * NumKernelArgs + 4] = &syncResultD;
|
||||
args[i * NumKernelArgs + 1] = &gridRankTestD[i];
|
||||
args[i * NumKernelArgs + 2] = &thdRankTestD[i];
|
||||
args[i * NumKernelArgs + 3] = &isValidTestD[i];
|
||||
args[i * NumKernelArgs + 4] = &syncTestD[i];
|
||||
args[i * NumKernelArgs + 5] = &syncResultD;
|
||||
|
||||
launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_public_api);
|
||||
launchParamsList[i].gridDim = 2;
|
||||
@@ -164,6 +172,8 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
|
||||
|
||||
ASSERT_EQUAL(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost),
|
||||
hipSuccess);
|
||||
ASSERT_EQUAL(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost),
|
||||
hipSuccess);
|
||||
ASSERT_EQUAL(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost),
|
||||
hipSuccess);
|
||||
ASSERT_EQUAL(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost),
|
||||
@@ -173,13 +183,26 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
|
||||
}
|
||||
|
||||
// Validate results
|
||||
int gridsSeen[MaxGPUs];
|
||||
for (int i = 0; i < nGpu; ++i) {
|
||||
for (int j = 0; j < 2 * blockSize; ++j) {
|
||||
ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
|
||||
ASSERT_EQUAL(thdRankTestH[i][j], (i * 2 * blockSize) + j);
|
||||
ASSERT_GE(gridRankTestH[i][j], 0);
|
||||
ASSERT_LE(gridRankTestH[i][j], nGpu-1);
|
||||
ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
|
||||
int gridRank = gridRankTestH[i][j];
|
||||
ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
|
||||
ASSERT_EQUAL(isValidTestH[i][j], 1);
|
||||
}
|
||||
ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize);
|
||||
|
||||
// Validate uniqueness property of grid rank
|
||||
gridsSeen[i] = gridRankTestH[i][0];
|
||||
for (int k = 0; k < i; ++k) {
|
||||
if (gridsSeen[k] == gridsSeen[i]) {
|
||||
assert (false && "Grid rank in multi-gpu setup should be unique");
|
||||
}
|
||||
}
|
||||
}
|
||||
ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
|
||||
|
||||
@@ -189,6 +212,7 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
|
||||
ASSERT_EQUAL(hipSetDevice(i), hipSuccess);
|
||||
|
||||
ASSERT_EQUAL(hipFree(sizeTestD[i]), hipSuccess);
|
||||
ASSERT_EQUAL(hipFree(gridRankTestD[i]), hipSuccess);
|
||||
ASSERT_EQUAL(hipFree(thdRankTestD[i]), hipSuccess);
|
||||
ASSERT_EQUAL(hipFree(isValidTestD[i]), hipSuccess);
|
||||
ASSERT_EQUAL(hipFree(syncTestD[i]), hipSuccess);
|
||||
@@ -197,6 +221,7 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
|
||||
ASSERT_EQUAL(hipFree(syncResultD), hipSuccess);
|
||||
|
||||
ASSERT_EQUAL(hipHostFree(sizeTestH[i]), hipSuccess);
|
||||
ASSERT_EQUAL(hipHostFree(gridRankTestH[i]), hipSuccess);
|
||||
ASSERT_EQUAL(hipHostFree(thdRankTestH[i]), hipSuccess);
|
||||
ASSERT_EQUAL(hipHostFree(isValidTestH[i]), hipSuccess);
|
||||
|
||||
Обычный файл → Исполняемый файл
+11
-1
@@ -22,7 +22,7 @@ THE SOFTWARE.
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
@@ -166,6 +166,16 @@ int main()
|
||||
ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
|
||||
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
|
||||
|
||||
if (!deviceProperties.cooperativeLaunch) {
|
||||
std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
|
||||
if (hip_skip_tests_enabled()) {
|
||||
return hip_skip_retcode();
|
||||
} else {
|
||||
passed();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Test block sizes which are powers of 2
|
||||
int i = 0;
|
||||
while (true) {
|
||||
Обычный файл → Исполняемый файл
+11
-1
@@ -22,7 +22,7 @@ THE SOFTWARE.
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
@@ -135,6 +135,16 @@ int main()
|
||||
ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
|
||||
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
|
||||
|
||||
if (!deviceProperties.cooperativeLaunch) {
|
||||
std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
|
||||
if (hip_skip_tests_enabled()) {
|
||||
return hip_skip_retcode();
|
||||
} else {
|
||||
passed();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Test block sizes which are powers of 2
|
||||
int i = 0;
|
||||
while (true) {
|
||||
Обычный файл → Исполняемый файл
+11
-1
@@ -22,7 +22,7 @@ THE SOFTWARE.
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
@@ -135,6 +135,16 @@ int main()
|
||||
ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
|
||||
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
|
||||
|
||||
if (!deviceProperties.cooperativeLaunch) {
|
||||
std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
|
||||
if (hip_skip_tests_enabled()) {
|
||||
return hip_skip_retcode();
|
||||
} else {
|
||||
passed();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Test block sizes which are powers of 2
|
||||
int i = 0;
|
||||
while (true) {
|
||||
+1
-1
@@ -20,7 +20,7 @@ THE SOFTWARE.
|
||||
// Simple test for hipLaunchCooperativeKernelMultiDevice API.
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
+3
-4
@@ -22,15 +22,14 @@ THE SOFTWARE.
|
||||
// Simple test for hipLaunchCooperativeKernel API.
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip/hip_runtime_api.h"
|
||||
#include "hip/hcc_detail/device_library_decls.h"
|
||||
#include "hip/hcc_detail/hip_cooperative_groups.h"
|
||||
#include "hip/hip_cooperative_groups.h"
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include "test_common.h"
|
||||
@@ -129,7 +128,7 @@ int main() {
|
||||
params[3] = (void*)&dC;
|
||||
|
||||
std::cout << "Testing with grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n";
|
||||
HIPCHECK(hipLaunchCooperativeKernel(test_gws, dimGrid, dimBlock, params, dimBlock.x * sizeof(long), stream));
|
||||
HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_gws), dimGrid, dimBlock, params, dimBlock.x * sizeof(long), stream));
|
||||
|
||||
HIPCHECK(hipMemcpy(init, dC, sizeof(long), hipMemcpyDeviceToHost));
|
||||
|
||||
+568
@@ -0,0 +1,568 @@
|
||||
/*
|
||||
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
// Test Description:
|
||||
/*The general idea of the application is to test how Cooperative Groups kernel
|
||||
launches work when launching too many warps to multiple target devices. This
|
||||
tests the following failure modes for hipLaunchCooperativeKernelMultiDevice:
|
||||
1) Do not launch more warps to any device than can fit on that device
|
||||
2) All device targets for the multi-device launch function must be different
|
||||
3) All streams must be explicit (non-NULL)
|
||||
4) The kernels sent in must be identical between devices
|
||||
5) The grid and block sizes must be identical between devices
|
||||
6) The block dimensions must be non-zero
|
||||
7) The dynamic shared memory size must be identical between devices.
|
||||
|
||||
This test ensures that the proper error conditions are returned, even if the
|
||||
target kernel does not actually use any of the cooperative groups features.
|
||||
|
||||
Note that tests 4, 5, and 7 only hold on Nvidia GPUs. AMD GPUs running ROCm
|
||||
do not have these constraints. As such, the test checks to see whether they
|
||||
should fail or succeed and compares this to what actually happens.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include "test_common.h"
|
||||
|
||||
// Validates the outcome of a HIP call.
//   errval - value returned by the HIP call under test
//   file   - source file of the call site (normally __FILE__)
//   line   - source line of the call site (normally __LINE__)
// Reports to std::cerr and invokes failed("") when errval is not hipSuccess,
// and again when errval disagrees with what hipGetLastError() reports.
static inline void hipCheckAndFail(hipError_t errval,
                                   const char *file, int line) {
  // Query hipGetLastError() up front, before any other HIP call is made here.
  const hipError_t reported = hipGetLastError();

  if (hipSuccess != errval) {
    std::cerr << "hip error: " << hipGetErrorString(errval) << std::endl;
    std::cerr << " Location: " << file << ":" << line << std::endl;
    failed("");
  }

  if (errval != reported) {
    std::cerr << "Error: the return value of a function was not the same "
              << "as the value returned by hipGetLastError()" << std::endl;
    std::cerr << " Location: " << file << ":" << line << std::endl;
    std::cerr << " Function returned: " << hipGetErrorString(errval)
              << " (" << errval << ")" << std::endl;
    std::cerr << "hipGetLastError() returned: " << hipGetErrorString(reported)
              << " (" << reported << ")" << std::endl;
    failed("");
  }
}
|
||||
#define hipCheckErr(errval) \
|
||||
do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)
|
||||
|
||||
// Reports whether device `device_id` advertises cooperative launch support.
// Four indicators are checked: the CooperativeLaunch and
// CooperativeMultiDeviceLaunch device attributes, and the matching
// cooperativeLaunch / cooperativeMultiDeviceLaunch device-property fields.
// Returns 1 when all four are non-zero; otherwise prints the reason to
// std::cerr and returns 0.
static int cooperative_groups_support(int device_id) {
  // Single-device cooperative launch, as seen through the attribute query.
  int cooperative_attribute;
  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
                                 hipDeviceAttributeCooperativeLaunch,
                                 device_id));
  if (!cooperative_attribute) {
    std::cerr << "Cooperative launch support not available in ";
    std::cerr << "the device attribute for device " << device_id;
    std::cerr << std::endl;
    return 0;
  }

  // Multi-device cooperative launch, as seen through the attribute query.
  int multi_gpu_cooperative_attribute;
  HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute,
                                 hipDeviceAttributeCooperativeMultiDeviceLaunch,
                                 device_id));
  if (!multi_gpu_cooperative_attribute) {
    std::cerr << "Multi-GPU cooperative launch support not available in ";
    std::cerr << "the device attribute for device " << device_id;
    std::cerr << std::endl;
    return 0;
  }

  // The same two capabilities, cross-checked through the device properties.
  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  if (device_properties.cooperativeLaunch == 0) {
    std::cerr << "Cooperative group support not available in ";
    std::cerr << "device properties." << std::endl;
    return 0;
  }
  if (device_properties.cooperativeMultiDeviceLaunch == 0) {
    std::cerr << "Multi-GPU cooperative group support not available in ";
    std::cerr << "device properties." << std::endl;
    return 0;
  }
  return 1;
}
|
||||
|
||||
// Returns 1 when device `device_id` reports, through both the
// CooperativeMultiDeviceUnmatchedFunc attribute and the matching
// device-property field, that a multi-device cooperative launch may use a
// different kernel per device; returns 0 otherwise.
static int support_for_separate_kernels(int device_id) {
  int separate_kernel_supported;
  HIPCHECK(hipDeviceGetAttribute(
      &separate_kernel_supported,
      hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc,
      device_id));
  if (!separate_kernel_supported) {
    return 0;
  }

  // Cross-check the attribute against the device-properties structure.
  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  if (device_properties.cooperativeMultiDeviceUnmatchedFunc == 0) {
    return 0;
  }
  return 1;
}
|
||||
|
||||
// Returns 1 when device `device_id` reports, through both the
// CooperativeMultiDeviceUnmatchedGridDim attribute and the matching
// device-property field, that a multi-device cooperative launch may use a
// different grid size per device; returns 0 otherwise.
static int support_for_separate_grid_sizes(int device_id) {
  int separate_sizes_supported;
  HIPCHECK(hipDeviceGetAttribute(
      &separate_sizes_supported,
      hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim,
      device_id));
  if (!separate_sizes_supported) {
    return 0;
  }

  // Cross-check the attribute against the device-properties structure.
  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  if (device_properties.cooperativeMultiDeviceUnmatchedGridDim == 0) {
    return 0;
  }
  return 1;
}
|
||||
|
||||
// Returns 1 when device `device_id` reports, through both the
// CooperativeMultiDeviceUnmatchedBlockDim attribute and the matching
// device-property field, that a multi-device cooperative launch may use
// different block dimensions per device; returns 0 otherwise.
static int support_for_separate_block_dims(int device_id) {
  int separate_sizes_supported;
  HIPCHECK(hipDeviceGetAttribute(
      &separate_sizes_supported,
      hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim,
      device_id));
  if (!separate_sizes_supported) {
    return 0;
  }

  // Cross-check the attribute against the device-properties structure.
  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  if (device_properties.cooperativeMultiDeviceUnmatchedBlockDim == 0) {
    return 0;
  }
  return 1;
}
|
||||
|
||||
// Returns 1 when device `device_id` reports, through both the
// CooperativeMultiDeviceUnmatchedSharedMem attribute and the matching
// device-property field, that a multi-device cooperative launch may use a
// different dynamic shared memory size per device; returns 0 otherwise.
static int support_for_separate_shared_sizes(int device_id) {
  int separate_sizes_supported;
  HIPCHECK(hipDeviceGetAttribute(
      &separate_sizes_supported,
      hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem,
      device_id));
  if (!separate_sizes_supported) {
    return 0;
  }

  // Cross-check the attribute against the device-properties structure.
  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  if (device_properties.cooperativeMultiDeviceUnmatchedSharedMem == 0) {
    return 0;
  }
  return 1;
}
|
||||
|
||||
// Trivial kernel used as the launch payload: each thread adds the current
// clock64() value to the array element selected by its flattened x index.
__global__ void test_kernel(long long *array) {
  const unsigned int slot = threadIdx.x + blockDim.x * blockIdx.x;
  array[slot] = array[slot] + clock64();
}
|
||||
|
||||
// Same body as test_kernel under a different symbol, so the test can pass
// two distinct kernel functions to a single multi-device launch.
__global__ void second_test_kernel(long long *array) {
  const unsigned int slot = threadIdx.x + blockDim.x * blockIdx.x;
  array[slot] = array[slot] + clock64();
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
hipError_t err;
|
||||
/*************************************************************************/
|
||||
/* Parse the command line parameters *************************************/
|
||||
// Arguments to pull out of the command line.
|
||||
int device_num, FailFlag = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&device_num));
|
||||
if (device_num < 2) {
|
||||
std::cout << "This test requires atleast two gpus but the system has ";
|
||||
std::cout << " only "<< device_num <<std::endl;
|
||||
std::cout << "The test is skipping with Pass result" << std::endl;
|
||||
passed();
|
||||
}
|
||||
for (int dev = 0; dev < (device_num-1); ++dev) {
|
||||
std::cout << "First device number: " << dev << std::endl;
|
||||
std::cout << "Second device number: " << (dev + 1) << std::endl;
|
||||
|
||||
/*************************************************************************/
|
||||
/* Test whether target devices support cooperative groups ****************/
|
||||
for (int i = 0; i < 2; i++) {
|
||||
if (!cooperative_groups_support((dev + i))) {
|
||||
std::cout << "Skipping the test with Pass result.\n";
|
||||
passed();
|
||||
}
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* We will try to launch more waves than the GPUs can fit. ***************/
|
||||
int warp_sizes[2];
|
||||
int num_sms[2];
|
||||
hipDeviceProp_t device_properties[2];
|
||||
int warp_size = INT_MAX;
|
||||
int num_sm = INT_MAX;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipGetDeviceProperties(&device_properties[i], (dev + i)));
|
||||
warp_sizes[i] = device_properties[i].warpSize;
|
||||
if (warp_sizes[i] < warp_size) {
|
||||
warp_size = warp_sizes[i];
|
||||
}
|
||||
num_sms[i] = device_properties[i].multiProcessorCount;
|
||||
if (num_sms[i] < num_sm) {
|
||||
num_sm = num_sms[i];
|
||||
}
|
||||
std::cout << "Device " << (dev + i);
|
||||
std::cout << " name: " << device_properties[i].name << std::endl;
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
// Calculate the device occupancy to know how many blocks can be run.
|
||||
int max_blocks_per_sm_arr[2];
|
||||
int max_blocks_per_sm = INT_MAX;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipSetDevice((dev + i)));
|
||||
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_blocks_per_sm_arr[i], test_kernel, warp_size, 0));
|
||||
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
|
||||
max_blocks_per_sm = max_blocks_per_sm_arr[i];
|
||||
}
|
||||
}
|
||||
|
||||
int desired_blocks = max_blocks_per_sm * num_sm;
|
||||
|
||||
/*************************************************************************/
|
||||
/* Create the streams we will use in this test. **************************/
|
||||
hipStream_t streams[2];
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipSetDevice((dev + i)));
|
||||
HIPCHECK(hipStreamCreate(&streams[i]));
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* Set up data to pass into the kernel ***********************************/
|
||||
|
||||
// Alocate the host input buffer, and two device-focused buffers per GPU
|
||||
// that we will use for our test.
|
||||
unsigned int *good_dev_array[2];
|
||||
unsigned int *bad_dev_array[2];
|
||||
for (int i = 0; i < 2; i++) {
|
||||
int good_size = desired_blocks * warp_size * sizeof(long long);
|
||||
int bad_size = 2 * desired_blocks * warp_size * sizeof(long long);
|
||||
|
||||
HIPCHECK(hipSetDevice((dev + i)));
|
||||
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&good_dev_array[i]),
|
||||
good_size));
|
||||
HIPCHECK(hipMemsetAsync(good_dev_array[i], 0, good_size, streams[i]));
|
||||
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&bad_dev_array[i]),
|
||||
bad_size));
|
||||
HIPCHECK(hipMemsetAsync(bad_dev_array[i], 0, bad_size, streams[i]));
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
/*************************************************************************/
|
||||
/* Launch the kernels ****************************************************/
|
||||
std::cout << "Launching a multi-GPU cooperative kernel with too many ";
|
||||
std::cout << "warps..." << std::endl;
|
||||
|
||||
void *dev_params[2][1];
|
||||
hipLaunchParams md_params[2];
|
||||
for (int i = 0; i < 2; i++) {
|
||||
dev_params[i][0] = reinterpret_cast<void*>(&bad_dev_array[i]);
|
||||
|
||||
md_params[i].func = reinterpret_cast<void*>(test_kernel);
|
||||
md_params[i].gridDim = 2 * desired_blocks;
|
||||
md_params[i].blockDim = warp_size;
|
||||
md_params[i].sharedMem = 0;
|
||||
md_params[i].stream = streams[i];
|
||||
md_params[i].args = dev_params[i];
|
||||
}
|
||||
|
||||
err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
|
||||
if (err != hipErrorCooperativeLaunchTooLarge) {
|
||||
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
|
||||
std::cerr << "with too many warps." << std::endl;
|
||||
std::cerr << "This SHOULD have failed with the error ";
|
||||
std::cerr << "hipErrorCooperativeLaunchTooLarge (";
|
||||
std::cerr << hipErrorCooperativeLaunchTooLarge << ")." << std::endl;
|
||||
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
|
||||
std::cerr << " (" << err << ")" << std::endl;
|
||||
FailFlag = 1;
|
||||
} else {
|
||||
std::cout << "\tProperly saw this return ";
|
||||
std::cout << "hipErrorCooperativeLaunchTooLarge" << std::endl;
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
std::cout << "Launching a multi-GPU cooperative kernel to the same ";
|
||||
std::cout << "device twice..." << std::endl;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
dev_params[i][0] = reinterpret_cast<void*>(&good_dev_array[i]);
|
||||
md_params[i].gridDim = desired_blocks;
|
||||
md_params[i].stream = streams[0];
|
||||
}
|
||||
err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
|
||||
if (err != hipErrorInvalidDevice) {
|
||||
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
|
||||
std::cerr << "to the same device twice." << std::endl;
|
||||
std::cerr << "This SHOULD have failed with the error ";
|
||||
std::cerr << "hipErrorInvalidDevice (";
|
||||
std::cerr << hipErrorInvalidDevice << ")." << std::endl;
|
||||
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
|
||||
std::cerr << " (" << err << ")" << std::endl;
|
||||
FailFlag = 1;
|
||||
} else {
|
||||
std::cout << "\tProperly saw this return ";
|
||||
std::cout << "hipErrorInvalidDevice" << std::endl;
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
std::cout << "Launching a multi-GPU cooperative kernel to the NULL ";
|
||||
std::cout << "stream" << std::endl;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
md_params[i].stream = NULL;
|
||||
}
|
||||
err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
|
||||
if (err != hipErrorInvalidResourceHandle) {
|
||||
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
|
||||
std::cerr << "to the NULL stream." << std::endl;
|
||||
std::cerr << "This SHOULD have failed with the error ";
|
||||
std::cerr << "hipErrorInvalidResourceHandle (";
|
||||
std::cerr << hipErrorInvalidResourceHandle << ")." << std::endl;
|
||||
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
|
||||
std::cerr << " (" << err << ")" << std::endl;
|
||||
FailFlag = 1;
|
||||
} else {
|
||||
std::cout << "\tProperly saw this return ";
|
||||
std::cout << "hipErrorInvalidResourceHandle" << std::endl;
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
std::cout << "Launching a multi-GPU cooperative kernel with two ";
|
||||
std::cout << "different kernels." << std::endl;
|
||||
bool supports_sep_kernels = true;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
md_params[i].stream = streams[i];
|
||||
if (!support_for_separate_kernels((dev + i))) {
|
||||
supports_sep_kernels = false;
|
||||
}
|
||||
}
|
||||
md_params[1].func = reinterpret_cast<void*>(second_test_kernel);
|
||||
err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
|
||||
if ((supports_sep_kernels && err != hipSuccess) ||
|
||||
(!supports_sep_kernels && err != hipErrorInvalidValue)) {
|
||||
if (supports_sep_kernels) {
|
||||
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
|
||||
std::cerr << "with two different kernels." << std::endl;
|
||||
std::cerr << "This SHOULD have succeeded with hipSuccess (";
|
||||
std::cerr << hipSuccess << ")." << std::endl;
|
||||
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
|
||||
std::cerr << " (" << err << ")" << std::endl;
|
||||
} else {
|
||||
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
|
||||
std::cerr << "with two different kernels." << std::endl;
|
||||
std::cerr << "This SHOULD have failed with the error ";
|
||||
std::cerr << "hipErrorInvalidValue (";
|
||||
std::cerr << hipErrorInvalidValue << ")." << std::endl;
|
||||
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
|
||||
std::cerr << " (" << err << ")" << std::endl;
|
||||
}
|
||||
FailFlag = 1;
|
||||
} else {
|
||||
std::cout << "\tProperly saw this return ";
|
||||
if (supports_sep_kernels) {
|
||||
std::cout << "hipSuccess" << std::endl;
|
||||
} else {
|
||||
std::cout << "hipErrorInvalidValue" << std::endl;
|
||||
}
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
std::cout << "Launching a multi-GPU cooperative kernel with two ";
|
||||
std::cout << "different grid sizes." << std::endl;
|
||||
bool supports_sep_sizes = true;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
md_params[i].func = reinterpret_cast<void*>(test_kernel);
|
||||
md_params[i].gridDim = i+1;
|
||||
if (!support_for_separate_grid_sizes((dev + i))) {
|
||||
supports_sep_sizes = false;
|
||||
}
|
||||
}
|
||||
err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
|
||||
if ((supports_sep_sizes && err != hipSuccess) ||
|
||||
(!supports_sep_sizes && err == hipErrorInvalidValue)) {
|
||||
if (supports_sep_sizes) {
|
||||
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
|
||||
std::cerr << "with two different grid sizes." << std::endl;
|
||||
std::cerr << "This SHOULD have succeeded with hipSuccess (";
|
||||
std::cerr << hipSuccess << ")." << std::endl;
|
||||
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
|
||||
std::cerr << " (" << err << ")" << std::endl;
|
||||
} else {
|
||||
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
|
||||
std::cerr << "with two different grid sizes." << std::endl;
|
||||
std::cerr << "This SHOULD have failed with the error ";
|
||||
std::cerr << "hipErrorInvalidValue (";
|
||||
std::cerr << hipErrorInvalidValue << ")." << std::endl;
|
||||
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
|
||||
std::cerr << " (" << err << ")" << std::endl;
|
||||
FailFlag = 1;
|
||||
}
|
||||
} else {
|
||||
std::cout << "\tProperly saw this return ";
|
||||
if (supports_sep_kernels) {
|
||||
std::cout << "hipSuccess" << std::endl;
|
||||
} else {
|
||||
std::cout << "hipErrorInvalidValue" << std::endl;
|
||||
}
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
std::cout << "Launching a multi-GPU cooperative kernel with two ";
|
||||
std::cout << "different block dimensions." << std::endl;
|
||||
supports_sep_sizes = true;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
md_params[i].gridDim = desired_blocks;
|
||||
md_params[i].blockDim = i+1;
|
||||
if (!support_for_separate_block_dims((dev + i))) {
|
||||
supports_sep_sizes = false;
|
||||
}
|
||||
}
|
||||
err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
|
||||
if ((supports_sep_sizes && err != hipSuccess) ||
|
||||
(!supports_sep_sizes && err == hipErrorInvalidValue)) {
|
||||
if (supports_sep_sizes) {
|
||||
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
|
||||
std::cerr << "with two different block dimensions." << std::endl;
|
||||
std::cerr << "This SHOULD have succeeded with hipSuccess (";
|
||||
std::cerr << hipSuccess << ")." << std::endl;
|
||||
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
|
||||
std::cerr << " (" << err << ")" << std::endl;
|
||||
} else {
|
||||
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
|
||||
std::cerr << "with two different block dimensions." << std::endl;
|
||||
std::cerr << "This SHOULD have failed with the error ";
|
||||
std::cerr << "hipErrorInvalidValue (";
|
||||
std::cerr << hipErrorInvalidValue << ")." << std::endl;
|
||||
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
|
||||
std::cerr << " (" << err << ")" << std::endl;
|
||||
FailFlag = 1;
|
||||
}
|
||||
} else {
|
||||
std::cout << "\tProperly saw this return ";
|
||||
if (supports_sep_kernels) {
|
||||
std::cout << "hipSuccess" << std::endl;
|
||||
} else {
|
||||
std::cout << "hipErrorInvalidValue" << std::endl;
|
||||
}
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
std::cout << "Launching a multi-GPU cooperative kernel with block ";
|
||||
std::cout << "dimensions of zero." << std::endl;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
md_params[i].blockDim = 0;
|
||||
}
|
||||
err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
|
||||
if (err != hipErrorInvalidConfiguration) {
|
||||
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
|
||||
std::cerr << "with block dimensions of zero." << std::endl;
|
||||
std::cerr << "This SHOULD have failed with the error ";
|
||||
std::cerr << "hipErrorInvalidConfiguration (";
|
||||
std::cerr << hipErrorInvalidConfiguration << ")." << std::endl;
|
||||
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
|
||||
std::cerr << " (" << err << ")" << std::endl;
|
||||
FailFlag = 1;
|
||||
} else {
|
||||
std::cout << "\tProperly saw this return ";
|
||||
std::cout << "hipErrorInvalidConfiguration" << std::endl;
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
std::cout << "Launching a multi-GPU cooperative kernel with two ";
|
||||
std::cout << "different shared memory sizes." << std::endl;
|
||||
supports_sep_sizes = true;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
md_params[i].blockDim = warp_size;
|
||||
md_params[i].sharedMem = i;
|
||||
if (!support_for_separate_shared_sizes((dev + i))) {
|
||||
supports_sep_sizes = false;
|
||||
}
|
||||
}
|
||||
err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
|
||||
if ((supports_sep_sizes && err != hipSuccess) ||
|
||||
(!supports_sep_sizes && err == hipErrorInvalidValue)) {
|
||||
if (supports_sep_sizes) {
|
||||
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
|
||||
std::cerr << "with two different shared memory sizes." << std::endl;
|
||||
std::cerr << "This SHOULD have succeeded with hipSuccess (";
|
||||
std::cerr << hipSuccess << ")." << std::endl;
|
||||
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
|
||||
std::cerr << " (" << err << ")" << std::endl;
|
||||
} else {
|
||||
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
|
||||
std::cerr << "with two different shared memory sizes." << std::endl;
|
||||
std::cerr << "This SHOULD have failed with the error ";
|
||||
std::cerr << "hipErrorInvalidValue (";
|
||||
std::cerr << hipErrorInvalidValue << ")." << std::endl;
|
||||
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
|
||||
std::cerr << " (" << err << ")" << std::endl;
|
||||
FailFlag = 1;
|
||||
}
|
||||
} else {
|
||||
std::cout << "\tProperly saw this return ";
|
||||
if (supports_sep_kernels) {
|
||||
std::cout << "hipSuccess" << std::endl;
|
||||
} else {
|
||||
std::cout << "hipErrorInvalidValue" << std::endl;
|
||||
}
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
std::cout << "Launching a multi-GPU cooperative kernel with maximum ";
|
||||
std::cout << "number of warps..." << std::endl;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
md_params[i].sharedMem = 0;
|
||||
}
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
|
||||
std::cout << "\tProperly launched." << std::endl;
|
||||
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
for (int m = 0; m < 2; ++m) {
|
||||
HIPCHECK(hipFree(good_dev_array[m]));
|
||||
HIPCHECK(hipFree(bad_dev_array[m]));
|
||||
HIPCHECK(hipStreamDestroy(streams[m]));
|
||||
}
|
||||
if (FailFlag == 1) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (FailFlag == 1) {
|
||||
failed("");
|
||||
} else {
|
||||
passed();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,581 @@
|
||||
/*
|
||||
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
// Test Description:
|
||||
/*The general idea of the application is to test how multi-GPU Cooperative
|
||||
Groups kernel launches to a stream interact with other things that may be
|
||||
simultaneously running in the same streams.
|
||||
|
||||
The HIP specification says that a multi-GPU cooperative launch will wait
|
||||
until all of the streams it's using finish their work. Only then will the
|
||||
cooperative kernel be launched to all of the devices. Then no other work
|
||||
can take part in any of the streams until all of the multi-GPU
|
||||
cooperative work is done.
|
||||
|
||||
However, there are flags that allow you to disable each of these
|
||||
serialization points: hipCooperativeLaunchMultiDeviceNoPreSync and
|
||||
hipCooperativeLaunchMultiDeviceNoPostSync.
|
||||
|
||||
As such, this benchmark tests the following five situations launching
|
||||
to two GPUs (and thus two streams):
|
||||
|
||||
1. Normal multi-GPU cooperative kernel:
|
||||
This should result in the following pattern:
|
||||
Stream 0: Cooperative
|
||||
Stream 1: Cooperative
|
||||
2. Regular kernel launches and multi-GPU cooperative kernel launches
|
||||
with the default flags, resulting in the following pattern:
|
||||
Stream 0: Regular --> Cooperative
|
||||
Stream 1: --> Cooperative --> Regular
|
||||
|
||||
3. Regular kernel launches and multi-GPU cooperative kernel launches
|
||||
that turn off "pre-sync". This should allow a cooperative kernel
|
||||
to launch even if work is already in a stream pointing to
|
||||
another GPU.
|
||||
This should result in the following pattern:
|
||||
Stream 0: Regular --> Cooperative
|
||||
Stream 1: Cooperative --> Regular
|
||||
|
||||
4. Regular kernel launches and multi-GPU cooperative kernel launches
|
||||
that turn off "post-sync". This should allow a new kernel to enter
|
||||
a GPU even if another GPU still has a cooperative kernel on it.
|
||||
This should result in the following pattern:
|
||||
Stream 0: Regular --> Cooperative
|
||||
Stream 1: --> Cooperative--> Regular
|
||||
|
||||
5. Regular kernel launches and multi-GPU cooperative kernel launches
|
||||
that turn off both pre- and post-sync. This should allow any of
|
||||
the kernels to launch to their GPU regardless of the status of
|
||||
other kernels in other multi-GPU stream groups.
|
||||
This should result in the following pattern:
|
||||
Stream 0: Regular --> Cooperative
|
||||
Stream 1: Cooperative --> Regular
|
||||
|
||||
We time how long it takes to run each of these benchmarks and print it as
|
||||
the output of the benchmark. The kernels themselves are just useless time-
|
||||
wasting code so that the kernel takes a meaningful amount of time on the
|
||||
GPU before it exits. We only launch a single wavefront for each kernel, so
|
||||
any serialization should not be because of GPU occupancy concerns.
|
||||
|
||||
If tests 2, 3, and 4 take roughly 3x as long as #1, that implies that
|
||||
cooperative kernels are serialized as expected.
|
||||
|
||||
If test #5 takes roughly twice as long as #1, that implies that the
|
||||
overlap-allowing flags work as expected.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <chrono>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include "test_common.h"
|
||||
|
||||
// Checks the status of a HIP call and compares it with what
// hipGetLastError() reports.
//   errval - status code returned by the HIP call being checked
//   file   - source file of the call site, printed in diagnostics
//   line   - source line of the call site, printed in diagnostics
// hipGetLastError() is queried first, before any diagnostics are printed.
// The test is failed (via failed("")) when errval is not hipSuccess, and
// also when errval differs from the value hipGetLastError() returned.
static inline void hipCheckAndFail(hipError_t errval,
                                   const char *file, int line) {
  const hipError_t reported_err = hipGetLastError();

  if (errval != hipSuccess) {
    std::cerr << "hip error: " << hipGetErrorString(errval) << std::endl;
    std::cerr << " Location: " << file << ":" << line << std::endl;
    failed("");
  }

  // The call's own return value and the runtime's last error should agree.
  const bool codes_agree = (reported_err == errval);
  if (!codes_agree) {
    std::cerr << "Error: the return value of a function was not the same "
              << "as the value returned by hipGetLastError()" << std::endl;
    std::cerr << " Location: " << file << ":" << line << std::endl;
    std::cerr << " Function returned: " << hipGetErrorString(errval)
              << " (" << errval << ")" << std::endl;
    std::cerr << "hipGetLastError() returned: "
              << hipGetErrorString(reported_err)
              << " (" << reported_err << ")" << std::endl;
    failed("");
  }
}
|
||||
// Convenience wrapper around hipCheckAndFail() that supplies the call site's
// __FILE__ and __LINE__. The do { ... } while (0) wrapper makes the macro
// expand to a single statement.
#define hipCheckErr(errval) \
  do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)
|
||||
|
||||
// Reports whether the given device advertises both single-device and
// multi-device cooperative launch support.
//   device_id - HIP device ordinal to query
// Returns 1 when all four indicators (two device attributes and the two
// matching hipDeviceProp_t fields) are non-zero; otherwise prints the reason
// to std::cerr and returns 0. HIP API failures are handled by HIPCHECK.
static int cooperative_groups_support(int device_id) {
  // First consult the per-device attributes.
  int cooperative_attribute;
  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
                                 hipDeviceAttributeCooperativeLaunch,
                                 device_id));
  if (!cooperative_attribute) {
    std::cerr << "Cooperative launch support not available in ";
    std::cerr << "the device attribute for device " << device_id;
    std::cerr << std::endl;
    return 0;
  }

  int multi_gpu_cooperative_attribute;
  HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute,
                                 hipDeviceAttributeCooperativeMultiDeviceLaunch,
                                 device_id));
  if (!multi_gpu_cooperative_attribute) {
    std::cerr << "Multi-GPU cooperative launch support not available in ";
    std::cerr << "the device attribute for device " << device_id;
    std::cerr << std::endl;
    return 0;
  }

  // Then cross-check the same capabilities in the device properties struct.
  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  if (device_properties.cooperativeLaunch == 0) {
    std::cerr << "Cooperative group support not available in ";
    std::cerr << "device properties." << std::endl;
    return 0;
  }
  if (device_properties.cooperativeMultiDeviceLaunch == 0) {
    std::cerr << "Multi-GPU cooperative group support not available in ";
    std::cerr << "device properties." << std::endl;
    return 0;
  }
  return 1;
}
|
||||
|
||||
// Time-wasting kernel used for the multi-GPU cooperative launches.
//   loops    - number of delay iterations each thread performs
//   array    - per-thread accumulator, indexed by the thread's global rank
//              within its own grid
//   fast_gpu - multi-grid grid rank whose threads return immediately, so
//              that grid finishes almost instantly; main() passes -1 when
//              no grid should take the fast path
// Each iteration spins until clock64() has advanced by 1000000 from the
// value sampled at the top of the iteration, then adds clock64() into
// array[rank].
__global__ void test_coop_kernel(unsigned int loops, long long *array,
                                 int fast_gpu) {
  cooperative_groups::multi_grid_group mgrid =
      cooperative_groups::this_multi_grid();
  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;

  if (mgrid.grid_rank() == fast_gpu) {
    return;
  }

  // The counter is unsigned to match `loops`: a signed counter would be
  // compared across signedness and overflow for counts above INT_MAX.
  for (unsigned int i = 0; i < loops; i++) {
    long long start_clock = clock64();
    while (clock64() < (start_clock+1000000)) {}
    array[rank] += clock64();
  }
}
|
||||
|
||||
// Time-wasting kernel used for the regular (non-cooperative) launches.
//   loops - number of delay iterations each thread performs
//   array - per-thread accumulator, indexed by the thread's global rank
// Each iteration spins until clock64() has advanced by 1000000 from the
// value sampled at the top of the iteration, then adds clock64() into
// array[rank].
__global__ void test_kernel(uint32_t loops, unsigned long long *array) {
  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;

  // The counter has the same unsigned type as `loops`: a signed counter
  // would be compared across signedness and overflow above INT_MAX.
  for (uint32_t i = 0; i < loops; i++) {
    long long start_clock = clock64();
    while (clock64() < (start_clock+1000000)) {}
    array[rank] += clock64();
  }
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
hipError_t err;
|
||||
int device_num, FailFlag = 0;
|
||||
uint32_t loops = 2000;
|
||||
uint32_t fast_loops = 1;
|
||||
int32_t fast_gpu = -1;
|
||||
HIPCHECK(hipGetDeviceCount(&device_num));
|
||||
if (device_num < 2) {
|
||||
std::cout << "This test requires atleast two gpus but the system has ";
|
||||
std::cout << " only "<< device_num <<std::endl;
|
||||
std::cout << "The test is skipping with Pass result" << std::endl;
|
||||
passed();
|
||||
}
|
||||
for (int dev = 0; dev < (device_num-1); ++dev) {
|
||||
std::cout << "First device number: " << dev << std::endl;
|
||||
std::cout << "Second device number: " << (dev + 1) << std::endl;
|
||||
std::cout << "Loops: " << loops << std::endl;
|
||||
|
||||
/*************************************************************************/
|
||||
/* Test whether target devices support cooperative groups ****************/
|
||||
for (int i = 0; i < 2; i++) {
|
||||
if (!cooperative_groups_support(dev + i)) {
|
||||
std::cout << "Skipping the test with Pass result.\n";
|
||||
passed();
|
||||
}
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* We will launch enough waves to fill up all of the GPU *****************/
|
||||
int warp_sizes[2];
|
||||
int num_sms[2];
|
||||
hipDeviceProp_t device_properties[2];
|
||||
int warp_size = INT_MAX;
|
||||
int num_sm = INT_MAX;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipGetDeviceProperties(&device_properties[i], (dev + i)));
|
||||
warp_sizes[i] = device_properties[i].warpSize;
|
||||
if (warp_sizes[i] < warp_size) {
|
||||
warp_size = warp_sizes[i];
|
||||
}
|
||||
num_sms[i] = device_properties[i].multiProcessorCount;
|
||||
if (num_sms[i] < num_sm) {
|
||||
num_sm = num_sms[i];
|
||||
}
|
||||
std::cout << "Device " << (i + 1);
|
||||
std::cout << " name: " << device_properties[i].name << std::endl;
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
// Calculate the device occupancy to know how many blocks can be run.
|
||||
int max_blocks_per_sm_arr[2];
|
||||
int max_blocks_per_sm = INT_MAX;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipSetDevice(dev + i));
|
||||
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_blocks_per_sm_arr[i], test_kernel, warp_size, 0));
|
||||
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
|
||||
max_blocks_per_sm = max_blocks_per_sm_arr[i];
|
||||
}
|
||||
}
|
||||
int desired_blocks = 1;
|
||||
|
||||
if (desired_blocks > max_blocks_per_sm * num_sm) {
|
||||
std::cerr << "The requested number of blocks will not fit on the GPU";
|
||||
std::cerr << std::endl;
|
||||
std::cerr << "You requested " << desired_blocks << " but we can only ";
|
||||
std::cerr << "fit " << (max_blocks_per_sm * num_sm) << std::endl;
|
||||
failed("");
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* Create the streams we will use in this test. **************************/
|
||||
hipStream_t streams[2];
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipSetDevice(dev + i));
|
||||
HIPCHECK(hipStreamCreate(&streams[i]));
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* Set up data to pass into the kernelx **********************************/
|
||||
|
||||
// Alocate the host input buffer, and two device-focused buffers that we
|
||||
// will use for our test.
|
||||
unsigned long long *dev_array[2];
|
||||
for (int i = 0; i < 2; i++) {
|
||||
int good_size = desired_blocks * warp_size * sizeof(long long);
|
||||
HIPCHECK(hipSetDevice(dev + i));
|
||||
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]), good_size));
|
||||
HIPCHECK(hipMemsetAsync(dev_array[i], 0, good_size, streams[i]));
|
||||
}
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipSetDevice(dev + i));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* Launch the kernels ****************************************************/
|
||||
void *dev_params[2][3];
|
||||
hipLaunchParams md_params[2];
|
||||
std::chrono::time_point<std::chrono::system_clock> start_time[6];
|
||||
std::chrono::time_point<std::chrono::system_clock> end_time[6];
|
||||
|
||||
std::cout << "Test 0: Launching a multi-GPU cooperative kernel...\n";
|
||||
std::cout << "This should result in the following pattern:" << std::endl;
|
||||
std::cout << "GPU " << dev << ": Long Coop Kernel" << std::endl;
|
||||
std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel" << std::endl;
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
dev_params[i][0] = reinterpret_cast<void*>(&loops);
|
||||
dev_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
|
||||
dev_params[i][2] = reinterpret_cast<void*>(&fast_gpu);
|
||||
md_params[i].func = reinterpret_cast<void*>(test_coop_kernel);
|
||||
md_params[i].gridDim = desired_blocks;
|
||||
md_params[i].blockDim = warp_size;
|
||||
md_params[i].sharedMem = 0;
|
||||
md_params[i].stream = streams[i];
|
||||
md_params[i].args = dev_params[i];
|
||||
}
|
||||
|
||||
start_time[0] = std::chrono::system_clock::now();
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipSetDevice(dev + i));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
}
|
||||
end_time[0] = std::chrono::system_clock::now();
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "Test 1: Launching a multi-GPU cooperative kernel with the ";
|
||||
std::cout << "following pattern:" << std::endl;
|
||||
std::cout << "GPU " << dev << ": Standard Kernel --> Long Coop Kernel\n";
|
||||
std::cout << "GPU " << (dev + 1) << ": --> Coop ";
|
||||
std::cout << "--> Standard Kernel\n";
|
||||
fast_gpu = 1;
|
||||
start_time[1] = std::chrono::system_clock::now();
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[0], loops, dev_array[0]);
|
||||
HIPCHECK(hipGetLastError());
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
|
||||
HIPCHECK(hipSetDevice(dev + 1));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[1], loops, dev_array[1]);
|
||||
HIPCHECK(hipGetLastError());
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipSetDevice(dev + i));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
}
|
||||
end_time[1] = std::chrono::system_clock::now();
|
||||
fast_gpu = -1;
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "Test 2: Launching a multi-GPU cooperative kernel with the ";
|
||||
std::cout << "following pattern:" << std::endl;
|
||||
std::cout << "GPU " << dev << ": Standard Kernel --> Coop" << std::endl;
|
||||
std::cout << "GPU " << (dev + 1) << ": --> Long Coop";
|
||||
std::cout << " Kernel --> ";
|
||||
std::cout << "Standard Kernel\n";
|
||||
fast_gpu = 0;
|
||||
start_time[2] = std::chrono::system_clock::now();
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[0], loops, dev_array[0]);
|
||||
HIPCHECK(hipGetLastError());
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
|
||||
HIPCHECK(hipSetDevice(dev + 1));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[1], loops, dev_array[1]);
|
||||
HIPCHECK(hipGetLastError());
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipSetDevice(dev + i));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
}
|
||||
end_time[2] = std::chrono::system_clock::now();
|
||||
fast_gpu = -1;
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "Test 3: Launching a multi-GPU cooperative kernel with the ";
|
||||
std::cout << "ability to overlap regular and cooperative kernels ";
|
||||
std::cout << "only at the beginning." << std::endl;
|
||||
std::cout << "This should result in the following pattern:" << std::endl;
|
||||
std::cout << "GPU " << dev << ": Standard Kernel --> Coop" << std::endl;
|
||||
std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel --> Standard";
|
||||
std::cout<< " Kernel\n";
|
||||
fast_gpu = 0;
|
||||
start_time[3] = std::chrono::system_clock::now();
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[0], loops, dev_array[0]);
|
||||
HIPCHECK(hipGetLastError());
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
|
||||
hipCooperativeLaunchMultiDeviceNoPreSync));
|
||||
HIPCHECK(hipSetDevice(dev + 1));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[1], loops, dev_array[1]);
|
||||
HIPCHECK(hipGetLastError());
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipSetDevice(dev + i));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
}
|
||||
end_time[3] = std::chrono::system_clock::now();
|
||||
fast_gpu = -1;
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "Test 4: Launching a multi-GPU cooperative kernel with the ";
|
||||
std::cout << "ability to overlap regular and cooperative kernels ";
|
||||
std::cout << "only at the end." << std::endl;
|
||||
std::cout << "This should result in the following pattern:" << std::endl;
|
||||
std::cout << "GPU " << dev << ": Standard Kernel --> Long Coop Kernel\n";
|
||||
std::cout << "GPU " << (dev + 1) << ": --> Coop --> ";
|
||||
std::cout << "Standard Kernel\n";
|
||||
fast_gpu = 1;
|
||||
start_time[4] = std::chrono::system_clock::now();
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[0], loops, dev_array[0]);
|
||||
HIPCHECK(hipGetLastError());
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
|
||||
hipCooperativeLaunchMultiDeviceNoPostSync));
|
||||
HIPCHECK(hipSetDevice(dev + 1));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[1], loops, dev_array[1]);
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipSetDevice(dev + i));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
}
|
||||
end_time[4] = std::chrono::system_clock::now();
|
||||
fast_gpu = -1;
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "Test 5: Launching a multi-GPU cooperative kernel with the ";
|
||||
std::cout << "ability to overlap regular and cooperative kernels";
|
||||
std::cout << std::endl;
|
||||
std::cout << "This should result in the following pattern:" << std::endl;
|
||||
std::cout << "GPU " << dev << ": Standard Kernel --> Long Coop Kernel\n";
|
||||
std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel --> Standard";
|
||||
std::cout << " Kernel\n";
|
||||
start_time[5] = std::chrono::system_clock::now();
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[0], loops, dev_array[0]);
|
||||
HIPCHECK(hipGetLastError());
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
|
||||
hipCooperativeLaunchMultiDeviceNoPreSync |
|
||||
hipCooperativeLaunchMultiDeviceNoPostSync));
|
||||
HIPCHECK(hipSetDevice(dev + 1));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[1], loops, dev_array[1]);
|
||||
HIPCHECK(hipGetLastError());
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipSetDevice(dev + i));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
}
|
||||
end_time[5] = std::chrono::system_clock::now();
|
||||
|
||||
std::chrono::duration<double> single_kernel_time =
|
||||
(end_time[0] - start_time[0]);
|
||||
std::chrono::duration<double> serialized_gpu0_time =
|
||||
(end_time[1] - start_time[1]);
|
||||
std::chrono::duration<double> serialized_gpu1_time =
|
||||
(end_time[2] - start_time[2]);
|
||||
std::chrono::duration<double> pre_overlapped_time =
|
||||
(end_time[3] - start_time[3]);
|
||||
std::chrono::duration<double> post_overlapped_time =
|
||||
(end_time[4] - start_time[4]);
|
||||
std::chrono::duration<double> overlapped_time =
|
||||
(end_time[5] - start_time[5]);
|
||||
|
||||
std::cout << "Test 0: A single kernel on both GPUs took:" << std::endl;
|
||||
std::cout << " " << single_kernel_time.count();
|
||||
std::cout << " seconds" << std::endl;
|
||||
std::cout << std::endl;
|
||||
std::cout << "Test 1: Serialized set of three kernels with GPU0";
|
||||
std::cout << " being long took:";
|
||||
std::cout << " " << serialized_gpu0_time.count();
|
||||
std::cout << " seconds" << std::endl;
|
||||
std::cerr << "Expect between " << (2.7 * single_kernel_time.count());
|
||||
std::cerr << " and ";
|
||||
std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n";
|
||||
std::cout << std::endl;
|
||||
std::cout << "Test 2: Serialized set of three kernels with GPU1";
|
||||
std::cout << " being long took:" << std::endl;
|
||||
std::cout << " " << serialized_gpu1_time.count();
|
||||
std::cout << " seconds" << std::endl;
|
||||
std::cerr << "Expect between " << (2.7 * single_kernel_time.count());
|
||||
std::cerr << " and ";
|
||||
std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n";
|
||||
std::cout << std::endl;
|
||||
std::cout << "Test 3: Multiple kernels with pre-overlap allowed took:\n";
|
||||
std::cout << " " << pre_overlapped_time.count();
|
||||
std::cout << " seconds" << std::endl;
|
||||
std::cerr << "Expect between " << (1.7 * single_kernel_time.count());
|
||||
std::cerr << " and ";
|
||||
std::cerr << (2.3 * single_kernel_time.count()) << " seconds.\n";
|
||||
std::cout << std::endl;
|
||||
std::cout << "Test 4: Multiple kernels with post-overlap allowed took:\n";
|
||||
std::cout << " " << post_overlapped_time.count();
|
||||
std::cout << " seconds" << std::endl;
|
||||
std::cerr << "Expect between " << (1.7 * single_kernel_time.count());
|
||||
std::cerr << " and ";
|
||||
std::cerr << (2.3 * single_kernel_time.count()) << " seconds.";
|
||||
std::cout << std::endl;
|
||||
std::cout << "Test 5: Multiple kernels with overlap allowed took:\n";
|
||||
std::cout << " " << overlapped_time.count();
|
||||
std::cout << " seconds" << std::endl;
|
||||
std::cerr << "Expect between " << (1.8 * single_kernel_time.count());
|
||||
std::cerr << " and ";
|
||||
std::cerr << (2.2 * single_kernel_time.count()) << " seconds.\n";
|
||||
|
||||
// Test that fully not-overlapped kernels take roughly 3x as long as one
|
||||
// cooperative kernel.
|
||||
if (serialized_gpu0_time > 3.3 * single_kernel_time ||
|
||||
serialized_gpu0_time < 2.7 * single_kernel_time) {
|
||||
std::cerr << "ERROR!" << std::endl;
|
||||
std::cerr << "Test 1, the first case where all kernels should be ";
|
||||
std::cerr << "serialized, had a runtime that was very different ";
|
||||
std::cerr << "than what was expected." << std::endl;
|
||||
std::cerr << "Was " << serialized_gpu0_time.count() << " seconds.\n";
|
||||
std::cerr << "Expected between ";
|
||||
std::cerr << (2.7 * single_kernel_time.count()) << " and ";
|
||||
std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n";
|
||||
std::cerr << "Were they truly serialized?" << std::endl;
|
||||
FailFlag = 1;
|
||||
}
|
||||
|
||||
// Test that fully not-overlapped kernels take roughly 3x as long as one
|
||||
// cooperative kernel.
|
||||
if (serialized_gpu1_time > 3.3 * single_kernel_time ||
|
||||
serialized_gpu1_time < 2.7 * single_kernel_time) {
|
||||
std::cerr << "ERROR!" << std::endl;
|
||||
std::cerr << "Test 2, the second case where all kernels should be ";
|
||||
std::cerr << "serialized, had a runtime that was very different ";
|
||||
std::cerr << "than what was expected." << std::endl;
|
||||
std::cerr << "Was " << serialized_gpu1_time.count();
|
||||
std::cerr << " seconds." << std::endl;
|
||||
std::cerr << "Expected between ";
|
||||
std::cerr << (2.7 * single_kernel_time.count()) << " and ";
|
||||
std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n";
|
||||
std::cerr << "Were they truly serialized?" << std::endl;
|
||||
FailFlag = 1;
|
||||
}
|
||||
|
||||
// Test that kernels that can overlap only before the cooperative kernel
|
||||
// launches kernels take roughly the same time (in this case)
|
||||
if (pre_overlapped_time > 2.3 * single_kernel_time ||
|
||||
pre_overlapped_time < 1.7 * single_kernel_time) {
|
||||
std::cerr << "ERROR!" << std::endl;
|
||||
std::cerr << "Test 3, the case where the last kernel is serialized, had ";
|
||||
std::cerr << "a runtime that was very different than what was ";
|
||||
std::cerr << "expected." << std::endl;
|
||||
std::cerr << "Was " << pre_overlapped_time.count() << " seconds.\n";
|
||||
std::cerr << "Expected between ";
|
||||
std::cerr << (1.7 * single_kernel_time.count()) << " and ";
|
||||
std::cerr << (2.3 * single_kernel_time.count()) << " seconds.\n";
|
||||
FailFlag = 1;
|
||||
}
|
||||
|
||||
// Test that kernels that can overlap only after the cooperative kernel
|
||||
// launches kernels take roughly the same time (in this case)
|
||||
if (post_overlapped_time > 2.3 * single_kernel_time ||
|
||||
post_overlapped_time < 1.7 * single_kernel_time) {
|
||||
std::cerr << "ERROR!" << std::endl;
|
||||
std::cerr << "Teste 4, the case where the first kernel is ";
|
||||
std::cerr << "serialized, had a runtime that was very different ";
|
||||
std::cerr << "than what was expected." << std::endl;
|
||||
std::cerr << "Was " << post_overlapped_time.count() << " seconds.\n";
|
||||
std::cerr << "Expected between ";
|
||||
std::cerr << (1.7 * single_kernel_time.count()) << " and ";
|
||||
std::cerr << (2.3 * single_kernel_time.count()) << " seconds.\n";
|
||||
FailFlag = 1;
|
||||
}
|
||||
|
||||
// Test that, with the right flags on the kernel launch, that we prevent
|
||||
// incomplete launches from serializing the cooperative launch streams.
|
||||
if (overlapped_time > 2.2 * single_kernel_time ||
|
||||
overlapped_time < 1.8 * single_kernel_time) {
|
||||
std::cerr << "ERROR!" << std::endl;
|
||||
std::cerr << "Test 5, the case where normal and cooperative kernel ";
|
||||
std::cerr << "launches should overlap, does not appear to have done so.";
|
||||
std::cerr << std::endl;
|
||||
std::cerr << "Was " << overlapped_time.count() << " seconds.\n";
|
||||
std::cerr << "Expected between ";
|
||||
std::cerr << (1.8 * single_kernel_time.count()) << " and ";
|
||||
std::cerr << (2.2 * single_kernel_time.count()) << " seconds.\n";
|
||||
std::cerr << "Is the normal kernel being serialized with the ";
|
||||
std::cerr << "cooperative kernels on different streams?" << std::endl;
|
||||
FailFlag = 1;
|
||||
}
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
HIPCHECK(hipFree(dev_array[k]));
|
||||
HIPCHECK(hipStreamDestroy(streams[k]));
|
||||
}
|
||||
if (FailFlag == 1) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (FailFlag == 1) {
|
||||
failed("");
|
||||
} else {
|
||||
passed();
|
||||
}
|
||||
}
|
||||
+374
@@ -0,0 +1,374 @@
|
||||
/*
|
||||
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
// Test Description:
|
||||
/*The general idea of the application is to launch N warps to all GPUs detected
|
||||
in the HIP system. N is a command-line parameter, but the user should set N
|
||||
small enough that all warps can be on each of the GPUs at the same time.
|
||||
|
||||
All of the warps do a "work loop". Within the work loop, every warp
|
||||
atomically increments a global variable that is shared between both of the
|
||||
target GPUs. The value returned from this atomic increment entirely depends
|
||||
on the order the warps from the GPUs arrive at the atomic instruction. Each
|
||||
warp then stores the result into a global array based on its warp ID.
|
||||
|
||||
We also add a sleep/wait loop into the code so that the last warp runs much
|
||||
slower than everyone else. As such, it should store much larger values than
|
||||
all the other warps.
|
||||
|
||||
If there is no barrier within the loop, then warp 0 will likely get to the
|
||||
global variable the first time while all the other warps have each
|
||||
incremented it many times. If the barrier properly works, then each warp
|
||||
will increment the variable once per time through the loop, and all threads
|
||||
will sleep on the barrier waiting for the last warp to finally catch up.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -rdc=true -gencode arch=compute_60,code=sm_60
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include "test_common.h"
|
||||
|
||||
// Checks that a device reports support for cooperative launches, both on a
// single device and across multiple devices.
//   device_id - HIP device ordinal to query
// Returns 1 if the two device attributes and the two corresponding
// hipDeviceProp_t fields are all non-zero. Otherwise a message naming the
// missing capability is written to std::cerr and 0 is returned. Errors from
// the HIP queries themselves are handled by HIPCHECK.
static int cooperative_groups_support(int device_id) {
  // Capability as reported through the device attribute interface.
  int cooperative_attribute;
  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
                                 hipDeviceAttributeCooperativeLaunch,
                                 device_id));
  if (!cooperative_attribute) {
    std::cerr << "Cooperative launch support not available in ";
    std::cerr << "the device attribute for device " << device_id;
    std::cerr << std::endl;
    return 0;
  }

  int multi_gpu_cooperative_attribute;
  HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute,
                                 hipDeviceAttributeCooperativeMultiDeviceLaunch,
                                 device_id));
  if (!multi_gpu_cooperative_attribute) {
    std::cerr << "Multi-GPU cooperative launch support not available in ";
    std::cerr << "the device attribute for device " << device_id;
    std::cerr << std::endl;
    return 0;
  }

  // Capability as reported through the device properties structure.
  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  if (device_properties.cooperativeLaunch == 0) {
    std::cerr << "Cooperative group support not available in ";
    std::cerr << "device properties." << std::endl;
    return 0;
  }
  if (device_properties.cooperativeMultiDeviceLaunch == 0) {
    std::cerr << "Multi-GPU cooperative group support not available in ";
    std::cerr << "device properties." << std::endl;
    return 0;
  }
  return 1;
}
|
||||
|
||||
// Validates the values recorded by the barrier test.
//   loops       - number of work-loop iterations that were recorded
//   warps       - number of entries recorded per iteration
//   host_buffer - loops * warps recorded values, iteration-major
//   num_devs    - number of devices that contributed increments
// No entry written during iteration i may exceed (i + 1) * warps * num_devs.
// Returns 0 when every entry respects that bound; on the first violation an
// explanation is printed to std::cerr and -1 is returned.
static int verify_barrier_buffer(unsigned int loops, unsigned int warps,
                                 unsigned int *host_buffer,
                                 unsigned int num_devs) {
  const unsigned int per_iteration = warps * num_devs;
  unsigned int ceiling = 0;  // largest value allowed in the current iteration
  unsigned int entry = 0;    // flat index into host_buffer

  for (unsigned int iter = 0; iter != loops; ++iter) {
    ceiling += per_iteration;
    const unsigned int row_end = entry + warps;
    for (; entry != row_end; ++entry) {
      const unsigned int seen = host_buffer[entry];
      if (seen <= ceiling) {
        continue;
      }
      std::cerr << "Barrier failure!" << std::endl;
      std::cerr << " Buffer entry " << entry
                << " contains the value " << seen
                << " but it should not be more than "
                << ceiling << std::endl;
      return -1;
    }
  }

  std::cout << "\tBarriers work properly!" << std::endl;
  return 0;
}
|
||||
|
||||
// Checks one entry of the array shared between the GPUs.
//
// The expected value is rebuilt on the host by replaying `loops` steps that
// start from 0 and alternate "+2" (even step) and "*2" (odd step). This
// matches the kernel's `i % num_grids` schedule when two grids take part.
//
//   loops     - number of kernel loop iterations that were executed
//   array_val - value read back from the shared array
//
// Returns 0 when `array_val` equals the replayed value, -1 otherwise (after
// describing the mismatch on std::cerr).
static int verify_multi_gpu_buffer(unsigned int loops, unsigned int array_val) {
  unsigned int desired_val = 0;
  // The counter is unsigned so that it has the same type as `loops`; the
  // original signed counter was compared against an unsigned bound.
  for (unsigned int i = 0; i < loops; i++) {
    if (i % 2 == 0) {
      desired_val += 2;
    } else {
      desired_val *= 2;
    }
  }
  std::cout << "Desired value is " << desired_val << std::endl;

  if (array_val != desired_val) {
    std::cerr << "ERROR! Multi-grid barrier does not appear to work.";
    std::cerr << std::endl;
    std::cerr << "Expected the multi-GPUs to work together to produce ";
    std::cerr << "the value " << desired_val << std::endl;
    std::cerr << "However, the entry returned from the multi-GPU ";
    std::cerr << "kernel was " << array_val << std::endl;
    return -1;
  }
  std::cout << "\tMulti-GPU barriers appear to work here." << std::endl;
  return 0;
}
|
||||
|
||||
// Kernel run on every participating GPU.
//
//   atomic_val   - per-launch counter incremented once per block per iteration
//   global_array - array indexed by grid rank, updated by all grids
//   array        - output; thread 0 of each block stores the counter value it
//                  observed, one slot per block per iteration
//   loops        - number of iterations
//
// Each iteration has two phases, each closed by a barrier: a per-grid phase
// (atomic increment, then grid.sync) and a multi-grid phase (update of
// global_array, then multi-grid sync).
__global__ void
test_kernel(unsigned int *atomic_val, unsigned int *global_array,
            unsigned int *array, uint32_t loops) {
  namespace cg = cooperative_groups;
  cg::grid_group this_grid_grp = cg::this_grid();
  cg::multi_grid_group all_grids = cg::this_multi_grid();

  unsigned local_rank = this_grid_grp.thread_rank();
  unsigned overall_rank = all_grids.thread_rank();

  int slot = blockIdx.x;
  for (int i = 0; i < loops; i++) {
    // Phase 1. The last thread of the grid is deliberately delayed. If the
    // grid barrier below did not hold, the remaining threads could reach the
    // atomicInc many times before the straggler reaches it once, and the
    // recorded counter values would grow far beyond the number of blocks.
    // With a working barrier every block hits the atomic exactly once per
    // iteration.
    if (local_rank == (this_grid_grp.size() - 1)) {
      const long long began = clock64();
      while (clock64() < (began + 1000000)) {}
    }
    if (threadIdx.x == 0) {
      array[slot] = atomicInc(atomic_val, UINT_MAX);
    }
    this_grid_grp.sync();

    // Phase 2. The last thread of the whole multi-grid is delayed even
    // longer. If the multi-grid barrier below did not hold, the additions
    // and multiplications applied to global_array by the different GPUs
    // would interleave differently and the entries would go out of step.
    if (overall_rank == (all_grids.size() - 1)) {
      const long long began = clock64();
      while (clock64() < (began + 100000000)) {}
    }

    // When i is a multiple of the grid count, add 2 to this grid's own
    // entry; otherwise double the entry that lies i grids further on
    // (wrapping around).
    unsigned my_grid = all_grids.grid_rank();
    unsigned other_entry = (my_grid + i) % all_grids.num_grids();
    if (local_rank == (this_grid_grp.size() - 1)) {
      if (i % all_grids.num_grids() != 0) {
        global_array[other_entry] *= 2;
      } else {
        global_array[my_grid] += 2;
      }
    }
    all_grids.sync();

    // Move on to the next row of the output buffer.
    slot += gridDim.x;
  }
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
hipError_t err;
|
||||
int num_devices = 0;
|
||||
uint32_t loops = 2;
|
||||
uint32_t warps = 10;
|
||||
uint32_t block_size = 1;
|
||||
|
||||
std::cout << "Loops: " << loops << std::endl;
|
||||
std::cout << "Warps: " << warps << std::endl;
|
||||
std::cout << "Block size: " << block_size << std::endl;
|
||||
|
||||
HIPCHECK(hipGetDeviceCount(&num_devices));
|
||||
if (num_devices < 2) {
|
||||
std::cout << "Not enough GPUs to run test." << std::endl;
|
||||
std::cout << "We require at least 2 GPUs, but only found ";
|
||||
std::cout << num_devices << std::endl;
|
||||
std::cout << "Skipping the test with PASSED result\n";
|
||||
passed();
|
||||
}
|
||||
|
||||
uint32_t device_num[num_devices];
|
||||
|
||||
/*************************************************************************/
|
||||
/* Test whether target device supports cooperative groups ****************/
|
||||
for (int i = 0; i < num_devices; i++) {
|
||||
device_num[i] = i;
|
||||
if (!cooperative_groups_support(device_num[i])) {
|
||||
std::cout << "Skipping the test with Pass result.\n";
|
||||
passed();
|
||||
}
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* Test whether the requested size will fit on the GPU *******************/
|
||||
int warp_sizes[num_devices];
|
||||
int num_sms[num_devices];
|
||||
hipDeviceProp_t device_properties[num_devices];
|
||||
int warp_size = INT_MAX;
|
||||
int num_sm = INT_MAX;
|
||||
for (int i = 0; i < num_devices; i++) {
|
||||
HIPCHECK(hipGetDeviceProperties(&device_properties[i], device_num[i]));
|
||||
warp_sizes[i] = device_properties[i].warpSize;
|
||||
if (warp_sizes[i] < warp_size) {
|
||||
warp_size = warp_sizes[i];
|
||||
}
|
||||
num_sms[i] = device_properties[i].multiProcessorCount;
|
||||
if (num_sms[i] < num_sm) {
|
||||
num_sm = num_sms[i];
|
||||
}
|
||||
std::cout << "Device " << (i + 1);
|
||||
std::cout << " name: " << device_properties[i].name << std::endl;
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
int num_threads_in_block = block_size * warp_size;
|
||||
|
||||
// Calculate the device occupancy to know how many blocks can be run.
|
||||
int max_blocks_per_sm_arr[num_devices];
|
||||
int max_blocks_per_sm = INT_MAX;
|
||||
for (int i = 0; i < num_devices; i++) {
|
||||
HIPCHECK(hipSetDevice(device_num[i]));
|
||||
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block, 0));
|
||||
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
|
||||
max_blocks_per_sm = max_blocks_per_sm_arr[i];
|
||||
}
|
||||
}
|
||||
|
||||
int requested_blocks = warps / block_size;
|
||||
if (requested_blocks > max_blocks_per_sm * num_sm) {
|
||||
std::cerr << "Requesting to run " << requested_blocks << " blocks, ";
|
||||
std::cerr << "but we can only guarantee to simultaneously run ";
|
||||
std::cerr << (max_blocks_per_sm * num_sm) << std::endl;
|
||||
failed("");
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* Set up data to pass into the kernel ***********************************/
|
||||
// Each block will output a single value per loop.
|
||||
uint32_t total_buffer_len = requested_blocks*loops;
|
||||
|
||||
// Alocate the buffer that will hold the kernel's output, and which will
|
||||
// also be used to globally synchronize during GWS initialization
|
||||
unsigned int *host_buffer[num_devices];
|
||||
unsigned int *kernel_buffer[num_devices];
|
||||
unsigned int *kernel_atomic[num_devices];
|
||||
hipStream_t streams[num_devices];
|
||||
for (int i = 0; i < num_devices; i++) {
|
||||
host_buffer[i] = (unsigned int*)calloc(total_buffer_len,
|
||||
sizeof(unsigned int));
|
||||
HIPCHECK(hipSetDevice(device_num[i]));
|
||||
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_buffer[i]),
|
||||
total_buffer_len * sizeof(unsigned int)));
|
||||
HIPCHECK(hipMemcpy(kernel_buffer[i], host_buffer[i],
|
||||
total_buffer_len * sizeof(unsigned int),
|
||||
hipMemcpyHostToDevice));
|
||||
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_atomic[i]),
|
||||
sizeof(unsigned int)));
|
||||
HIPCHECK(hipMemset(kernel_atomic[i], 0, sizeof(unsigned int)));
|
||||
HIPCHECK(hipStreamCreate(&streams[i]));
|
||||
}
|
||||
|
||||
// Single kernel atomic shared between both devices; put it on the host
|
||||
unsigned int* global_array;
|
||||
HIPCHECK(hipHostMalloc(reinterpret_cast<void**>(&global_array),
|
||||
num_devices * sizeof(unsigned int), 0));
|
||||
HIPCHECK(hipMemset(global_array, 0, num_devices * sizeof(unsigned int)));
|
||||
|
||||
/*************************************************************************/
|
||||
/* Launch the kernels ****************************************************/
|
||||
std::cout << "Launching a kernel with " << warps << " warps ";
|
||||
std::cout << "in " << requested_blocks << " thread blocks.";
|
||||
std::cout << std::endl;
|
||||
|
||||
void *dev_params[num_devices][4];
|
||||
hipLaunchParams md_params[num_devices];
|
||||
for (int i = 0; i < num_devices; i++) {
|
||||
dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
|
||||
dev_params[i][1] = reinterpret_cast<void*>(&global_array);
|
||||
dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
|
||||
dev_params[i][3] = reinterpret_cast<void*>(&loops);
|
||||
md_params[i].func = reinterpret_cast<void*>(test_kernel);
|
||||
md_params[i].gridDim = requested_blocks;
|
||||
md_params[i].blockDim = num_threads_in_block;
|
||||
md_params[i].sharedMem = 0;
|
||||
md_params[i].stream = streams[i];
|
||||
md_params[i].args = dev_params[i];
|
||||
}
|
||||
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, num_devices, 0));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
/*************************************************************************/
|
||||
/* Read back the buffers and print out its data **************************/
|
||||
for (int dev = 0; dev < num_devices; dev++) {
|
||||
HIPCHECK(hipMemcpy(host_buffer[dev], kernel_buffer[dev],
|
||||
total_buffer_len * sizeof(unsigned int),
|
||||
hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < loops; i++) {
|
||||
for (int dev = 0; dev < num_devices; dev++) {
|
||||
std::cout << "+++++++++++++++++ Device " << dev;
|
||||
std::cout << "+++++++++++++++++" << std::endl;
|
||||
for (unsigned int j = 0; j < requested_blocks; j++) {
|
||||
std::cout << "Buffer entry " << (i*warps+j);
|
||||
std::cout << " (written by warp " << j << ")";
|
||||
std::cout << " is " << host_buffer[dev][i*requested_blocks+j];
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
std::cout << "==========================\n";
|
||||
}
|
||||
for (unsigned int dev = 0; dev < num_devices; dev++) {
|
||||
std::cout << "Testing output from device " << dev << std::endl;
|
||||
int local_ret_val = verify_barrier_buffer(loops, requested_blocks,
|
||||
host_buffer[dev], num_devices);
|
||||
if (local_ret_val) {
|
||||
failed("");
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << std::endl << "The multi-GPU shared updates contain:\n";
|
||||
for (int i = 0; i < num_devices; i++) {
|
||||
std::cout << "Entry " << i << ": ";
|
||||
std::cout << global_array[i] << std::endl;
|
||||
}
|
||||
int flag = 0;
|
||||
for (int dev = 0; dev < num_devices; dev++) {
|
||||
std::cout << "Testing multi-GPU output for entry " << dev << std::endl;
|
||||
int local_ret_val = verify_multi_gpu_buffer(loops, global_array[dev]);
|
||||
if (local_ret_val) {
|
||||
flag = 1;
|
||||
}
|
||||
}
|
||||
for (int k = 0; k < num_devices; ++k) {
|
||||
HIPCHECK(hipFree(kernel_buffer[k]));
|
||||
HIPCHECK(hipFree(kernel_atomic[k]));
|
||||
HIPCHECK(hipStreamDestroy(streams[k]));
|
||||
free(host_buffer[k]);
|
||||
}
|
||||
if (flag == 1) {
|
||||
failed("");
|
||||
} else {
|
||||
passed();
|
||||
}
|
||||
}
|
||||
+233
@@ -0,0 +1,233 @@
|
||||
/*
|
||||
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
// Test Description:
|
||||
/*The general idea of the application is to launch N warps. N is a command-line
|
||||
parameter, but the user should set N small enough that all warps can be on
|
||||
the GPU at the same time.
|
||||
|
||||
All of the warps do a "work loop". Within the work loop, every warp
|
||||
atomically increments a global variable. The value returned from this atomic
|
||||
increment entirely depends on the order the threads arrive at the atomic
|
||||
instruction. Each warp then stores the result into a global array based on its
|
||||
warp ID.
|
||||
|
||||
We also add a sleep/wait loop into the code so that the last warp runs much
|
||||
slower than everyone else. As such, it should store much larger values than
|
||||
all the other warps.
|
||||
|
||||
If there is no barrier within the loop, then the last warp will likely get to
|
||||
the global variable the first time after all the other warps have each
|
||||
incremented it many times. If the barrier properly works, then each warp
|
||||
will increment the variable once per time through the loop, and all threads
|
||||
will sleep on the barrier waiting for the last warp to finally catch up.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include "test_common.h"
|
||||
|
||||
static int cooperative_groups_support(int device_id) {
|
||||
hipError_t err;
|
||||
int cooperative_attribute;
|
||||
HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
|
||||
hipDeviceAttributeCooperativeLaunch, device_id));
|
||||
if (!cooperative_attribute) {
|
||||
std::cerr << "Cooperative launch support not available in ";
|
||||
std::cerr << "the device attribute for device " << device_id;
|
||||
std::cerr << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
hipDeviceProp_t device_properties;
|
||||
HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
|
||||
if (device_properties.cooperativeLaunch == 0) {
|
||||
std::cerr << "Cooperative group support not available in ";
|
||||
std::cerr << "device properties." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Checks the values recorded from the device's atomic counter.
//
// `host_buffer` holds `loops` consecutive rows of `warps` entries. After
// loop iteration i the permitted ceiling is (i + 1) * warps; any entry in
// row i above that ceiling is reported on std::cerr and the function returns
// -1. Returns 0 (and prints a success line) when every entry is within its
// ceiling.
static int verify_barrier_buffer(unsigned int loops, unsigned int warps,
                                 unsigned int *host_buffer) {
  unsigned int ceiling = 0;
  for (unsigned int pass = 0; pass < loops; ++pass) {
    // Each completed pass raises the ceiling by one full round of increments.
    ceiling += warps;
    for (unsigned int lane = 0; lane < warps; ++lane) {
      const unsigned int slot = pass * warps + lane;
      if (host_buffer[slot] <= ceiling) {
        continue;
      }
      std::cerr << "Barrier failure!" << std::endl;
      std::cerr << " Buffer entry " << slot
                << " contains the value " << host_buffer[slot]
                << " but it should not be more than "
                << ceiling << std::endl;
      return -1;
    }
  }
  std::cout << "Barriers work properly!" << std::endl;
  return 0;
}
|
||||
|
||||
// Kernel for the single-GPU grid-barrier test.
//
//   atomic_val - counter incremented once per block per iteration
//   array      - output; thread 0 of each block stores the counter value it
//                observed, one slot per block per iteration
//   loops      - number of iterations
__global__ void
test_kernel(unsigned int *atomic_val, unsigned int *array,
            unsigned int loops) {
  namespace cg = cooperative_groups;
  cg::grid_group whole_grid = cg::this_grid();
  unsigned my_rank = whole_grid.thread_rank();

  int slot = blockIdx.x;
  for (int i = 0; i < loops; i++) {
    // The last thread of the grid is deliberately delayed. If the barrier
    // below did not hold, the remaining threads could reach the atomicInc
    // many times before the straggler reaches it once, and the recorded
    // counter values would grow far beyond the number of blocks. With a
    // working barrier every block hits the atomic exactly once per
    // iteration.
    if (my_rank == (whole_grid.size() - 1)) {
      const long long began = clock64();
      while (clock64() < (began + 1000000)) {}
    }

    if (threadIdx.x == 0) {
      array[slot] = atomicInc(atomic_val, UINT_MAX);
    }
    whole_grid.sync();

    // Move on to the next row of the output buffer.
    slot += gridDim.x;
  }
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
hipError_t err;
|
||||
int device_num;
|
||||
uint32_t loops = 2;
|
||||
uint32_t warps = 10;
|
||||
uint32_t block_size = 1;
|
||||
HIPCHECK(hipGetDeviceCount(&device_num));
|
||||
for (int dev = 0; dev < device_num; ++dev) {
|
||||
std::cout << "Device number: " << dev << std::endl;
|
||||
std::cout << "Loops: " << loops << std::endl;
|
||||
std::cout << "Warps: " << warps << std::endl;
|
||||
std::cout << "Block size: " << block_size << std::endl;
|
||||
|
||||
/*************************************************************************/
|
||||
/* Test whether target device supports cooperative groups ****************/
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
if (!cooperative_groups_support(dev)) {
|
||||
std::cout << "Skipping the test with Pass result.\n";
|
||||
passed();
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* Test whether the requested size will fit on the GPU *******************/
|
||||
int warp_size;
|
||||
int num_sms;
|
||||
int max_blocks_per_sm;
|
||||
hipDeviceProp_t device_properties;
|
||||
HIPCHECK(hipGetDeviceProperties(&device_properties, dev));
|
||||
warp_size = device_properties.warpSize;
|
||||
num_sms = device_properties.multiProcessorCount;
|
||||
|
||||
std::cout << "Device name: " << device_properties.name << std::endl;
|
||||
std::cout << std::endl;
|
||||
|
||||
int num_threads_in_block = block_size * warp_size;
|
||||
|
||||
// Calculate the device occupancy to know how many blocks can be run.
|
||||
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
|
||||
test_kernel, num_threads_in_block, 0));
|
||||
|
||||
int requested_blocks = warps / block_size;
|
||||
if (requested_blocks > max_blocks_per_sm * num_sms) {
|
||||
std::cerr << "Requesting to run " << requested_blocks << " blocks, ";
|
||||
std::cerr << "but we can only guarantee to simultaneously run ";
|
||||
std::cerr << (max_blocks_per_sm * num_sms) << std::endl;
|
||||
failed("");
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* Set up data to pass into the kernel ***********************************/
|
||||
// Each block will output a single value per loop.
|
||||
uint32_t total_buffer_len = requested_blocks*loops;
|
||||
|
||||
// Alocate the buffer that will hold the kernel's output, and which will
|
||||
// also be used to globally synchronize during GWS initialization
|
||||
unsigned int *host_buffer = (unsigned int*)calloc(total_buffer_len,
|
||||
sizeof(unsigned int));
|
||||
|
||||
unsigned int *kernel_buffer;
|
||||
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_buffer),
|
||||
total_buffer_len * sizeof(unsigned int)));
|
||||
HIPCHECK(hipMemcpy(kernel_buffer, host_buffer,
|
||||
total_buffer_len * sizeof(unsigned int),
|
||||
hipMemcpyHostToDevice));
|
||||
|
||||
unsigned int *kernel_atomic;
|
||||
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_atomic),
|
||||
sizeof(unsigned int)));
|
||||
HIPCHECK(hipMemset(kernel_atomic, 0, sizeof(unsigned int)));
|
||||
|
||||
/*************************************************************************/
|
||||
/* Launch the kernel *****************************************************/
|
||||
std::cout << "Launching a kernel with " << warps << " warps ";
|
||||
std::cout << "in " << requested_blocks << " thread blocks.";
|
||||
std::cout << std::endl;
|
||||
|
||||
void *params[3];
|
||||
params[0] = reinterpret_cast<void*>(&kernel_atomic);
|
||||
params[1] = reinterpret_cast<void*>(&kernel_buffer);
|
||||
params[2] = reinterpret_cast<void*>(&loops);
|
||||
HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
|
||||
requested_blocks,
|
||||
num_threads_in_block, params, 0, NULL));
|
||||
|
||||
/*************************************************************************/
|
||||
/* Read back the buffer and print out its data****************************/
|
||||
HIPCHECK(hipMemcpy(host_buffer, kernel_buffer,
|
||||
total_buffer_len * sizeof(unsigned int),
|
||||
hipMemcpyDeviceToHost));
|
||||
|
||||
for (unsigned int i = 0; i < loops; i++) {
|
||||
for (unsigned int j = 0; j < requested_blocks; j++) {
|
||||
std::cout << "Buffer entry " << (i*warps+j);
|
||||
std::cout << " (written by warp " << j << ")";
|
||||
std::cout << " is " << host_buffer[i * requested_blocks + j];
|
||||
std::cout << std::endl;
|
||||
}
|
||||
std::cout << "==========================\n";
|
||||
}
|
||||
int ret_val = verify_barrier_buffer(loops, requested_blocks, host_buffer);
|
||||
HIPCHECK(hipFree(kernel_buffer));
|
||||
HIPCHECK(hipFree(kernel_atomic));
|
||||
if (ret_val == -1) {
|
||||
failed("");
|
||||
} else {
|
||||
passed();
|
||||
}
|
||||
}
|
||||
}
|
||||
+374
@@ -0,0 +1,374 @@
|
||||
/*
|
||||
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
// Test Description:
|
||||
/*The general idea of the application is to launch N warps to each of two GPUs.
|
||||
N is a command-line parameter, but the user should set N small enough that all
|
||||
warps can be on each of the GPUs at the same time.
|
||||
|
||||
All of the warps do a "work loop". Within the work loop, every warp
|
||||
atomically increments a global variable that is shared between both of the
|
||||
target GPUs. The value returned from this atomic increment entirely depends
|
||||
on the order the warps from the GPUs arrive at the atomic instruction. Each
|
||||
warp then stores the result into a global array based on its warp ID.
|
||||
|
||||
We also add a sleep/wait loop into the code so that the last warp runs much
|
||||
slower than everyone else. As such, it should store much larger values than
|
||||
all the other warps.
|
||||
|
||||
If there is no barrier within the loop, then warp 0 will likely get to the
|
||||
global variable the first time while all the other warps have each
|
||||
incremented it many times. If the barrier properly works, then each warp
|
||||
will increment the variable once per time through the loop, and all threads
|
||||
will sleep on the barrier waiting for the last warp to finally catch up.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -rdc=true -gencode arch=compute_60,code=sm_60
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include "test_common.h"
|
||||
|
||||
static int cooperative_groups_support(int device_id) {
|
||||
hipError_t err;
|
||||
int cooperative_attribute;
|
||||
HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
|
||||
hipDeviceAttributeCooperativeLaunch, device_id));
|
||||
if (!cooperative_attribute) {
|
||||
std::cerr << "Cooperative launch support not available in ";
|
||||
std::cerr << "the device attribute for device " << device_id;
|
||||
std::cerr << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int multi_gpu_cooperative_attribute;
|
||||
HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute,
|
||||
hipDeviceAttributeCooperativeMultiDeviceLaunch, device_id));
|
||||
if (!multi_gpu_cooperative_attribute) {
|
||||
std::cerr << "Multi-GPU cooperative launch support not available in ";
|
||||
std::cerr << "the device attribute for device " << device_id;
|
||||
std::cerr << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
hipDeviceProp_t device_properties;
|
||||
HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
|
||||
if (device_properties.cooperativeLaunch == 0) {
|
||||
std::cerr << "Cooperative group support not available in ";
|
||||
std::cerr << "device properties." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
if (device_properties.cooperativeMultiDeviceLaunch == 0) {
|
||||
std::cerr << "Multi-GPU cooperative group support not available in ";
|
||||
std::cerr << "device properties." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Checks the values one device recorded from its atomic counter.
//
// `host_buffer` holds `loops` consecutive rows of `warps` entries. After
// loop iteration i the permitted ceiling is (i + 1) * warps * num_devs; any
// entry in row i above that ceiling is reported on std::cerr and the function
// returns -1. Returns 0 (and prints a success line) when every entry is
// within its ceiling.
static int verify_barrier_buffer(unsigned int loops, unsigned int warps,
                                 unsigned int *host_buffer,
                                 unsigned int num_devs) {
  unsigned int ceiling = 0;
  for (unsigned int pass = 0; pass < loops; ++pass) {
    // Each completed pass raises the ceiling by one full round of increments.
    ceiling += (warps * num_devs);
    for (unsigned int lane = 0; lane < warps; ++lane) {
      const unsigned int slot = pass * warps + lane;
      if (host_buffer[slot] <= ceiling) {
        continue;
      }
      std::cerr << "Barrier failure!" << std::endl;
      std::cerr << " Buffer entry " << slot
                << " contains the value " << host_buffer[slot]
                << " but it should not be more than "
                << ceiling << std::endl;
      return -1;
    }
  }
  std::cout << "\tBarriers work properly!" << std::endl;
  return 0;
}
|
||||
|
||||
// Checks one entry of the array shared between the GPUs.
//
// The expected value is rebuilt on the host by replaying `loops` steps that
// start from 0 and alternate "+2" (even step) and "*2" (odd step). This
// matches the kernel's `i % num_grids` schedule when two grids take part.
//
//   loops     - number of kernel loop iterations that were executed
//   array_val - value read back from the shared array
//
// Returns 0 when `array_val` equals the replayed value, -1 otherwise (after
// describing the mismatch on std::cerr).
static int verify_multi_gpu_buffer(unsigned int loops, unsigned int array_val) {
  unsigned int desired_val = 0;
  // The counter is unsigned so that it has the same type as `loops`; the
  // original signed counter was compared against an unsigned bound.
  for (unsigned int i = 0; i < loops; i++) {
    if (i % 2 == 0) {
      desired_val += 2;
    } else {
      desired_val *= 2;
    }
  }
  std::cout << "Desired value is " << desired_val << std::endl;

  if (array_val != desired_val) {
    std::cerr << "ERROR! Multi-grid barrier does not appear to work.";
    std::cerr << std::endl;
    std::cerr << "Expected the multi-GPUs to work together to produce ";
    std::cerr << "the value " << desired_val << std::endl;
    std::cerr << "However, the entry returned from the multi-GPU ";
    std::cerr << "kernel was " << array_val << std::endl;
    return -1;
  }
  std::cout << "\tMulti-GPU barriers appear to work here." << std::endl;
  return 0;
}
|
||||
|
||||
// Kernel run on each GPU of the pair under test.
//
//   atomic_val   - per-launch counter incremented once per block per iteration
//   global_array - array indexed by grid rank, updated by all grids
//   array        - output; thread 0 of each block stores the counter value it
//                  observed, one slot per block per iteration
//   loops        - number of iterations
//
// Each iteration has two phases, each closed by a barrier: a per-grid phase
// (atomic increment, then grid.sync) and a multi-grid phase (update of
// global_array, then multi-grid sync).
__global__ void
test_kernel(unsigned int *atomic_val, unsigned int *global_array,
            unsigned int *array, uint32_t loops) {
  namespace cg = cooperative_groups;
  cg::grid_group this_grid_grp = cg::this_grid();
  cg::multi_grid_group all_grids = cg::this_multi_grid();

  unsigned local_rank = this_grid_grp.thread_rank();
  unsigned overall_rank = all_grids.thread_rank();

  int slot = blockIdx.x;
  for (int i = 0; i < loops; i++) {
    // Phase 1. The last thread of the grid is deliberately delayed. If the
    // grid barrier below did not hold, the remaining threads could reach the
    // atomicInc many times before the straggler reaches it once, and the
    // recorded counter values would grow far beyond the number of blocks.
    // With a working barrier every block hits the atomic exactly once per
    // iteration.
    if (local_rank == (this_grid_grp.size() - 1)) {
      const long long began = clock64();
      while (clock64() < (began + 1000000)) {}
    }
    if (threadIdx.x == 0) {
      array[slot] = atomicInc(atomic_val, UINT_MAX);
    }
    this_grid_grp.sync();

    // Phase 2. The last thread of the whole multi-grid is delayed even
    // longer. If the multi-grid barrier below did not hold, the additions
    // and multiplications applied to global_array by the two GPUs would
    // interleave differently and the entries would go out of step.
    if (overall_rank == (all_grids.size() - 1)) {
      const long long began = clock64();
      while (clock64() < (began + 100000000)) {}
    }

    // When i is a multiple of the grid count, add 2 to this grid's own
    // entry; otherwise double the entry that lies i grids further on
    // (wrapping around).
    unsigned my_grid = all_grids.grid_rank();
    unsigned other_entry = (my_grid + i) % all_grids.num_grids();
    if (local_rank == (this_grid_grp.size() - 1)) {
      if (i % all_grids.num_grids() != 0) {
        global_array[other_entry] *= 2;
      } else {
        global_array[my_grid] += 2;
      }
    }
    all_grids.sync();

    // Move on to the next row of the output buffer.
    slot += gridDim.x;
  }
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
hipError_t err;
|
||||
int device_num = 0, flag = 0;
|
||||
uint32_t loops = 2;
|
||||
uint32_t warps = 10;
|
||||
uint32_t block_size = 1;
|
||||
HIPCHECK(hipGetDeviceCount(&device_num));
|
||||
if (device_num < 2) {
|
||||
std::cout << "This test needs atleast two gpus but found only";
|
||||
std::cout << device_num << std::endl;
|
||||
std::cout << "Hence skipping the test with pass result\n";
|
||||
passed();
|
||||
}
|
||||
|
||||
for (int d = 0; d < (device_num - 1); ++d) {
|
||||
std::cout << "First device number: " << d << std::endl;
|
||||
std::cout << "Second device number: " << (d + 1) << std::endl;
|
||||
std::cout << "Loops: " << loops << std::endl;
|
||||
std::cout << "Warps: " << warps << std::endl;
|
||||
std::cout << "Block size: " << block_size << std::endl;
|
||||
|
||||
/*************************************************************************/
|
||||
/* Test whether target device supports cooperative groups ****************/
|
||||
for (int i = 0; i < 2; i++) {
|
||||
if (!cooperative_groups_support((d + i))) {
|
||||
std::cout << "Skipping the test with Pass result.\n";
|
||||
passed();
|
||||
}
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* Test whether the requested size will fit on the GPU *******************/
|
||||
int warp_sizes[2];
|
||||
int num_sms[2];
|
||||
hipDeviceProp_t device_properties[2];
|
||||
int warp_size = INT_MAX;
|
||||
int num_sm = INT_MAX;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipGetDeviceProperties(&device_properties[i], (d + i)));
|
||||
warp_sizes[i] = device_properties[i].warpSize;
|
||||
if (warp_sizes[i] < warp_size) {
|
||||
warp_size = warp_sizes[i];
|
||||
}
|
||||
num_sms[i] = device_properties[i].multiProcessorCount;
|
||||
if (num_sms[i] < num_sm) {
|
||||
num_sm = num_sms[i];
|
||||
}
|
||||
std::cout << "Device " << (d + i);
|
||||
std::cout << " name: " << device_properties[i].name << std::endl;
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
int num_threads_in_block = block_size * warp_size;
|
||||
|
||||
// Calculate the device occupancy to know how many blocks can be run.
|
||||
int max_blocks_per_sm_arr[2];
|
||||
int max_blocks_per_sm = INT_MAX;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipSetDevice((d + i)));
|
||||
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block,
|
||||
0));
|
||||
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
|
||||
max_blocks_per_sm = max_blocks_per_sm_arr[i];
|
||||
}
|
||||
}
|
||||
|
||||
int requested_blocks = warps / block_size;
|
||||
if (requested_blocks > max_blocks_per_sm * num_sm) {
|
||||
std::cerr << "Requesting to run " << requested_blocks << " blocks, ";
|
||||
std::cerr << "but we can only guarantee to simultaneously run ";
|
||||
std::cerr << (max_blocks_per_sm * num_sm) << std::endl;
|
||||
failed("");
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* Set up data to pass into the kernel ***********************************/
|
||||
// Each block will output a single value per loop.
|
||||
uint32_t total_buffer_len = requested_blocks*loops;
|
||||
|
||||
// Alocate the buffer that will hold the kernel's output, and which will
|
||||
// also be used to globally synchronize during GWS initialization
|
||||
unsigned int *host_buffer[2];
|
||||
unsigned int *kernel_buffer[2];
|
||||
unsigned int *kernel_atomic[2];
|
||||
hipStream_t streams[2];
|
||||
for (int i = 0; i < 2; i++) {
|
||||
host_buffer[i] = (unsigned int*)calloc(total_buffer_len,
|
||||
sizeof(unsigned int));
|
||||
HIPCHECK(hipSetDevice((d + i)));
|
||||
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_buffer[i]),
|
||||
total_buffer_len * sizeof(unsigned int)));
|
||||
HIPCHECK(hipMemcpy(kernel_buffer[i], host_buffer[i],
|
||||
total_buffer_len * sizeof(unsigned int), hipMemcpyHostToDevice));
|
||||
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_atomic[i]),
|
||||
sizeof(unsigned int)));
|
||||
HIPCHECK(hipMemset(kernel_atomic[i], 0, sizeof(unsigned int)));
|
||||
HIPCHECK(hipStreamCreate(&streams[i]));
|
||||
}
|
||||
|
||||
// Single kernel atomic shared between both devices; put it on the host
|
||||
unsigned int* global_array;
|
||||
HIPCHECK(hipHostMalloc(reinterpret_cast<void**>(&global_array),
|
||||
2 * sizeof(unsigned int), 0));
|
||||
HIPCHECK(hipMemset(global_array, 0, 2 * sizeof(unsigned int)));
|
||||
|
||||
/*************************************************************************/
|
||||
/* Launch the kernels ****************************************************/
|
||||
std::cout << "Launching a kernel with " << warps << " warps ";
|
||||
std::cout << "in " << requested_blocks << " thread blocks.";
|
||||
std::cout << std::endl;
|
||||
|
||||
void *dev_params[2][4];
|
||||
hipLaunchParams md_params[2];
|
||||
for (int i = 0; i < 2; i++) {
|
||||
dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
|
||||
dev_params[i][1] = reinterpret_cast<void*>(&global_array);
|
||||
dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
|
||||
dev_params[i][3] = reinterpret_cast<void*>(&loops);
|
||||
md_params[i].func = reinterpret_cast<void*>(test_kernel);
|
||||
md_params[i].gridDim = requested_blocks;
|
||||
md_params[i].blockDim = num_threads_in_block;
|
||||
md_params[i].sharedMem = 0;
|
||||
md_params[i].stream = streams[i];
|
||||
md_params[i].args = dev_params[i];
|
||||
}
|
||||
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
/*************************************************************************/
|
||||
/* Read back the buffers and print out its data **************************/
|
||||
for (int dev = 0; dev < 2; dev++) {
|
||||
HIPCHECK(hipMemcpy(host_buffer[d + dev], kernel_buffer[d + dev],
|
||||
total_buffer_len * sizeof(unsigned int),
|
||||
hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < loops; i++) {
|
||||
for (int dev = 0; dev < 2; dev++) {
|
||||
std::cout << "+++++++++++++++++ Device " << (d + dev);
|
||||
std::cout << "+++++++++++++++++" << std::endl;
|
||||
for (unsigned int j = 0; j < requested_blocks; j++) {
|
||||
std::cout << "Buffer entry " << (i * warps + j);
|
||||
std::cout << " (written by warp " << j << ")";
|
||||
std::cout << " is " << host_buffer[dev][i * requested_blocks + j];
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
std::cout << "==========================\n";
|
||||
}
|
||||
for (unsigned int dev = 0; dev < 2; dev++) {
|
||||
std::cout << "Testing output from device " << (d + dev) << std::endl;
|
||||
int local_ret_val = verify_barrier_buffer(loops, requested_blocks,
|
||||
host_buffer[dev], 2);
|
||||
if (local_ret_val == -1) {
|
||||
flag = 1;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << std::endl << "The multi-GPU shared updates contain:";
|
||||
std::cout << std::endl;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
std::cout << "Entry " << i << ": ";
|
||||
std::cout << global_array[i] << std::endl;
|
||||
}
|
||||
for (int dev = 0; dev < 2; dev++) {
|
||||
std::cout << "Testing multi-GPU output for entry " << (d + dev);
|
||||
std::cout << std::endl;
|
||||
int local_ret_val = verify_multi_gpu_buffer(loops, global_array[dev]);
|
||||
if (local_ret_val) {
|
||||
flag = 1;
|
||||
}
|
||||
}
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
HIPCHECK(hipFree(kernel_buffer[k]));
|
||||
HIPCHECK(hipFree(kernel_atomic[k]));
|
||||
HIPCHECK(hipStreamDestroy(streams[k]));
|
||||
free(host_buffer[k]);
|
||||
}
|
||||
}
|
||||
if (flag == 1) {
|
||||
failed("");
|
||||
} else {
|
||||
passed();
|
||||
}
|
||||
}
|
||||
@@ -1,173 +1,173 @@
|
||||
/*
|
||||
* Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Test to compare
|
||||
* 1.pciBusID from hipDeviceGetPCIBusId and hipDeviceGetAttribute **
|
||||
* 2.{pciDomainID, pciBusID, pciDeviceID} values hipDeviceGetPCIBusId vs lspci **
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
|
||||
* TEST_NAMED: %t hipDeviceGetPCIBusId-vs-hipDeviceGetAttribute --tests 0x1
|
||||
* TEST_NAMED: %t hipDeviceGetPCIBusId-vs-lspci --tests 0x2 EXCLUDE_HIP_PLATFORM nvcc
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include "test_common.h"
|
||||
#define MAX_DEVICE_LENGTH 20
|
||||
|
||||
static bool getPciBusId(int deviceCount, char hipDeviceList[][MAX_DEVICE_LENGTH]) {
|
||||
for (int i = 0; i < deviceCount; i++) {
|
||||
HIPCHECK(hipDeviceGetPCIBusId(hipDeviceList[i], MAX_DEVICE_LENGTH, i));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool comparePciBusIDWithHipDeviceGetAttribute() {
|
||||
bool testResult = true;
|
||||
int deviceCount = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&deviceCount));
|
||||
HIPASSERT(deviceCount != 0);
|
||||
printf("No.of gpus in the system: %d\n", deviceCount);
|
||||
char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
|
||||
char pciDeviceList[deviceCount][MAX_DEVICE_LENGTH];
|
||||
|
||||
getPciBusId(deviceCount, hipDeviceList);
|
||||
|
||||
for (int i = 0; i < deviceCount; i++) {
|
||||
int pciBusID = -1;
|
||||
int pciDeviceID = -1;
|
||||
int pciDomainID = -1;
|
||||
int tempPciBusId = -1;
|
||||
sscanf(hipDeviceList[i], "%04x:%02x:%02x", &pciDomainID, &pciBusID,
|
||||
&pciDeviceID);
|
||||
HIPCHECK(hipDeviceGetAttribute(&tempPciBusId, hipDeviceAttributePciBusId, i));
|
||||
if (pciBusID != tempPciBusId) {
|
||||
testResult = false;
|
||||
printf("pciBusID from hipDeviceGetPCIBusId mismatched to that from "
|
||||
"hipDeviceGetAttribute for gpu %d\n", i);
|
||||
}
|
||||
}
|
||||
|
||||
printf("pciBusID output of both hipDeviceGetPCIBusId and"
|
||||
" hipDeviceGetAttribute matched for all gpus\n");
|
||||
return testResult;
|
||||
}
|
||||
|
||||
bool compareHipDeviceGetPCIBusIdWithLspci() {
|
||||
FILE *fpipe;
|
||||
bool testResult = false;
|
||||
|
||||
{
|
||||
// Check if lspci is installed, if not, don't proceed
|
||||
char const *cmd = "lspci --version";
|
||||
char *lspciCheck;
|
||||
char temp[20];
|
||||
fpipe = popen(cmd, "r");
|
||||
|
||||
if (fpipe == nullptr) {
|
||||
printf("Unable to create command file\n");
|
||||
return testResult;
|
||||
}
|
||||
|
||||
lspciCheck = fgets(temp, 20, fpipe);
|
||||
pclose(fpipe);
|
||||
|
||||
if (!lspciCheck) {
|
||||
printf("lspci not found. Skipping the test\n");
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
int deviceCount = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&deviceCount));
|
||||
HIPASSERT(deviceCount != 0);
|
||||
printf("No.of gpus in the system: %d\n", deviceCount);
|
||||
char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
|
||||
char pciDeviceList[deviceCount][MAX_DEVICE_LENGTH];
|
||||
|
||||
getPciBusId(deviceCount, hipDeviceList);
|
||||
|
||||
// Get lspci device list and compare with hip device list
|
||||
#if defined(__CUDA_ARCH__)
|
||||
char const *command = "lspci -D | grep controller | grep NVIDIA | "
|
||||
"cut -d ' ' -f 1";
|
||||
#else
|
||||
char const *command = "lspci -D | grep controller | grep AMD/ATI | "
|
||||
"cut -d ' ' -f 1";
|
||||
#endif
|
||||
fpipe = popen(command, "r");
|
||||
|
||||
if (fpipe == nullptr) {
|
||||
printf("Unable to create command file\n");
|
||||
return testResult;
|
||||
}
|
||||
|
||||
int index = 0;
|
||||
int deviceMatchCount = 0;
|
||||
|
||||
while (fgets(pciDeviceList[index], sizeof(pciDeviceList[index]), fpipe)) {
|
||||
bool bMatchFound = false;
|
||||
for (int deviceNo = 0; deviceNo < deviceCount; deviceNo++) {
|
||||
if (!strncmp(pciDeviceList[index], hipDeviceList[deviceNo], 10)) {
|
||||
deviceMatchCount++;
|
||||
bMatchFound = true;
|
||||
}
|
||||
}
|
||||
if (bMatchFound == false) {
|
||||
printf("PCI device: %s is not reported by HIP\n", pciDeviceList[index]);
|
||||
}
|
||||
index++;
|
||||
}
|
||||
|
||||
pclose(fpipe);
|
||||
|
||||
if (deviceMatchCount == deviceCount) {
|
||||
printf("hip and lspci output for {pciDomainID, pciBusID, pciDeviceID} "
|
||||
"matched for all gpus\n");
|
||||
testResult = true;
|
||||
} else {
|
||||
printf("Mismatch in number GPUs reported by HIP with lscpi\n");
|
||||
}
|
||||
return testResult;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
bool testResult = true;
|
||||
HipTest::parseStandardArguments(argc, argv, true);
|
||||
|
||||
if (p_tests & 0x1) {
|
||||
testResult &= comparePciBusIDWithHipDeviceGetAttribute();
|
||||
}
|
||||
|
||||
if (p_tests & 0x2) {
|
||||
#ifdef __unix__
|
||||
testResult &= compareHipDeviceGetPCIBusIdWithLspci();
|
||||
#else
|
||||
printf("Detected non-linux OS. Skipping the test\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
if (testResult) {
|
||||
passed();
|
||||
} else {
|
||||
failed("one or more tests failed\n");
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Test to compare
|
||||
* 1.pciBusID from hipDeviceGetPCIBusId and hipDeviceGetAttribute **
|
||||
* 2.{pciDomainID, pciBusID, pciDeviceID} values hipDeviceGetPCIBusId vs lspci **
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
|
||||
* TEST_NAMED: %t hipDeviceGetPCIBusId-vs-hipDeviceGetAttribute --tests 0x1
|
||||
* TEST_NAMED: %t hipDeviceGetPCIBusId-vs-lspci --tests 0x2 EXCLUDE_HIP_PLATFORM nvcc
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include "test_common.h"
|
||||
#define MAX_DEVICE_LENGTH 20
|
||||
|
||||
static bool getPciBusId(int deviceCount, char hipDeviceList[][MAX_DEVICE_LENGTH]) {
|
||||
for (int i = 0; i < deviceCount; i++) {
|
||||
HIPCHECK(hipDeviceGetPCIBusId(hipDeviceList[i], MAX_DEVICE_LENGTH, i));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool comparePciBusIDWithHipDeviceGetAttribute() {
|
||||
bool testResult = true;
|
||||
int deviceCount = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&deviceCount));
|
||||
HIPASSERT(deviceCount != 0);
|
||||
printf("No.of gpus in the system: %d\n", deviceCount);
|
||||
char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
|
||||
char pciDeviceList[deviceCount][MAX_DEVICE_LENGTH];
|
||||
|
||||
getPciBusId(deviceCount, hipDeviceList);
|
||||
|
||||
for (int i = 0; i < deviceCount; i++) {
|
||||
int pciBusID = -1;
|
||||
int pciDeviceID = -1;
|
||||
int pciDomainID = -1;
|
||||
int tempPciBusId = -1;
|
||||
sscanf(hipDeviceList[i], "%04x:%02x:%02x", &pciDomainID, &pciBusID,
|
||||
&pciDeviceID);
|
||||
HIPCHECK(hipDeviceGetAttribute(&tempPciBusId, hipDeviceAttributePciBusId, i));
|
||||
if (pciBusID != tempPciBusId) {
|
||||
testResult = false;
|
||||
printf("pciBusID from hipDeviceGetPCIBusId mismatched to that from "
|
||||
"hipDeviceGetAttribute for gpu %d\n", i);
|
||||
}
|
||||
}
|
||||
|
||||
printf("pciBusID output of both hipDeviceGetPCIBusId and"
|
||||
" hipDeviceGetAttribute matched for all gpus\n");
|
||||
return testResult;
|
||||
}
|
||||
|
||||
bool compareHipDeviceGetPCIBusIdWithLspci() {
|
||||
FILE *fpipe;
|
||||
bool testResult = false;
|
||||
|
||||
{
|
||||
// Check if lspci is installed, if not, don't proceed
|
||||
char const *cmd = "lspci --version";
|
||||
char *lspciCheck;
|
||||
char temp[20];
|
||||
fpipe = popen(cmd, "r");
|
||||
|
||||
if (fpipe == nullptr) {
|
||||
printf("Unable to create command file\n");
|
||||
return testResult;
|
||||
}
|
||||
|
||||
lspciCheck = fgets(temp, 20, fpipe);
|
||||
pclose(fpipe);
|
||||
|
||||
if (!lspciCheck) {
|
||||
printf("lspci not found. Skipping the test\n");
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
int deviceCount = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&deviceCount));
|
||||
HIPASSERT(deviceCount != 0);
|
||||
printf("No.of gpus in the system: %d\n", deviceCount);
|
||||
char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
|
||||
char pciDeviceList[deviceCount][MAX_DEVICE_LENGTH];
|
||||
|
||||
getPciBusId(deviceCount, hipDeviceList);
|
||||
|
||||
// Get lspci device list and compare with hip device list
|
||||
#if defined(__CUDA_ARCH__)
|
||||
char const *command = "lspci -D | grep controller | grep NVIDIA | "
|
||||
"cut -d ' ' -f 1";
|
||||
#else
|
||||
char const *command = "lspci -D | grep controller | grep AMD/ATI | "
|
||||
"cut -d ' ' -f 1";
|
||||
#endif
|
||||
fpipe = popen(command, "r");
|
||||
|
||||
if (fpipe == nullptr) {
|
||||
printf("Unable to create command file\n");
|
||||
return testResult;
|
||||
}
|
||||
|
||||
int index = 0;
|
||||
int deviceMatchCount = 0;
|
||||
|
||||
while (fgets(pciDeviceList[index], sizeof(pciDeviceList[index]), fpipe)) {
|
||||
bool bMatchFound = false;
|
||||
for (int deviceNo = 0; deviceNo < deviceCount; deviceNo++) {
|
||||
if (!strncmp(pciDeviceList[index], hipDeviceList[deviceNo], 10)) {
|
||||
deviceMatchCount++;
|
||||
bMatchFound = true;
|
||||
}
|
||||
}
|
||||
if (bMatchFound == false) {
|
||||
printf("PCI device: %s is not reported by HIP\n", pciDeviceList[index]);
|
||||
}
|
||||
index++;
|
||||
}
|
||||
|
||||
pclose(fpipe);
|
||||
|
||||
if (deviceMatchCount == deviceCount) {
|
||||
printf("hip and lspci output for {pciDomainID, pciBusID, pciDeviceID} "
|
||||
"matched for all gpus\n");
|
||||
testResult = true;
|
||||
} else {
|
||||
printf("Mismatch in number GPUs reported by HIP with lscpi\n");
|
||||
}
|
||||
return testResult;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
bool testResult = true;
|
||||
HipTest::parseStandardArguments(argc, argv, true);
|
||||
|
||||
if (p_tests & 0x1) {
|
||||
testResult &= comparePciBusIDWithHipDeviceGetAttribute();
|
||||
}
|
||||
|
||||
if (p_tests & 0x2) {
|
||||
#ifdef __unix__
|
||||
testResult &= compareHipDeviceGetPCIBusIdWithLspci();
|
||||
#else
|
||||
printf("Detected non-linux OS. Skipping the test\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
if (testResult) {
|
||||
passed();
|
||||
} else {
|
||||
failed("one or more tests failed\n");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
|
||||
* TEST_NAMED: %t hipSetGetDevice-invalidDevice
|
||||
* TEST_NAMED: %t hipSetGetDevice-allValidDevice
|
||||
* TEST_NAMED: %t hipSetGetDevice-validDev1 --computeDevCnt 1
|
||||
|
||||
@@ -0,0 +1,227 @@
|
||||
/*
|
||||
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/wait.h>
|
||||
#include <fcntl.h>
|
||||
#include <semaphore.h>
|
||||
#include <unistd.h>
|
||||
#include "test_common.h"
|
||||
|
||||
#ifdef __linux__
|
||||
sem_t *sem_ob1 = NULL, *sem_ob2 = NULL;
|
||||
typedef struct mem_handle {
|
||||
int device;
|
||||
hipIpcMemHandle_t memHandle;
|
||||
bool IfTestPassed;
|
||||
} hip_ipc_t;
|
||||
|
||||
class IpcMemHandleTest {
|
||||
public:
|
||||
bool InitFlag = true;
|
||||
hip_ipc_t *shrd_mem = NULL;
|
||||
pid_t pid;
|
||||
size_t N = 1024;
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int *A_d = NULL, out = 0;
|
||||
int *A_h, *C_h;
|
||||
int Num_devices = 0, Data_mismatch, CanAccessPeer = 0;
|
||||
int *Ad1 = NULL, *Ad2 = NULL;
|
||||
IpcMemHandleTest();
|
||||
bool Test();
|
||||
~IpcMemHandleTest();
|
||||
};
|
||||
|
||||
|
||||
bool IpcMemHandleTest::Test() {
|
||||
if (InitFlag == false) {
|
||||
// Abort the test if the initialization fails
|
||||
printf("Resource initialization failed. Hence test skipped!");
|
||||
return false;
|
||||
}
|
||||
pid = fork();
|
||||
if (pid != 0) {
|
||||
// Parent process
|
||||
HIPCHECK(hipGetDeviceCount(&Num_devices));
|
||||
for (int i = 0; i < Num_devices; ++i) {
|
||||
if (shrd_mem->IfTestPassed == true) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
HIPCHECK(hipMalloc(&A_d, Nbytes));
|
||||
HIPCHECK(hipIpcGetMemHandle((hipIpcMemHandle_t *) &shrd_mem->memHandle,
|
||||
A_d));
|
||||
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
shrd_mem->device = i;
|
||||
if ((out=sem_post(sem_ob1)) == -1) {
|
||||
// Need to use inline function to release resources.
|
||||
shrd_mem->IfTestPassed = false;
|
||||
failed("sem_post() call failed in parent process.");
|
||||
}
|
||||
if ((out=sem_wait(sem_ob2)) == -1) {
|
||||
shrd_mem->IfTestPassed = false;
|
||||
failed("sem_wait() call failed in parent process.");
|
||||
}
|
||||
HIPCHECK(hipFree(A_d));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Child process
|
||||
HIPCHECK(hipGetDeviceCount(&Num_devices));
|
||||
for (int j = 0; j < Num_devices; ++j) {
|
||||
if ((out=sem_wait(sem_ob1)) == -1) {
|
||||
shrd_mem->IfTestPassed = false;
|
||||
printf("sem_wait() call failed in child process.");
|
||||
if ((out=sem_post(sem_ob2)) == -1) {
|
||||
printf("sem_post() call on sem_ob2 failed");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < Num_devices; ++i) {
|
||||
Data_mismatch = 0;
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
HIPCHECK(hipMalloc(&Ad2, Nbytes));
|
||||
HIPCHECK(hipIpcOpenMemHandle((void **) &Ad1, shrd_mem->memHandle,
|
||||
hipIpcMemLazyEnablePeerAccess));
|
||||
HIPCHECK(hipDeviceCanAccessPeer(&CanAccessPeer, i, shrd_mem->device));
|
||||
if (CanAccessPeer == 1) {
|
||||
HIPCHECK(hipMemcpy(Ad2, Ad1, Nbytes, hipMemcpyDeviceToDevice));
|
||||
HIPCHECK(hipMemcpy(C_h, Ad2, Nbytes, hipMemcpyDeviceToDevice));
|
||||
for (int i = 0; i < N; ++i) {
|
||||
if (C_h[i] != 123)
|
||||
Data_mismatch++;
|
||||
}
|
||||
if (Data_mismatch != 0) {
|
||||
printf("Data mismatch found when data copied from Ipc memhandle");
|
||||
printf(" to Device: %d\n", i);
|
||||
shrd_mem->IfTestPassed = false;
|
||||
}
|
||||
memset(reinterpret_cast<void*>(C_h), 0, Nbytes);
|
||||
// Checking if the data obtained from Ipc shared memory is consistent
|
||||
HIPCHECK(hipMemcpy(C_h, Ad1, Nbytes, hipMemcpyDeviceToHost));
|
||||
for (int i = 0; i < N; ++i) {
|
||||
if (C_h[i] != 123)
|
||||
Data_mismatch++;
|
||||
}
|
||||
if (Data_mismatch != 0) {
|
||||
printf("Data mismatch found when data copied from Ipc memhandle");
|
||||
printf(" Host.\n");
|
||||
shrd_mem->IfTestPassed = false;
|
||||
}
|
||||
}
|
||||
HIPCHECK(hipIpcCloseMemHandle(reinterpret_cast<void*>(Ad1)));
|
||||
}
|
||||
HIPCHECK(hipFree(Ad2));
|
||||
if ((out=sem_post(sem_ob2)) == -1) {
|
||||
shrd_mem->IfTestPassed = false;
|
||||
printf("sem_post() call on sem_ob2 failed");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if ((out = sem_unlink("/my-sem-object1")) == -1) {
|
||||
printf("sem_unlink() call on /my-sem-object1 failed");
|
||||
}
|
||||
if ((out = sem_unlink("/my-sem-object2")) == -1) {
|
||||
printf("sem_unlink() call on /my-sem-object2 failed");
|
||||
}
|
||||
int status;
|
||||
waitpid(pid, &status, 0);
|
||||
if (shrd_mem->IfTestPassed == false) {
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Sets up everything Test() needs: removes stale semaphore files, creates the
 * two named semaphores, maps the record shared between parent and child, and
 * allocates the host buffers (A_h is filled with 123).
 *
 * Any failure clears InitFlag; Test() checks it before doing anything else.
 */
IpcMemHandleTest::IpcMemHandleTest() {
  std::string cmd_line = "rm -rf /dev/shm/sem.my-sem-object*";
  int res = system(cmd_line.c_str());
  if (res == -1) {
    InitFlag = false;
    printf("System call to remove existing shared objects failed!");
  }
  if ((sem_ob1 = sem_open ("/my-sem-object1", O_CREAT|O_EXCL, 0660, 0)) ==
      SEM_FAILED) {
    InitFlag = false;
    printf("Initialization of 1st semaphore object failed");
  }
  if ((sem_ob2 = sem_open ("/my-sem-object2", O_CREAT|O_EXCL, 0660, 0)) ==
      SEM_FAILED) {
    InitFlag = false;
    printf("Initialization of 2nd semaphore object failed");
  }

  // mmap reports failure with MAP_FAILED, not NULL. An anonymous mapping has
  // no backing file, so pass -1 as the descriptor for portability.
  void* mapping = mmap(NULL, sizeof(hip_ipc_t),
                       PROT_READ | PROT_WRITE,
                       MAP_SHARED | MAP_ANONYMOUS,
                       -1, 0);
  if (mapping == MAP_FAILED) {
    shrd_mem = NULL;
    InitFlag = false;
    printf("mmap() call failed!");
  } else {
    shrd_mem = reinterpret_cast<hip_ipc_t *>(mapping);
    shrd_mem->IfTestPassed = true;
  }

  A_h = reinterpret_cast<int*>(malloc(Nbytes));
  C_h = reinterpret_cast<int*>(malloc(Nbytes));
  if (A_h == NULL || C_h == NULL) {
    InitFlag = false;
    printf("Host buffer allocation failed!");
    return;
  }
  for (size_t i = 0; i < N; i++) {
    A_h[i] = 123;
  }
}
|
||||
|
||||
/**
 * Releases the resources acquired by the constructor and by Test(): the two
 * named semaphores, the shared mapping, the host buffers, and whatever
 * device pointers are still held.
 */
IpcMemHandleTest::~IpcMemHandleTest() {
  // Close this process's handles to the named semaphores; skip the values
  // that mean "never opened" or "open failed".
  if (sem_ob1 != NULL && sem_ob1 != SEM_FAILED) {
    sem_close(sem_ob1);
    sem_ob1 = NULL;
  }
  if (sem_ob2 != NULL && sem_ob2 != SEM_FAILED) {
    sem_close(sem_ob2);
    sem_ob2 = NULL;
  }
  // Only unmap a pointer that refers to a real mapping.
  if (shrd_mem != NULL && shrd_mem != MAP_FAILED) {
    munmap(shrd_mem, sizeof(hip_ipc_t));
    shrd_mem = NULL;
  }
  HIPCHECK(hipFree((A_d)));
  free(A_h);
  free(C_h);
  HIPCHECK(hipFree((Ad1)));
  HIPCHECK(hipFree((Ad2)));
}
|
||||
#endif
|
||||
|
||||
int main() {
|
||||
bool IfTestPassed = true;
|
||||
// The following program spawns a child process and does the following
|
||||
// Parent iterate through each device, create memory -- create hipIpcMemhandle
|
||||
// stores the mem handle in mmaped memory, release the child using sem_post()
|
||||
// and wait for child to release itself(parent process)
|
||||
// child process:
|
||||
// Child process get the ipc mem handle using hipIpcOpenMemHandle
|
||||
// Iterate through all the available gpus and do Device to Device copies
|
||||
// and check for data consistencies and close the hipIpcCloseMemHandle
|
||||
// release the parent and wait for parent to release itself(child)
|
||||
#ifdef __linux__
|
||||
IpcMemHandleTest obj;
|
||||
IfTestPassed = obj.Test();
|
||||
#else
|
||||
printf("This is not a Linux platform. Hence Skipping the test!\n");
|
||||
IfTestPassed = true;
|
||||
#endif
|
||||
if (IfTestPassed == false) {
|
||||
failed("");
|
||||
}
|
||||
passed();
|
||||
}
|
||||
@@ -0,0 +1,487 @@
|
||||
/*
|
||||
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
Testcase Scenarios :
|
||||
|
||||
(TestCase 1)::
|
||||
1) Test hipMalloc() api passing zero size and confirming *ptr returning
|
||||
nullptr. Also pass nullptr to hipFree() api.
|
||||
2) Pass maximum value of size_t for hipMalloc() api and make sure appropriate
|
||||
error is returned.
|
||||
3) Check for hipMalloc() error code, passing invalid/null pointer.
|
||||
|
||||
(TestCase 2)::
|
||||
4) Regress hipMalloc()/hipFree() in loop for bigger chunk of allocation
|
||||
with adequate number of iterations and later test for kernel execution on
|
||||
default gpu.
|
||||
5) Regress hipMalloc()/hipFree() in loop while allocating smaller chunks
|
||||
keeping maximum number of iterations and then run kernel code on default
|
||||
gpu, perform data validation.
|
||||
|
||||
(TestCase 3)::
|
||||
6) Check hipMalloc() api adaptability when app creates small chunks of memory
|
||||
continuously, stores it for later use and then frees it at later point
|
||||
of time.
|
||||
|
||||
(TestCase 4)::
|
||||
7) Run hipMalloc() api/kernel code on same gpu in parallel from parent and child
|
||||
processes, validate the results.
|
||||
|
||||
(TestCase 5)::
|
||||
8) Execute hipMalloc() api simultaneously on all the gpus by spawning multiple
|
||||
child processes. Validate buffers allocated after running kernel code.
|
||||
|
||||
(TestCase 6)::
|
||||
9) Multithread Scenario : Exercise hipMalloc() api in parallel on all gpus from
|
||||
multiple threads and regress the api.
|
||||
|
||||
(TestCases 2, 3, 4, 5, 6)::
|
||||
10) Validate memory usage with hipMemGetInfo() while regressing hipMalloc()
|
||||
api. Check for any possible memory leaks.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
|
||||
* TEST_NAMED: %t hipMalloc_ArgValidation --tests 1
|
||||
* TEST_NAMED: %t hipMalloc_LoopRegression_AllocFreeCycle --tests 2
|
||||
* TEST_NAMED: %t hipMalloc_LoopRegression_AllocPool --tests 3
|
||||
* TEST_NAMED: %t hipMallocChild_Concurrency_DefaultGpu --tests 4
|
||||
* TEST_NAMED: %t hipMallocChild_Concurrency_MultiGpu --tests 5
|
||||
* TEST_NAMED: %t hipMalloc_MultiThreaded_MultiGpu --tests 6
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/wait.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
#include <atomic>
|
||||
|
||||
#include "test_common.h"
|
||||
|
||||
/* Max alloc/free iterations for bigger chunks */
|
||||
#define MAX_ALLOCFREE_BC (10000)
|
||||
|
||||
/* Buffer size for alloc/free cycles */
|
||||
#define BUFF_SIZE_AF (5*1024*1024)
|
||||
|
||||
/* Max alloc/free iterations for smaller chunks */
|
||||
#define MAX_ALLOCFREE_SC (5000000)
|
||||
|
||||
/* Max alloc and pool iterations (TBD) */
|
||||
#define MAX_ALLOCPOOL_ITER (2000000)
|
||||
|
||||
/**
|
||||
* Validates data consistency on supplied gpu
|
||||
*/
|
||||
bool validateMemoryOnGPU(int gpu) {
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int *A_d, *B_d, *C_d;
|
||||
int *A_h, *B_h, *C_h;
|
||||
size_t prevAvl, prevTot, curAvl, curTot;
|
||||
bool TestPassed = true;
|
||||
|
||||
HIPCHECK(hipSetDevice(gpu));
|
||||
HIPCHECK(hipMemGetInfo(&prevAvl, &prevTot));
|
||||
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
|
||||
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
|
||||
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, 0, static_cast<const int*>(A_d),
|
||||
static_cast<const int*>(B_d), C_d, N);
|
||||
|
||||
HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
|
||||
if (!HipTest::checkVectorADD(A_h, B_h, C_h, N)) {
|
||||
printf("Validation PASSED for gpu %d from pid %d\n", gpu, getpid());
|
||||
} else {
|
||||
printf("%s : Validation FAILED for gpu %d from pid %d\n",
|
||||
__func__, gpu, getpid());
|
||||
TestPassed &= false;
|
||||
}
|
||||
|
||||
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
||||
HIPCHECK(hipMemGetInfo(&curAvl, &curTot));
|
||||
|
||||
if ((prevAvl != curAvl) || (prevTot != curTot)) {
|
||||
printf("%s : Memory allocation mismatch observed."
|
||||
"Possible memory leak.", __func__);
|
||||
TestPassed &= false;
|
||||
}
|
||||
|
||||
return TestPassed;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetches Gpu device count
|
||||
*/
|
||||
void getDeviceCount(int *pdevCnt) {
|
||||
#ifdef __linux__
|
||||
int fd[2], val = 0;
|
||||
pid_t childpid;
|
||||
|
||||
// create pipe descriptors
|
||||
pipe(fd);
|
||||
|
||||
// disable visible_devices env from shell
|
||||
unsetenv("ROCR_VISIBLE_DEVICES");
|
||||
unsetenv("HIP_VISIBLE_DEVICES");
|
||||
|
||||
childpid = fork();
|
||||
|
||||
if (childpid > 0) { // Parent
|
||||
close(fd[1]);
|
||||
// parent will wait to read the device cnt
|
||||
read(fd[0], &val, sizeof(val));
|
||||
|
||||
// close the read-descriptor
|
||||
close(fd[0]);
|
||||
|
||||
// wait for child exit
|
||||
wait(NULL);
|
||||
|
||||
*pdevCnt = val;
|
||||
} else if (!childpid) { // Child
|
||||
int devCnt = 1;
|
||||
// writing only, no need for read-descriptor
|
||||
close(fd[0]);
|
||||
|
||||
HIPCHECK(hipGetDeviceCount(&devCnt));
|
||||
// send the value on the write-descriptor:
|
||||
write(fd[1], &devCnt, sizeof(devCnt));
|
||||
|
||||
// close the write descriptor:
|
||||
close(fd[1]);
|
||||
exit(0);
|
||||
} else { // failure
|
||||
*pdevCnt = 1;
|
||||
return;
|
||||
}
|
||||
|
||||
#else
|
||||
HIPCHECK(hipGetDeviceCount(pdevCnt));
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* Regress memory allocation and free in loop
|
||||
*/
|
||||
bool regressAllocInLoop(int gpu) {
|
||||
bool TestPassed = true;
|
||||
size_t tot, avail, ptot, pavail;
|
||||
int i = 0;
|
||||
int *ptr;
|
||||
|
||||
HIPCHECK(hipSetDevice(gpu));
|
||||
|
||||
// Exercise allocation in loop with bigger chunks
|
||||
for (i = 0; i < MAX_ALLOCFREE_BC; i++) {
|
||||
size_t numBytes = BUFF_SIZE_AF;
|
||||
|
||||
HIPCHECK(hipMemGetInfo(&pavail, &ptot));
|
||||
HIPCHECK(hipMalloc(&ptr, numBytes));
|
||||
HIPCHECK(hipMemGetInfo(&avail, &tot));
|
||||
|
||||
if (pavail-avail != numBytes) {
|
||||
printf("LoopAllocation : Memory allocation of %6.2fMB"
|
||||
"not matching with hipMemGetInfo - FAIL\n",
|
||||
numBytes/(1024.0*1024.0));
|
||||
TestPassed &= false;
|
||||
HIPCHECK(hipFree(ptr));
|
||||
break;
|
||||
}
|
||||
|
||||
HIPCHECK(hipFree(ptr));
|
||||
}
|
||||
|
||||
// Exercise allocation in loop with smaller chunks and max iters
|
||||
HIPCHECK(hipMemGetInfo(&pavail, &ptot));
|
||||
|
||||
for (i = 0; i < MAX_ALLOCFREE_SC; i++) {
|
||||
size_t numBytes = 16;
|
||||
|
||||
HIPCHECK(hipMalloc(&ptr, numBytes));
|
||||
|
||||
HIPCHECK(hipFree(ptr));
|
||||
}
|
||||
|
||||
HIPCHECK(hipMemGetInfo(&avail, &tot));
|
||||
|
||||
if ((pavail != avail) || (ptot != tot)) {
|
||||
printf("LoopAllocation : Memory allocation mismatch observed."
|
||||
"Possible memory leak.");
|
||||
TestPassed &= false;
|
||||
}
|
||||
|
||||
return TestPassed;
|
||||
}
|
||||
|
||||
/*
|
||||
* Thread func to regress alloc and check data consistency
|
||||
*/
|
||||
|
||||
std::atomic<bool> g_thTestPassed(true);
|
||||
|
||||
void threadFunc(int gpu) {
|
||||
g_thTestPassed = g_thTestPassed & regressAllocInLoop(gpu);
|
||||
g_thTestPassed = g_thTestPassed & validateMemoryOnGPU(gpu);
|
||||
|
||||
printf("thread execution status on gpu(%d) : %d\n", gpu, g_thTestPassed.load());
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
HipTest::parseStandardArguments(argc, argv, true);
|
||||
|
||||
if (p_tests == 1) { // Arg validation
|
||||
// Test hipMalloc for zero size
|
||||
bool TestPassed = true;
|
||||
int *ptr;
|
||||
|
||||
HIPCHECK(hipMalloc(&ptr, 0));
|
||||
|
||||
// ptr expected to be reset to null ptr
|
||||
if (ptr) {
|
||||
printf("ArgValidation : Failed in zero size test\n");
|
||||
TestPassed &= false;
|
||||
}
|
||||
|
||||
// Free null ptr
|
||||
HIPCHECK(hipFree(ptr));
|
||||
|
||||
// Test hipMalloc for invalid arguments
|
||||
hipError_t ret;
|
||||
|
||||
if ((ret = hipMalloc(NULL, 100)) != hipErrorInvalidValue) {
|
||||
printf("ArgValidation : Inappropritate error value returned"
|
||||
" for invalid argument. Error: '%s'(%d)\n",
|
||||
hipGetErrorString(ret), ret);
|
||||
TestPassed &= false;
|
||||
}
|
||||
|
||||
// Test hipMalloc for Maximum value of size_t
|
||||
if ((ret = hipMalloc(&ptr, std::numeric_limits<std::size_t>::max()))
|
||||
!= hipErrorMemoryAllocation) {
|
||||
printf("ArgValidation : Invalid error returned for max size_t."
|
||||
" Error: '%s'(%d)\n", hipGetErrorString(ret), ret);
|
||||
TestPassed &= false;
|
||||
}
|
||||
|
||||
if (TestPassed) {
|
||||
passed();
|
||||
} else {
|
||||
failed("hipMalloc ArgumentValidation Failure!");
|
||||
}
|
||||
|
||||
} else if (p_tests == 2) { // Loop Regression Alloc/Free Cycle
|
||||
bool TestPassed = true;
|
||||
|
||||
TestPassed &= regressAllocInLoop(0);
|
||||
TestPassed &= validateMemoryOnGPU(0);
|
||||
|
||||
if (TestPassed) {
|
||||
passed();
|
||||
} else {
|
||||
failed("hipMalloc_LoopRegression_AllocFreeCycle Failure!");
|
||||
}
|
||||
|
||||
} else if (p_tests == 3) { // Loop Regression Alloc and Pool
|
||||
size_t avail, tot, pavail, ptot;
|
||||
bool TestPassed = true;
|
||||
hipError_t err;
|
||||
int *ptr;
|
||||
|
||||
std::vector<int *> ptrlist;
|
||||
|
||||
HIPCHECK(hipMemGetInfo(&pavail, &ptot));
|
||||
|
||||
// Allocate small chunks of memory million times
|
||||
for (int i = 0; i < MAX_ALLOCPOOL_ITER; i++) { // Iterations TBD
|
||||
if ((err = hipMalloc(&ptr, 10)) != hipSuccess) {
|
||||
HIPCHECK(hipMemGetInfo(&avail, &tot));
|
||||
|
||||
printf("Loop regression pool allocation failure. "
|
||||
"Total gpu memory : %6.2fMB, Free memory %6.2fMB iter %d error '%s'\n",
|
||||
tot/(1024.0*1024.0), avail/(1024.0*1024.0), i, hipGetErrorString(err));
|
||||
|
||||
TestPassed &= false;
|
||||
break;
|
||||
}
|
||||
|
||||
// Store pointers allocated to emulate memory pool of app
|
||||
ptrlist.push_back(ptr);
|
||||
}
|
||||
|
||||
// Free ptrs at later point of time
|
||||
for ( auto &t : ptrlist ) {
|
||||
HIPCHECK(hipFree(t));
|
||||
}
|
||||
|
||||
HIPCHECK(hipMemGetInfo(&avail, &tot));
|
||||
|
||||
TestPassed &= validateMemoryOnGPU(0);
|
||||
|
||||
if ((pavail != avail) || (ptot != tot)) {
|
||||
printf("%s : Memory allocation mismatch observed. Possible memory leak.",
|
||||
__func__);
|
||||
TestPassed &= false;
|
||||
}
|
||||
|
||||
if (TestPassed) {
|
||||
passed();
|
||||
} else {
|
||||
failed("hipMalloc_LoopRegression_AllocPool failure!");
|
||||
}
|
||||
|
||||
} else if (p_tests == 4) {
|
||||
bool TestPassed = true;
|
||||
|
||||
#ifdef __linux__
|
||||
// Parallel execution of parent and child on gpu0
|
||||
int pid;
|
||||
|
||||
if ((pid = fork()) < 0) {
|
||||
printf("Child_Concurrency_Gpu0 : fork() returned error %d.", pid);
|
||||
TestPassed &= false;
|
||||
|
||||
} else if (!pid) { // Child process
|
||||
bool TestPassedChild = true;
|
||||
|
||||
TestPassedChild = validateMemoryOnGPU(0);
|
||||
|
||||
if (TestPassedChild) {
|
||||
exit(0); // child exit with success status
|
||||
} else {
|
||||
printf("Child_Concurrency_Gpu0 : childpid %d failed\n", getpid());
|
||||
exit(1); // child exit with failure status
|
||||
}
|
||||
|
||||
} else { // Parent process
|
||||
int exitStatus;
|
||||
TestPassed = validateMemoryOnGPU(0);
|
||||
|
||||
pid = wait(&exitStatus);
|
||||
if ( WEXITSTATUS(exitStatus) || ( pid < 0 ) )
|
||||
TestPassed &= false;
|
||||
}
|
||||
#else
|
||||
printf("Test hipMallocChild_Concurrency_DefaultGpu skipped on non-linux\n");
|
||||
#endif
|
||||
|
||||
// TC scenarios specific to linux
|
||||
// are treated as pass in windows.
|
||||
if (TestPassed) {
|
||||
passed();
|
||||
} else {
|
||||
failed("hipMallocChild_Concurrency_DefaultGpu Failed!");
|
||||
}
|
||||
|
||||
} else if (p_tests == 5) {
|
||||
bool TestPassed = true;
|
||||
#ifdef __linux__
|
||||
// Parallel execution on multiple gpus from different child processes
|
||||
int devCnt = 1, pid = 0, cumStatus = 0;
|
||||
|
||||
// Get GPU count
|
||||
getDeviceCount(&devCnt);
|
||||
|
||||
// Spawn child for each GPU
|
||||
for (int gpu = 0; gpu < devCnt; gpu++) {
|
||||
if ((pid = fork()) < 0) {
|
||||
printf("Child_Concurrency_MultiGpu : fork() returned error %d\n", pid);
|
||||
failed("Test Failed!");
|
||||
|
||||
} else if (!pid) { // Child process
|
||||
bool TestPassedChild = true;
|
||||
TestPassedChild = validateMemoryOnGPU(gpu);
|
||||
|
||||
if (TestPassedChild) {
|
||||
exit(0); // child exit with success status
|
||||
} else {
|
||||
printf("Child_Concurrency_MultiGpu : childpid %d failed\n",
|
||||
getpid());
|
||||
exit(1); // child exit with failure status
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Parent shall wait for child to complete
|
||||
for (int i = 0; i < devCnt; i++) {
|
||||
int pidwait = 0, exitStatus;
|
||||
pidwait = wait(&exitStatus);
|
||||
|
||||
if (pidwait < 0) {
|
||||
TestPassed &= false;
|
||||
break;
|
||||
}
|
||||
|
||||
cumStatus |= WEXITSTATUS(exitStatus);
|
||||
}
|
||||
|
||||
// Cummulative status of all child
|
||||
if (cumStatus) {
|
||||
TestPassed &= false;
|
||||
}
|
||||
|
||||
#else
|
||||
printf("Test hipMallocChild_Concurrency_MultiGpu skipped on non-linux\n");
|
||||
#endif
|
||||
|
||||
|
||||
// TC scenarios specific to linux
|
||||
// are treated as pass in windows.
|
||||
if (TestPassed) {
|
||||
passed();
|
||||
} else {
|
||||
failed("hipMallocChild_Concurrency_MultiGpu Failed!");
|
||||
}
|
||||
|
||||
} else if (p_tests == 6) { // Multithreaded multiple gpu execution
|
||||
std::vector<std::thread> threadlist;
|
||||
int devCnt = 1;
|
||||
|
||||
// Get GPU count
|
||||
getDeviceCount(&devCnt);
|
||||
|
||||
|
||||
for (int i = 0; i < devCnt; i++) {
|
||||
threadlist.push_back(std::thread(threadFunc, i));
|
||||
}
|
||||
|
||||
for (auto &t : threadlist) {
|
||||
t.join();
|
||||
}
|
||||
|
||||
if (g_thTestPassed) {
|
||||
passed();
|
||||
} else {
|
||||
failed("hipMalloc_MultiThreaded_MultiGpu Failed!");
|
||||
}
|
||||
} else {
|
||||
failed("Didnt receive any valid option. Try options 1 to 6\n");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,423 @@
|
||||
/*
|
||||
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* Test 6 is disabled */
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
|
||||
* TEST_NAMED: %t hipMallocManaged1 --tests 1
|
||||
* TEST_NAMED: %t hipMallocManaged2 --tests 2
|
||||
* TEST_NAMED: %t hipMallocManagedNegativeTests --tests 3
|
||||
* TEST_NAMED: %t hipMallocManagedMultiChunkSingleDevice --tests 4
|
||||
* TEST_NAMED: %t hipMallocManagedMultiChunkMultiDevice --tests 5 EXCLUDE_HIP_PLATFORM nvcc
|
||||
* TEST_NAMED: %t hipMallocManagedOversubscription --tests 6 EXCLUDE_HIP_PLATFORM rocclr nvcc
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <atomic>
|
||||
#include "test_common.h"
|
||||
#define N 1048576 // equals to (1024*1024)
|
||||
#define INIT_VAL 123
|
||||
|
||||
/*
|
||||
* Kernel function to perform addition operation.
|
||||
*/
|
||||
template <typename T>
|
||||
__global__ void
|
||||
vector_sum(T *Ad1, T *Ad2, size_t NUM_ELMTS) {
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
for (size_t i = offset; i < NUM_ELMTS; i += stride) {
|
||||
Ad2[i] = Ad1[i] + Ad1[i];
|
||||
}
|
||||
}
|
||||
|
||||
// The following Test case tests the following scenario:
|
||||
// A large chunk of hipMallocManaged() memory(Hmm) is created
|
||||
// Equal parts of Hmm is accessed on available gpus and
|
||||
// kernel is launched on acessed chunk of hmm memory
|
||||
// and checks if there are any inconsistencies or access issues
|
||||
bool MultiChunkMultiDevice(int NumDevices) {
|
||||
std::atomic<int> DataMismatch{0};
|
||||
bool IfTestPassed = true;
|
||||
int Counter = 0;
|
||||
unsigned int NUM_ELMS = (1024 * 1024);
|
||||
float *Ad[NumDevices], *Hmm = NULL, *Ah = new float[NUM_ELMS];
|
||||
hipStream_t stream[NumDevices];
|
||||
for (int Oloop = 0; Oloop < NumDevices; ++Oloop) {
|
||||
HIPCHECK(hipSetDevice(Oloop));
|
||||
HIPCHECK(hipMalloc(&Ad[Oloop], NUM_ELMS * sizeof(float)));
|
||||
HIPCHECK(hipMemset(Ad[Oloop], 0, NUM_ELMS * sizeof(float)));
|
||||
HIPCHECK(hipStreamCreate(&stream[Oloop]));
|
||||
}
|
||||
HIPCHECK(hipMallocManaged(&Hmm, (NumDevices * NUM_ELMS * sizeof(float))));
|
||||
for (int i = 0; i < NumDevices; ++i) {
|
||||
for (; Counter < ((i + 1) * NUM_ELMS); ++Counter) {
|
||||
Hmm[Counter] = INIT_VAL + i;
|
||||
}
|
||||
}
|
||||
const unsigned threadsPerBlock = 256;
|
||||
const unsigned blocks = (NUM_ELMS + 255)/256;
|
||||
for (int Klaunch = 0; Klaunch < NumDevices; ++Klaunch) {
|
||||
vector_sum<float> <<<blocks, threadsPerBlock, 0, stream[Klaunch]>>>
|
||||
(&Hmm[Klaunch * NUM_ELMS], Ad[Klaunch], NUM_ELMS);
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
for (int m = 0; m < NumDevices; ++m) {
|
||||
HIPCHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float),
|
||||
hipMemcpyDeviceToHost));
|
||||
for (int n = 0; n < NUM_ELMS; ++n) {
|
||||
if (Ah[n] != ((INIT_VAL + m) * 2)) {
|
||||
DataMismatch++;
|
||||
}
|
||||
}
|
||||
memset(reinterpret_cast<void*>(Ah), 0, NUM_ELMS * sizeof(float));
|
||||
}
|
||||
if (DataMismatch.load() != 0) {
|
||||
printf("MultiChunkMultiDevice: Mismatch observed!\n");
|
||||
IfTestPassed = false;
|
||||
}
|
||||
for (int i = 0; i < NumDevices; ++i) {
|
||||
HIPCHECK(hipFree(Ad[i]));
|
||||
HIPCHECK(hipStreamDestroy(stream[i]));
|
||||
}
|
||||
HIPCHECK(hipFree(Hmm));
|
||||
free(Ah);
|
||||
return IfTestPassed;
|
||||
}
|
||||
|
||||
// The following Test case tests the following scenario:
|
||||
// A large chunk of hipMallocManaged() memory(Hmm) is created
|
||||
// Equal parts of Hmm is accessed and
|
||||
// kernel is launched on acessed chunk of hmm memory
|
||||
// and checks if there are any inconsistencies or access issues
|
||||
|
||||
bool MultiChunkSingleDevice(int NumDevices) {
|
||||
std::atomic<int> DataMismatch{0};
|
||||
int Chunks = 4, Counter = 0;
|
||||
bool IfTestPassed = true;
|
||||
unsigned int NUM_ELMS = (1024 * 1024);
|
||||
float *Ad[Chunks], *Hmm = NULL, *Ah = new float[NUM_ELMS];
|
||||
hipStream_t stream[Chunks];
|
||||
for (int i = 0; i < Chunks; ++i) {
|
||||
HIPCHECK(hipMalloc(&Ad[i], NUM_ELMS * sizeof(float)));
|
||||
HIPCHECK(hipMemset(Ad[i], 0, NUM_ELMS * sizeof(float)));
|
||||
HIPCHECK(hipStreamCreate(&stream[i]));
|
||||
}
|
||||
HIPCHECK(hipMallocManaged(&Hmm, (Chunks * NUM_ELMS * sizeof(float))));
|
||||
for (int i = 0; i < Chunks; ++i) {
|
||||
for (; Counter < ((i + 1) * NUM_ELMS); ++Counter) {
|
||||
Hmm[Counter] = (INIT_VAL + i);
|
||||
}
|
||||
}
|
||||
const unsigned threadsPerBlock = 256;
|
||||
const unsigned blocks = (NUM_ELMS + 255)/256;
|
||||
for (int k = 0; k < Chunks; ++k) {
|
||||
vector_sum<float> <<<blocks, threadsPerBlock, 0, stream[k]>>>
|
||||
(&Hmm[k * NUM_ELMS], Ad[k], NUM_ELMS);
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
for (int m = 0; m < Chunks; ++m) {
|
||||
HIPCHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float),
|
||||
hipMemcpyDeviceToHost));
|
||||
for (int n = 0; n < NUM_ELMS; ++n) {
|
||||
if (Ah[n] != ((INIT_VAL + m) * 2)) {
|
||||
DataMismatch++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (DataMismatch.load() != 0) {
|
||||
printf("MultiChunkSingleDevice: Mismatch observed!\n");
|
||||
IfTestPassed = false;
|
||||
}
|
||||
for (int i = 0; i < Chunks; ++i) {
|
||||
HIPCHECK(hipFree(Ad[i]));
|
||||
HIPCHECK(hipStreamDestroy(stream[i]));
|
||||
}
|
||||
HIPCHECK(hipFree(Hmm));
|
||||
free(Ah);
|
||||
return IfTestPassed;
|
||||
}
|
||||
|
||||
// The following tests oversubscription hipMallocManaged() api
|
||||
// Currently disabled.
|
||||
bool TestOversubscriptionMallocManaged(int NumDevices) {
|
||||
bool IfTestPassed = true;
|
||||
hipError_t err;
|
||||
void *A = NULL;
|
||||
size_t total = 0, free = 0;
|
||||
HIPCHECK(hipMemGetInfo(&free, &total));
|
||||
// ToDo: In case of HMM, memory over-subscription is allowed. Hence, relook
|
||||
// into how out of memory can be tested.
|
||||
// Demanding more mem size than available
|
||||
err = hipMallocManaged(&A, (free +1), hipMemAttachGlobal);
|
||||
if (hipErrorOutOfMemory != err) {
|
||||
printf("hipMallocManaged: Returned %s for size value > device memory\n",
|
||||
hipGetErrorString(err));
|
||||
IfTestPassed = false;
|
||||
}
|
||||
|
||||
return IfTestPassed;
|
||||
}
|
||||
|
||||
// The following test does negative testing of hipMallocManaged() api
|
||||
// by passing invalid values and check if the behavior is as expected
|
||||
bool NegativeTestsMallocManaged(int NumDevices) {
|
||||
bool IfTestPassed = true;
|
||||
hipError_t err;
|
||||
void *A = NULL;
|
||||
size_t total = 0, free = 0;
|
||||
HIPCHECK(hipMemGetInfo(&free, &total));
|
||||
|
||||
err = hipMallocManaged(NULL, 1024, hipMemAttachGlobal);
|
||||
if (hipErrorInvalidValue != err) {
|
||||
printf("hipMallocManaged: Returned %s when devPtr is null\n",
|
||||
hipGetErrorString(err));
|
||||
IfTestPassed = false;
|
||||
}
|
||||
|
||||
err = hipMallocManaged(&A, 0, hipMemAttachGlobal);
|
||||
if (hipErrorInvalidValue != err) {
|
||||
printf("hipMallocManaged: Returned %s when size is 0\n",
|
||||
hipGetErrorString(err));
|
||||
IfTestPassed = false;
|
||||
}
|
||||
|
||||
err = hipMallocManaged(NULL, 0, hipMemAttachGlobal);
|
||||
if (hipErrorInvalidValue != err) {
|
||||
printf("hipMallocManaged: Returned %s when devPtr & size is null & 0\n",
|
||||
hipGetErrorString(err));
|
||||
IfTestPassed = false;
|
||||
}
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
// The flag hipMemAttachHost is currently not supported therefore
|
||||
// api should return "hipErrorInvalidValue" for now
|
||||
err = hipMallocManaged(&A, 1024, hipMemAttachHost);
|
||||
if (hipErrorInvalidValue != err) {
|
||||
printf("hipMallocManaged: Returned %s for 'hipMemAttachHost' flag\n",
|
||||
hipGetErrorString(err));
|
||||
IfTestPassed = false;
|
||||
}
|
||||
#endif // __HIP_PLATFORM_HCC__
|
||||
|
||||
err = hipMallocManaged(NULL, 0, 0);
|
||||
if (hipErrorInvalidValue != err) {
|
||||
printf("hipMallocManaged: Returned %s when params are null, 0, 0\n",
|
||||
hipGetErrorString(err));
|
||||
IfTestPassed = false;
|
||||
}
|
||||
|
||||
err = hipMallocManaged(&A, 1024, 145);
|
||||
if (hipErrorInvalidValue != err) {
|
||||
printf("hipMallocManaged: Returned %s when flag param is numerical 145\n",
|
||||
hipGetErrorString(err));
|
||||
IfTestPassed = false;
|
||||
}
|
||||
|
||||
err = hipMallocManaged(&A, -10, hipMemAttachGlobal);
|
||||
if (hipErrorOutOfMemory != err) {
|
||||
printf("hipMallocManaged: Returned %s for negative size value.\n",
|
||||
hipGetErrorString(err));
|
||||
IfTestPassed = false;
|
||||
}
|
||||
|
||||
return IfTestPassed;
|
||||
}
|
||||
|
||||
|
||||
// Allocate two pointers using hipMallocManaged(), initialize,
|
||||
// then launch kernel using these pointers directly and
|
||||
// later validate the content without using any Memcpy.
|
||||
template <typename T>
|
||||
bool TestMallocManaged2(int NumDevices) {
|
||||
bool IfTestPassed = true;
|
||||
T *Hmm1 = NULL, *Hmm2 = NULL;
|
||||
|
||||
for (int i = 0; i < NumDevices; ++i) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
std::atomic<int> DataMismatch{0};
|
||||
HIPCHECK(hipMallocManaged(&Hmm1, N * sizeof(T)));
|
||||
HIPCHECK(hipMallocManaged(&Hmm2, N * sizeof(T)));
|
||||
for (int m = 0; m < N; ++m) {
|
||||
Hmm1[m] = m;
|
||||
Hmm2[m] = 0;
|
||||
}
|
||||
const unsigned threadsPerBlock = 256;
|
||||
const unsigned blocks = (N + 255)/256;
|
||||
// Kernel launch
|
||||
vector_sum <<<blocks, threadsPerBlock>>> (Hmm1, Hmm2, N);
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
for (int v = 0; v < N; ++v) {
|
||||
if (Hmm2[v] != (v + v)) {
|
||||
DataMismatch++;
|
||||
}
|
||||
}
|
||||
if (DataMismatch.load() != 0) {
|
||||
IfTestPassed = false;
|
||||
}
|
||||
HIPCHECK(hipFree(Hmm1));
|
||||
HIPCHECK(hipFree(Hmm2));
|
||||
}
|
||||
return IfTestPassed;
|
||||
}
|
||||
|
||||
// In the following test, a memory is created using hipMallocManaged() by
|
||||
// setting a device and verified if it is accessible when the context is set
|
||||
// to all other devices. This include verification and Device two Device
|
||||
// transfers and kernel launch o discover if there any access issues.
|
||||
|
||||
template <typename T>
|
||||
bool TestMallocManaged1(int NumDevices) {
|
||||
std::atomic<unsigned int> DataMismatch;
|
||||
bool TestPassed = true;
|
||||
T *Ah1 = new T[N], *Ah2 = new T[N], *Ad = NULL, *Hmm = NULL;
|
||||
|
||||
for (int i =0; i < N; ++i) {
|
||||
Ah1[i] = INIT_VAL;
|
||||
Ah2[i] = 0;
|
||||
}
|
||||
for (int Oloop = 0; Oloop < NumDevices; ++Oloop) {
|
||||
DataMismatch = 0;
|
||||
HIPCHECK(hipSetDevice(Oloop));
|
||||
HIPCHECK(hipMallocManaged(&Hmm, N * sizeof(T)));
|
||||
for (int Iloop = 0; Iloop < NumDevices; ++Iloop) {
|
||||
HIPCHECK(hipSetDevice(Iloop));
|
||||
HIPCHECK(hipMalloc(&Ad, N * sizeof(T)));
|
||||
// Copy data from host to hipMallocMananged memory and verify
|
||||
HIPCHECK(hipMemcpy(Hmm, Ah1, N * sizeof(T), hipMemcpyHostToDevice));
|
||||
for (int v = 0; v < N; ++v) {
|
||||
if (Hmm[v] != INIT_VAL) {
|
||||
DataMismatch++;
|
||||
}
|
||||
}
|
||||
if (DataMismatch.load() != 0) {
|
||||
printf("Mismatch is observed with host data at device %d", Iloop);
|
||||
printf(" while hipMallocManaged memory set to the device %d\n", Oloop);
|
||||
TestPassed = false;
|
||||
DataMismatch = 0;
|
||||
}
|
||||
// Executing D2D transfer with hipMallocManaged memory and verify
|
||||
HIPCHECK(hipMemcpy(Ad, Hmm, N * sizeof(T), hipMemcpyDeviceToDevice));
|
||||
HIPCHECK(hipMemcpy(Ah2, Ad, N * sizeof(T), hipMemcpyDeviceToHost));
|
||||
for (int k = 0; k < N; ++k) {
|
||||
if (Ah2[k] != INIT_VAL) {
|
||||
DataMismatch++;
|
||||
}
|
||||
}
|
||||
if (DataMismatch.load() != 0) {
|
||||
printf("Mismatch is observed with D2D transfer at device %d\n", Iloop);
|
||||
printf(" while hipMallocManaged memory set to the device %d\n", Oloop);
|
||||
TestPassed = false;
|
||||
DataMismatch = 0;
|
||||
}
|
||||
HIPCHECK(hipMemset(Ad, 0, N * sizeof(T)));
|
||||
const unsigned threadsPerBlock = 256;
|
||||
const unsigned blocks = (N + 255)/256;
|
||||
// Launching the kernel to check if there is any access issue with
|
||||
// hipMallocManaged memory and local device's memory
|
||||
vector_sum <<<blocks, threadsPerBlock>>> (Hmm, Ad, N);
|
||||
hipDeviceSynchronize();
|
||||
HIPCHECK(hipMemcpy(Ah2, Ad, N * sizeof(T), hipMemcpyDeviceToHost));
|
||||
for (int m = 0; m < N; ++m) {
|
||||
if (Ah2[m] != 246) {
|
||||
DataMismatch++;
|
||||
}
|
||||
}
|
||||
if (DataMismatch.load() != 0) {
|
||||
printf("Data Mismatch observed after kernel lch device %d\n", Iloop);
|
||||
TestPassed = false;
|
||||
DataMismatch = 0;
|
||||
}
|
||||
HIPCHECK(hipFree(Ad));
|
||||
}
|
||||
HIPCHECK(hipFree(Hmm));
|
||||
}
|
||||
free(Ah1);
|
||||
free(Ah2);
|
||||
return TestPassed;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
HipTest::parseStandardArguments(argc, argv, true);
|
||||
|
||||
if ((p_tests <= 0) || (p_tests > 5)) {
|
||||
failed("Valid arguments are from 1 to 5");
|
||||
}
|
||||
|
||||
int NumDevices = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&NumDevices));
|
||||
bool TestStatus = true, OverAllStatus = true;
|
||||
if (p_tests == 1) {
|
||||
TestStatus = TestMallocManaged1<float>(NumDevices);
|
||||
if (!TestStatus) {
|
||||
printf("Test Failed with float datatype.\n");
|
||||
OverAllStatus = false;
|
||||
}
|
||||
TestStatus = TestMallocManaged1<int>(NumDevices);
|
||||
if (!TestStatus) {
|
||||
printf("Test Failed with int datatype.\n");
|
||||
OverAllStatus = false;
|
||||
}
|
||||
TestStatus = TestMallocManaged1<unsigned char>(NumDevices);
|
||||
if (!TestStatus) {
|
||||
printf("Test Failed with unsigned char datatype.\n");
|
||||
OverAllStatus = false;
|
||||
}
|
||||
TestStatus = TestMallocManaged1<double>(NumDevices);
|
||||
if (!TestStatus) {
|
||||
printf("Test Failed with double datatype.\n");
|
||||
OverAllStatus = false;
|
||||
}
|
||||
if (!OverAllStatus) {
|
||||
failed("");
|
||||
}
|
||||
}
|
||||
if (p_tests == 2) {
|
||||
TestStatus = TestMallocManaged2<float>(NumDevices);
|
||||
if (!TestStatus) {
|
||||
failed("Test Failed with float datatype.");
|
||||
}
|
||||
}
|
||||
if (p_tests == 3) {
|
||||
TestStatus = NegativeTestsMallocManaged(NumDevices);
|
||||
if (!TestStatus) {
|
||||
failed("Negative Tests with hipMallocManaged() failed!.");
|
||||
}
|
||||
}
|
||||
if (p_tests == 4) {
|
||||
TestStatus = MultiChunkSingleDevice(NumDevices);
|
||||
if (!TestStatus) {
|
||||
failed("hipMallocManaged: MultiChunkSingleDevice test failed!");
|
||||
}
|
||||
}
|
||||
if (p_tests == 5) {
|
||||
TestStatus = MultiChunkMultiDevice(NumDevices);
|
||||
if (!TestStatus) {
|
||||
failed("hipMallocManaged: MultiChunkMultiDevice test failed!");
|
||||
}
|
||||
}
|
||||
if (p_tests == 6) {
|
||||
TestStatus = TestOversubscriptionMallocManaged(NumDevices);
|
||||
if (!TestStatus) {
|
||||
failed("hipMallocManaged: TestOversubscriptionMallocManaged failed!");
|
||||
}
|
||||
}
|
||||
passed();
|
||||
}
|
||||
@@ -75,6 +75,9 @@ int main() {
|
||||
HIPCHECK(hipFree(Z_d));
|
||||
} else {
|
||||
std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
|
||||
if (hip_skip_tests_enabled()) {
|
||||
return hip_skip_retcode();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -81,6 +81,9 @@ int main() {
|
||||
HIPCHECK(hipFree(Z_d));
|
||||
} else {
|
||||
std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
|
||||
if (hip_skip_tests_enabled()) {
|
||||
return hip_skip_retcode();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -77,6 +77,9 @@ int main() {
|
||||
HIPCHECK(hipFree(Z_d));
|
||||
} else {
|
||||
std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
|
||||
if (hip_skip_tests_enabled()) {
|
||||
return hip_skip_retcode();
|
||||
}
|
||||
}
|
||||
}
|
||||
passed();
|
||||
|
||||
@@ -83,6 +83,9 @@ int main() {
|
||||
HIPCHECK(hipFree(Z_d));
|
||||
} else {
|
||||
std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
|
||||
if (hip_skip_tests_enabled()) {
|
||||
return hip_skip_retcode();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
@@ -24,7 +24,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
// Test for hipMemset2D functionality for different width and height values
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
|
||||
* TEST_NAMED: %t hipMemset2D-basic
|
||||
* TEST_NAMED: %t hipMemset2D-dim1 --width2D 10 --height2D 10 --memsetWidth 4 --memsetHeight 4
|
||||
* TEST_NAMED: %t hipMemset2D-dim2 --width2D 100 --height2D 100 --memsetWidth 20 --memsetHeight 40
|
||||
|
||||
+1
-1
@@ -21,7 +21,7 @@
|
||||
// and also launch hipMemcpyAsync() api on the same stream. This test case simulates the scenario
|
||||
// reported in SWDEV-181598.
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
// and also launch hipMemcpyAsync() api. This test case simulates the scenario
|
||||
// reported in SWDEV-181598.
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
@@ -0,0 +1,27 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include "test_common.h"
|
||||
|
||||
// Calls hipFuncSetSharedMemConfig() twice - once with NULL and once with the
// address of a local, uninitialized hipSharedMemConfig_t - and hands each
// result to HIP_PRINT_STATUS.
// NOTE(review): HIP_PRINT_STATUS is defined outside this view; presumably it
// reports the returned status - confirm against test_common.h.
int main() {
|
||||
hipSharedMemConfig_t config;
|
||||
HIP_PRINT_STATUS(hipFuncSetSharedMemConfig(NULL));
|
||||
HIP_PRINT_STATUS(hipFuncSetSharedMemConfig(&config));
|
||||
}
|
||||
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Ссылка в новой задаче
Block a user