Revert "Revert "Merge branch 'amd-master-next' into amd-npi-next""

This reverts commit 28b17d3dbd.

Reason for revert: <INSERT REASONING HERE>

Change-Id: I92ceb171e31026ed1864704cef2fc1497b883ef9


[ROCm/hip commit: ad2d55c144]
Этот коммит содержится в:
Vladislav Sytchenko
2020-10-05 13:20:58 -04:00
родитель 28b17d3dbd
Коммит e4caaa2a77
111 изменённых файлов: 6800 добавлений и 753 удалений
+45 -7
Просмотреть файл
@@ -8,10 +8,15 @@ set(BUILD_SHARED_LIBS ON CACHE BOOL "Build shared library (.so) or static lib (
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
if(NOT ${BUILD_SHARED_LIBS} AND NOT DEFINED ENABLE_HIP_PCH)
set(ENABLE_HIP_PCH ON CACHE BOOL "enable/disable pre-compiled hip headers")
if(NOT DEFINED __HIP_ENABLE_PCH)
set(__HIP_ENABLE_PCH ON CACHE BOOL "enable/disable pre-compiled hip headers")
endif()
if(${__HIP_ENABLE_PCH})
set(_pchStatus 1)
else()
set(_pchStatus 0)
endif()
#############################
# Options
#############################
@@ -80,8 +85,8 @@ if(GIT_FOUND)
set(HIP_VERSION_PATCH ${HIP_VERSION_GITDATE}-${HIP_VERSION_GITHASH})
if(DEFINED ENV{ROCM_BUILD_ID})
set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}-$ENV{ROCM_BUILD_ID}-${HIP_VERSION_GITHASH})
if(DEFINED ENV{ROCM_LIBPATCH_VERSION})
set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}.$ENV{ROCM_LIBPATCH_VERSION})
else()
set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}-${HIP_VERSION_GITHASH})
endif()
@@ -90,6 +95,36 @@ else()
set(HIP_PACKAGING_VERSION_PATCH "0")
endif()
## Debian package specific variables
# Release tag for the .deb package: start from "local" and let the
# CPACK_DEBIAN_PACKAGE_RELEASE environment variable override it when defined.
set(CPACK_DEBIAN_PACKAGE_RELEASE "local")
if(DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE})
  set(CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE})
endif()
message("Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}")

## RPM package specific variables
# Same scheme for the .rpm package release tag.
set(CPACK_RPM_PACKAGE_RELEASE "local")
if(DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE})
  set(CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE})
endif()

## 'dist' breaks manual builds on debian systems due to empty Provides
# Ask rpm to expand %{?dist}; the macro is appended to the release tag only
# when rpm ran successfully and the expansion is non-empty.
execute_process(
  COMMAND rpm --eval %{?dist}
  RESULT_VARIABLE PROC_RESULT
  OUTPUT_VARIABLE EVAL_RESULT
  OUTPUT_STRIP_TRAILING_WHITESPACE)
if(PROC_RESULT EQUAL "0")
  if(NOT EVAL_RESULT STREQUAL "")
    string(APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}")
  endif()
endif()
message("CPACK_RPM_PACKAGE_RELEASE: ${CPACK_RPM_PACKAGE_RELEASE}")
add_to_config(_versionInfo HIP_PACKAGING_VERSION_PATCH)
add_to_config(_versionInfo CPACK_DEBIAN_PACKAGE_RELEASE)
add_to_config(_versionInfo CPACK_RPM_PACKAGE_RELEASE)
add_to_config(_versionInfo HIP_VERSION_MAJOR)
add_to_config(_versionInfo HIP_VERSION_MINOR)
add_to_config(_versionInfo HIP_VERSION_PATCH)
@@ -102,7 +137,6 @@ else ()
set (HIP_LIB_VERSION_PATCH ${HIP_VERSION_PATCH})
endif ()
set (HIP_LIB_VERSION_STRING "${HIP_LIB_VERSION_MAJOR}.${HIP_LIB_VERSION_MINOR}.${HIP_LIB_VERSION_PATCH}")
if (DEFINED ENV{ROCM_RPATH})
set (CMAKE_INSTALL_RPATH "$ENV{ROCM_RPATH}")
set (CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
@@ -456,6 +490,7 @@ set(_versionInfoHeader
#define HIP_VERSION_MINOR ${HIP_VERSION_MINOR}
#define HIP_VERSION_PATCH ${HIP_VERSION_GITDATE}
#define HIP_VERSION (HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR)\n
#define __HIP_HAS_GET_PCH ${_pchStatus}\n
#endif\n
")
file(WRITE "${PROJECT_BINARY_DIR}/include/hip/hip_version.h" ${_versionInfoHeader})
@@ -669,8 +704,11 @@ endif()
# Testing steps
#############################
# Target: test
set(HIP_ROOT_DIR ${CMAKE_INSTALL_PREFIX})
set(HIP_ROOT_DIR ${CMAKE_CURRENT_BINARY_DIR})
set(HIP_SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR})
if(HIP_PLATFORM STREQUAL "nvcc")
execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/include" "${CMAKE_CURRENT_BINARY_DIR}/include" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
endif()
execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/cmake" "${HIP_ROOT_DIR}/cmake" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
if(${RUN_HIT} EQUAL 0)
execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/bin" "${HIP_ROOT_DIR}/bin" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
@@ -713,7 +751,7 @@ endif()
#############################
# Target: clang
if(HIP_HIPCC_EXECUTABLE)
add_custom_target(analyze
add_custom_target(analyze
COMMAND ${HIP_HIPCC_EXECUTABLE} -fvisibility=hidden -fvisibility-inlines-hidden --analyze --analyzer-outputtext -isystem /opt/rocm/include ${HIP_HCC_BUILD_FLAGS} -Wno-unused-command-line-argument -I/opt/rocm/include -c src/*.cpp -Iinclude/ -I./
WORKING_DIRECTORY ${HIP_SRC_PATH})
if(CPPCHECK_EXE)
+48 -56
Просмотреть файл
@@ -1,15 +1,15 @@
# Contributor Guidelines
# Contributor Guidelines
## Make Tips
When building HIP, you will likely want to build and install to a local user-accessible directory (rather than /opt/rocm).
This can be easily be done by setting the -DCMAKE_INSTALL_PREFIX variable when running cmake. Typical use case is to
When building HIP, you will likely want to build and install to a local user-accessible directory (rather than /opt/rocm).
This can easily be done by setting the -DCMAKE_INSTALL_PREFIX variable when running cmake. A typical use case is to
set CMAKE_INSTALL_PREFIX to your HIP git root, and then ensure HIP_PATH points to this directory. For example
```
cmake .. -DCMAKE_INSTALL_PREFIX=..
make install
export HIP_PATH=
export HIP_PATH=
```
After making HIP, don't forget the "make install" step !
@@ -21,118 +21,110 @@ After making HIP, don't forget the "make install" step !
- Add a translation to the hipify-clang tool ; many examples abound.
- For stat tracking purposes, place the API into an appropriate stat category ("dev", "mem", "stream", etc).
- Add an inlined NVCC implementation for the function in include/hip/nvcc_detail/hip_runtime_api.h.
- These are typically headers
- Add an HCC definition and Doxygen comments for the function in include/hcc_detail/hip_runtime_api.h
- Source implementation typically go in src/hcc_detail/hip_hcc.cpp. The implementation may involve
calls to HCC runtime or HSA runtime, or interact with other pieces of the HIP runtime (ie for
hipStream_t).
- These are typically headers
- Add an HIP_ROCclr definition and Doxygen comments for the function in include/hcc_detail/hip_runtime_api.h
- Source implementations typically go in hip/rocclr/hip_*.cpp. The implementation involves calls to the HIP runtime (e.g. for hipStream_t).
#### Testing HCC version
In some cases new HIP features are tied to specified releases of HCC, and it can be useful to determine at compile-time
if the current HCC compiler is sufficiently new enough to support the desired feature. The `__hcc_workweek__` compiler
define is a monotonically increasing integer value that combines the year + workweek + day-of-week (0-6, Sunday is 0)
(ie 15403, 16014, etc).
The granularity is one day, so __hcc_workweek__ can only be used to distinguish compiler builds that are at least one day apart.
## Check HIP-Clang version
In some cases new HIP-Clang features are tied to specific releases, and it can be useful to check that the current version is new enough to support the desired feature.
HIP runtime version
```
#ifdef __hcc_workweek_ > 16014
// use cool new HCC feature here
#endif
> cat /opt/rocm/hip/bin/.hipVersion
# Auto-generated by cmake
HIP_VERSION_MAJOR=3
HIP_VERSION_MINOR=9
HIP_VERSION_PATCH=20345-519ef3f2
```
Additionally, hcc binary can print the work-week to stdout: ("16014" in the version info below.)4
HIP-Clang compiler version
```
> /opt/rocm/hcc/bin/hcc -v
HCC clang version 3.5.0 (based on HCC 0.8.16014-81f8a3f-f155163-5a1009a LLVM 3.5.0svn)
$ /opt/rocm/llvm/bin/clang -v
clang version 11.0.0 (/src/external/llvm-project/clang 075fedd3fd2f4d9d8cca79d0cd51f64c5ef21432)
Target: x86_64-unknown-linux-gnu
Thread model: posix
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8.4
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.9
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.9.1
Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8
InstalledDir: /opt/rocm/llvm/bin
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7.5.0
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/8
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/9
Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/9
Candidate multilib: .;@m64
Candidate multilib: 32;@m32
Candidate multilib: x32;@mx32
Selected multilib: .;@m64
```
The unix `date` command can print the HCC-format work-week for a specific date , ie:
```
> date --utc +%y%U%w -d 2015-11-09
15451
```
## Unit Testing Environment
HIP includes unit tests in the tests/src directory.
HIP includes unit tests in the tests/src directory.
When adding a new HIP feature, add a new unit test as well.
See [tests/README.md](README.md) for more information.
## Development Flow
It is recommended that developers set the flag HIP_BUILD_LOCAL=1 so that the unit testing environment automatically rebuilds libhip_hcc.a and the tests when a change is made to the HIP source.
Directed tests provide a great place to develop new features alongside the associated test.
Directed tests provide a great place to develop new features alongside the associated test.
For applications and benchmarks outside the directed test environment, developments should use a two-step development flow:
- #1. Compile, link, and install HCC. See [Installation](README.md#Installation) notes.
- #2. Relink the target application to include changes in the libhip_hcc.a file.
- #1. Compile, link, and install HIP/ROCclr. See [Installation](README.md#Installation) notes.
- #2. Relink the target application to include changes in HIP runtime file.
## Environment Variables
- **HIP_PATH** : Location of HIP include, src, bin, lib directories.
- **HCC_HOME** : Path to HCC compiler. Default /opt/rocm/hcc.
- **HIP_PATH** : Location of HIP include, src, bin, lib directories.
- **HCC_ROCCLR_HOME** : Path to HIP/ROCclr directory, used on AMD platforms. Default /opt/rocm/rocclr.
- **HSA_PATH** : Path to HSA include, lib. Default /opt/rocm/hsa.
- **CUDA_PATH** : On nvcc systems, this points to the root of the CUDA installation.
### Contribution guidelines ###
## Contribution guidelines ##
Features (ie functions, classes, types) defined in hip*.h should resemble CUDA APIs.
The HIP interface is designed to be very familiar for CUDA programmers.
Differences or limitations of HIP APIs as compared to CUDA APIs should be clearly documented and described.
Differences or limitations of HIP APIs as compared to CUDA APIs should be clearly documented and described.
## Coding Guidelines (in brief)
### Coding Guidelines (in brief)
- Code Indentation:
- Tabs should be expanded to spaces.
- Use 4 spaces indentation.
- Capitalization and Naming
- Prefer camelCase for HIP interfaces and internal symbols. Note HCC uses _ for separator.
- Prefer camelCase for HIP interfaces and internal symbols. Note HCC uses _ for separator.
This guideline is not yet consistently followed in HIP code - eventual compliance is aspirational.
- Member variables should begin with a leading "_". This allows them to be easily distinguished from other variables or functions.
- {} placement
- For functions, the opening { should be placed on a new line.
- For if/else blocks, the opening { is placed on the same line as the if/else. Use a space to separate {/} from if/else. Example:
'''
if (foo) {
doFoo()
} else {
doFoo()
} else {
doFooElse();
}
'''
- namespace should be on same line as { and separated by a space.
- Single-line if statement should still use {/} pair (even though C++ does not require).
- Miscellaneous
- All references in function parameter lists should be const.
- All references in function parameter lists should be const.
- "ihip" = internal hip structures. These should not be exposed through the HIP API.
- Keyword TODO refers to a note that should be addressed in long-term. Could be style issue, software architecture, or known bugs.
- FIXME refers to a short-term bug that needs to be addressed.
- HIP_INIT_API() should be placed at the start of each top-level HIP API. This function will make sure the HIP runtime is initialized,
and also constructs an appropriate API string for tracing and CodeXL marker tracing. The arguments to HIP_INIT_API should match
those of the parent function.
- ihipLogStatus should only be called from top-level HIP APIs,and should be called to log and return the error code. The error code
those of the parent function.
- ihipLogStatus should only be called from top-level HIP APIs, and should be called to log and return the error code. The error code
is used by the GetLastError and PeekLastError functions - if a HIP API simply returns, then the error will not be logged correctly.
- All HIP environment variables should begin with the keyword HIP_
Environment variables should be long enough to describe their purpose but short enough so they can be remembered - perhaps 10-20 characters, with 3-4 parts separated by underscores.
To see the list of current environment variables, along with their values, set HIP_PRINT_ENV and run any hip applications on ROCm platform .
HIPCC or other tools may support additional environment variables which should follow the above convention.
HIPCC or other tools may support additional environment variables which should follow the above convention.
#### Presubmit Testing:
Before checking in or submitting a pull request, run all directed tests (see tests/README.md) and all Rodinia tests.
### Presubmit Testing:
Before checking in or submitting a pull request, run all directed tests (see tests/README.md) and all Rodinia tests.
Ensure pass results match starting point:
```shell
@@ -141,13 +133,13 @@ Ensure pass results match starting point:
```
#### Checkin messages
### Checkin messages
Follow existing best practice for writing a good Git commit message. Some tips:
http://chris.beams.io/posts/git-commit/
https://robots.thoughtbot.com/5-useful-tips-for-a-better-commit-message
In particular :
- Use imperative voice, ie "Fix this bug", "Refactor the XYZ routine", "Update the doc".
In particular :
- Use imperative voice, ie "Fix this bug", "Refactor the XYZ routine", "Update the doc".
Not : "Fixing the bug", "Fixed the bug", "Bug fix", etc.
- Subject should summarize the commit. Do not end subject with a period. Use a blank line
after the subject.
+4 -5
Просмотреть файл
@@ -1,8 +1,7 @@
#!/bin/bash
#set -x
ROCM_PATH=${ROCM_PATH:-/opt/rocm}
LLVM_DIR="$1/../../../"
tmp=/tmp/hip_pch.$$
mkdir -p $tmp
@@ -47,12 +46,12 @@ __hip_pch_size:
.long __hip_pch_size - __hip_pch
EOF
$ROCM_PATH/llvm/bin/clang -O3 -c -std=c++17 -isystem /opt/rocm/llvm/lib/clang/11.0.0/include/.. -isystem /opt/rocm/include -nogpulib --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch.cui
$LLVM_DIR/bin/clang -O3 -c -std=c++17 -isystem $LLVM_DIR/lib/clang/11.0.0/include/.. -isystem /opt/rocm/include -nogpulib --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch.cui
cat $tmp/hip_macros.h >> $tmp/pch.cui
$ROCM_PATH/llvm/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip.pch -x hip-cpp-output - <$tmp/pch.cui
$LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip.pch -x hip-cpp-output - <$tmp/pch.cui
$ROCM_PATH/llvm/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj
$LLVM_DIR/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj
rm -rf $tmp
-36
Просмотреть файл
@@ -1,36 +0,0 @@
#!/bin/bash
#set -x
cat >/tmp/hip_macros.h <<EOF
#define __device__ __attribute__((device))
#define __host__ __attribute__((host))
#define __global__ __attribute__((global))
#define __constant__ __attribute__((constant))
#define __shared__ __attribute__((shared))
#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
#define select_impl_(_1, _2, impl_, ...) impl_
#define __launch_bounds__(...) \
select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
// Macro to replace extern __shared__ declarations
// to local variable definitions
#define HIP_DYNAMIC_SHARED(type, var) \
type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();
EOF
cat >/tmp/hip_pch.h <<EOF
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
EOF
/opt/rocm/llvm/bin/clang -O3 -c -std=c++17 -isystem /opt/rocm/llvm/lib/clang/11.0.0/include/.. -isystem /opt/rocm/include -nogpulib --cuda-device-only -x hip /tmp/hip_pch.h -E >/tmp/pch.cui
cat /tmp/hip_macros.h >> /tmp/pch.cui
/opt/rocm/llvm/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o /tmp/hip.pch -x hip-cpp-output - </tmp/pch.cui
+2 -1
Просмотреть файл
@@ -803,7 +803,8 @@ if ($needHipHcc) {
if ($linkType eq 0) {
substr($HIPLDFLAGS,0,0) = " $HIP_LIB_PATH/libamdhip64.a " ;
} else {
substr($HIPLDFLAGS,0,0) = " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib $HIP_LIB_PATH/libamdhip64.so ";
# Currently in ROCm on CentOS, some of the libraries are in the lib64 folder and the rest are in the lib folder.
substr($HIPLDFLAGS,0,0) = " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib:$ROCM_PATH/lib64 $HIP_LIB_PATH/libamdhip64.so ";
}
}
+1 -1
Просмотреть файл
@@ -247,4 +247,4 @@ The workaround is to explicitly add the keyword of "static" before any functions
Product of block.x, block.y, and block.z should be less than 1024.
### Are __shfl_*_sync functions supported on HIP platform?
__shfl_*_sync is not supported on HIP but for nvcc path CUDA 9.0 and above all shuffle calls get redirected to it's sync version.
__shfl_*_sync is not supported on HIP, but on the nvcc path with CUDA 9.0 and above all shuffle calls get redirected to its sync version.
+15 -13
Просмотреть файл
@@ -54,7 +54,18 @@ set_and_check( hip_BIN_INSTALL_DIR "@PACKAGE_BIN_INSTALL_DIR@" )
set_and_check(hip_HIPCC_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipcc")
set_and_check(hip_HIPCONFIG_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipconfig")
# set a default path for ROCM_PATH
if(NOT DEFINED ROCM_PATH)
set(ROCM_PATH /opt/rocm)
endif()
# If HIP is not installed under ROCm, this is needed to find HSA, assuming HSA is under ROCm
if(DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "$ENV{ROCM_PATH}")
endif()
if(HIP_COMPILER STREQUAL "clang")
set(HIP_CLANG_ROOT "${ROCM_PATH}/llvm")
if(NOT HIP_CXX_COMPILER)
set(HIP_CXX_COMPILER ${CMAKE_CXX_COMPILER})
endif()
@@ -62,16 +73,12 @@ if(HIP_COMPILER STREQUAL "clang")
execute_process(COMMAND ${HIP_CXX_COMPILER} --version
OUTPUT_STRIP_TRAILING_WHITESPACE
OUTPUT_VARIABLE HIP_CLANG_CXX_COMPILER_VERSION_OUTPUT)
if(HIP_CLANG_CXX_COMPILER_VERSION_OUTPUT MATCHES "InstalledDir:[\t\r\n][\t\r\n]*([^\t\r\n])")
set(HIP_CLANG_ROOT ${CMAKE_MATCH_1})
else()
set(HIP_CLANG_ROOT /opt/rocm/llvm)
if(HIP_CLANG_CXX_COMPILER_VERSION_OUTPUT MATCHES "InstalledDir:[ \t]*([^\n]*)")
get_filename_component(HIP_CLANG_ROOT "${CMAKE_MATCH_1}" DIRECTORY)
endif()
elseif (HIP_CXX_COMPILER MATCHES ".*clang\\+\\+")
get_filename_component(HIP_CLANG_ROOT "${HIP_CXX_COMPILER}" PATH)
get_filename_component(HIP_CLANG_ROOT "${HIP_CLANG_ROOT}" PATH)
else()
set(HIP_CLANG_ROOT /opt/rocm/llvm)
get_filename_component(HIP_CLANG_ROOT "${HIP_CXX_COMPILER}" DIRECTORY)
get_filename_component(HIP_CLANG_ROOT "${HIP_CLANG_ROOT}" DIRECTORY)
endif()
file(GLOB HIP_CLANG_INCLUDE_SEARCH_PATHS ${HIP_CLANG_ROOT}/lib/clang/*/include)
find_path(HIP_CLANG_INCLUDE_PATH stddef.h
@@ -89,11 +96,6 @@ find_dependency(amd_comgr)
include( "${CMAKE_CURRENT_LIST_DIR}/hip-targets.cmake" )
#If HIP isnot installed under ROCm, need this to find HSA assuming HSA is under ROCm
if( DEFINED ENV{ROCM_PATH} )
set(ROCM_PATH "$ENV{ROCM_PATH}")
endif()
# Using find_dependency to locate the dependencies for the packages
#This makes the cmake generated file xxxx-targets to supply the linker libraries
# without worrying other transitive dependencies
+136 -2
Просмотреть файл
@@ -365,6 +365,25 @@ long __shfl(long var, int src_lane, int width = warpSize)
}
// __shfl overload for unsigned long.
// Non-MSVC: unsigned long is asserted to be 64 bits wide. The value is split
// into two unsigned int words, each word is passed through __shfl with the
// same src_lane/width, and the results are recombined (word 0 -> low 32 bits,
// word 1 -> high 32 bits).
// MSVC: unsigned long is asserted to have the size of unsigned int, so a
// single __shfl call on the value cast to unsigned int is used.
__device__
inline
unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
#ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int words[2];
    __builtin_memcpy(words, &var, sizeof(words));

    const unsigned int lo = __shfl(words[0], src_lane, width);
    const unsigned int hi = __shfl(words[1], src_lane, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    unsigned long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
#else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    const unsigned int narrowed = static_cast<unsigned int>(var);
    return static_cast<unsigned long>(__shfl(narrowed, src_lane, width));
#endif
}
__device__
inline
long long __shfl(long long var, int src_lane, int width = warpSize)
{
static_assert(sizeof(long long) == 2 * sizeof(int), "");
@@ -378,8 +397,22 @@ long long __shfl(long long var, int src_lane, int width = warpSize)
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
// __shfl overload for unsigned long long.
// The value is asserted to be 64 bits wide, split into two unsigned int
// words, each word is passed through __shfl with the same src_lane/width, and
// the results are recombined (word 0 -> low 32 bits, word 1 -> high 32 bits).
__device__
inline
unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
    // A stray `__device__` specifier used to sit here, in the middle of the
    // function body, which is not valid C++; it has been removed.
    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
int self = __lane_id();
@@ -435,6 +468,28 @@ long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
#endif
}
// __shfl_up overload for unsigned long.
// Non-MSVC: unsigned long is asserted to be 64 bits wide. The value is split
// into two unsigned int words, each word is passed through __shfl_up with the
// same lane_delta/width, and the results are recombined (word 0 -> low 32
// bits, word 1 -> high 32 bits).
// MSVC: unsigned long is asserted to have the size of unsigned int, so a
// single __shfl_up call on the value cast to unsigned int is used.
__device__
inline
unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
#ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int words[2];
    __builtin_memcpy(words, &var, sizeof(words));

    const unsigned int lo = __shfl_up(words[0], lane_delta, width);
    const unsigned int hi = __shfl_up(words[1], lane_delta, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    unsigned long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
#else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    const unsigned int narrowed = static_cast<unsigned int>(var);
    return static_cast<unsigned long>(__shfl_up(narrowed, lane_delta, width));
#endif
}
__device__
inline
long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
@@ -449,6 +504,20 @@ long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize
return tmp1;
}
// __shfl_up overload for unsigned long long.
// The value is asserted to be 64 bits wide, split into two unsigned int
// words, each word is passed through __shfl_up with the same lane_delta/width,
// and the results are recombined (word 0 -> low 32 bits, word 1 -> high 32
// bits).
__device__
inline
unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int words[2];
    __builtin_memcpy(words, &var, sizeof(words));

    const unsigned int lo = __shfl_up(words[0], lane_delta, width);
    const unsigned int hi = __shfl_up(words[1], lane_delta, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    unsigned long long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
__device__
inline
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
@@ -507,6 +576,26 @@ long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
}
// __shfl_down overload for unsigned long.
// Non-MSVC: unsigned long is asserted to be 64 bits wide. The value is split
// into two unsigned int words, each word is passed through __shfl_down with
// the same lane_delta/width, and the results are recombined (word 0 -> low 32
// bits, word 1 -> high 32 bits).
// MSVC: unsigned long is asserted to have the size of unsigned int, so a
// single __shfl_down call on the value cast to unsigned int is used.
__device__
inline
unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
#ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int words[2];
    __builtin_memcpy(words, &var, sizeof(words));

    const unsigned int lo = __shfl_down(words[0], lane_delta, width);
    const unsigned int hi = __shfl_down(words[1], lane_delta, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    unsigned long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
#else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    const unsigned int narrowed = static_cast<unsigned int>(var);
    return static_cast<unsigned long>(__shfl_down(narrowed, lane_delta, width));
#endif
}
__device__
inline
long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
{
static_assert(sizeof(long long) == 2 * sizeof(int), "");
@@ -518,6 +607,19 @@ long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSi
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
// __shfl_down overload for unsigned long long.
// The value is asserted to be 64 bits wide, split into two unsigned int
// words, each word is passed through __shfl_down with the same
// lane_delta/width, and the results are recombined (word 0 -> low 32 bits,
// word 1 -> high 32 bits).
__device__
inline
unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int words[2];
    __builtin_memcpy(words, &var, sizeof(words));

    const unsigned int lo = __shfl_down(words[0], lane_delta, width);
    const unsigned int hi = __shfl_down(words[1], lane_delta, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    unsigned long long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
__device__
inline
@@ -577,6 +679,26 @@ long __shfl_xor(long var, int lane_mask, int width = warpSize)
}
// __shfl_xor overload for unsigned long.
// Non-MSVC: unsigned long is asserted to be 64 bits wide. The value is split
// into two unsigned int words, each word is passed through __shfl_xor with
// the same lane_mask/width, and the results are recombined (word 0 -> low 32
// bits, word 1 -> high 32 bits).
// MSVC: unsigned long is asserted to have the size of unsigned int, so a
// single __shfl_xor call on the value cast to unsigned int is used.
__device__
inline
unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
{
#ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int words[2];
    __builtin_memcpy(words, &var, sizeof(words));

    const unsigned int lo = __shfl_xor(words[0], lane_mask, width);
    const unsigned int hi = __shfl_xor(words[1], lane_mask, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    unsigned long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
#else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    const unsigned int narrowed = static_cast<unsigned int>(var);
    return static_cast<unsigned long>(__shfl_xor(narrowed, lane_mask, width));
#endif
}
__device__
inline
long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
{
static_assert(sizeof(long long) == 2 * sizeof(int), "");
@@ -588,7 +710,19 @@ long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
// __shfl_xor overload for unsigned long long.
// The value is asserted to be 64 bits wide, split into two unsigned int
// words, each word is passed through __shfl_xor with the same lane_mask/width,
// and the results are recombined (word 0 -> low 32 bits, word 1 -> high 32
// bits).
__device__
inline
unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
{
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int words[2];
    __builtin_memcpy(words, &var, sizeof(words));

    const unsigned int lo = __shfl_xor(words[0], lane_mask, width);
    const unsigned int hi = __shfl_xor(words[1], lane_mask, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    unsigned long long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00
+16
Просмотреть файл
@@ -487,6 +487,22 @@ struct __HIP_Coordinates {
#endif
};
// Out-of-class (namespace-scope) definitions of the constexpr static data
// members x, y and z of the __HIP_Coordinates<F> class template.
// On every compiler except MSVC each definition is also marked
// __attribute__((weak)).
// NOTE(review): presumably these definitions exist so the members can be
// ODR-used before C++17 inline variables — confirm against the class definition.

// Definition of __HIP_Coordinates<F>::x (type __HIP_Coordinates<F>::X).
template <typename F>
#if !defined(_MSC_VER)
__attribute__((weak))
#endif
constexpr typename __HIP_Coordinates<F>::X __HIP_Coordinates<F>::x;
// Definition of __HIP_Coordinates<F>::y (type __HIP_Coordinates<F>::Y).
template <typename F>
#if !defined(_MSC_VER)
__attribute__((weak))
#endif
constexpr typename __HIP_Coordinates<F>::Y __HIP_Coordinates<F>::y;
// Definition of __HIP_Coordinates<F>::z (type __HIP_Coordinates<F>::Z).
template <typename F>
#if !defined(_MSC_VER)
__attribute__((weak))
#endif
constexpr typename __HIP_Coordinates<F>::Z __HIP_Coordinates<F>::z;
extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_size(uint);
inline
__device__
+8 -5
Просмотреть файл
@@ -345,13 +345,16 @@ typedef struct hipLaunchParams_t {
hipStream_t stream; ///< Stream identifier
} hipLaunchParams;
// Pre-Compiled header for online compilation
#ifdef ENABLE_HIP_PCH
extern const char* __hip_pch;
extern unsigned __hip_pch_size;
void __hipGetPCH(const char** pch, unsigned int*size);
#if __HIP_HAS_GET_PCH
/**
* Internal use only. This API may change in the future
* Pre-Compiled header for online compilation
*
*/
void __hipGetPCH(const char** pch, unsigned int*size);
#endif
// Doxygen end group GlobalDefs
/** @} */
+6 -3
Просмотреть файл
@@ -28,14 +28,17 @@ THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H
#define HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H
#define HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H
#include <hip/hip_version.h>
#include <hip/hip_common.h>
#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
#if __cplusplus
#if __cplusplus && defined(__clang__) && defined(__HIP__)
#include <hip/hcc_detail/hip_cooperative_groups.h>
#endif
#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
#include <cooperative_groups.h>
#include <hip/nvcc_detail/hip_cooperative_groups.h>
#else
#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
#endif
+1
Просмотреть файл
@@ -32,6 +32,7 @@ THE SOFTWARE.
#include <string.h> // for getDeviceProp
#include <hip/hip_version.h>
#include <hip/hip_common.h>
enum {
+12
Просмотреть файл
@@ -0,0 +1,12 @@
#ifndef HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
#define HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
// Include CUDA headers
#include <cuda_runtime.h>
#include <cooperative_groups.h>
// Include HIP wrapper headers around CUDA
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#endif // HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
+2 -2
Просмотреть файл
@@ -104,13 +104,13 @@ typedef int hipLaunchParm;
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
#ifdef __HIP_DEVICE_COMPILE__
// Device-side trap helper: expands to a block that executes the "trap;"
// instruction via inline asm.
// NOTE(review): the trailing underscore presumably avoids redefining the
// standard abort() — confirm.
#define abort_() \
    { asm("trap;"); }
// Replace any previously defined assert with a version that calls abort_()
// when the condition is false. COND is parenthesized so that compound
// expressions such as `assert(a == b)` are negated as a whole instead of
// expanding to `!a == b`.
#undef assert
#define assert(COND) \
    { \
        if (!(COND)) { \
            abort_(); \
        } \
    }
#endif
+1
Просмотреть файл
@@ -26,6 +26,7 @@ THE SOFTWARE.
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <cuda_profiler_api.h>
#include <cuda_fp16.h>
#ifdef __cplusplus
extern "C" {
+2
Просмотреть файл
@@ -20,6 +20,7 @@ target_include_directories(lpl
target_compile_options(lpl PUBLIC -Wall)
target_link_libraries(lpl PUBLIC pthread)
add_custom_command(TARGET lpl POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/lpl ${PROJECT_BINARY_DIR}/bin/lpl)
install(TARGETS lpl RUNTIME DESTINATION bin)
#-------------------------------------LPL--------------------------------------#
@@ -43,6 +44,7 @@ find_package(hsa-runtime64 REQUIRED CONFIG
target_link_libraries(ca PUBLIC hsa-runtime64::hsa-runtime64 )
target_compile_options(ca PUBLIC -DDISABLE_REDUCED_GPU_BLOB_COPY -Wall)
add_custom_command(TARGET ca POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/ca ${PROJECT_BINARY_DIR}/bin/ca)
install(TARGETS ca RUNTIME DESTINATION bin)
#-------------------------------------CA---------------------------------------#
+6 -5
Просмотреть файл
@@ -21,22 +21,23 @@ set(CPACK_PACKAGE_NAME "hip-base")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [BASE]")
set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
set(CPACK_PACKAGE_VERSION_PATCH @HIP_PACKAGING_VERSION_PATCH@)
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
set(CPACK_GENERATOR "TGZ;DEB;RPM")
set(CPACK_BINARY_DEB "ON")
set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "perl (>= 5.0),libfile-which-perl")
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-base")
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_base")
set(CPACK_BINARY_RPM "ON")
set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
+8 -7
Просмотреть файл
@@ -24,25 +24,26 @@ set(CPACK_PACKAGE_NAME "hip-doc")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [DOCUMENTATION]")
set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
set(CPACK_PACKAGE_VERSION_PATCH @HIP_PACKAGING_VERSION_PATCH@)
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
set(CPACK_GENERATOR "TGZ;DEB;RPM")
set(CPACK_BINARY_DEB "ON")
set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})")
set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE})")
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-doc")
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_doc")
set(CPACK_BINARY_RPM "ON")
set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}")
set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}")
set(CPACK_RPM_PACKAGE_OBSOLETES "hip_doc")
set(CPACK_RPM_PACKAGE_CONFLICTS "hip_doc")
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
+9 -4
Просмотреть файл
@@ -28,24 +28,29 @@ endif()
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [HCC]")
set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
set(CPACK_GENERATOR "TGZ;DEB;RPM")
set(CPACK_BINARY_DEB "ON")
set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)")
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc")
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_hcc")
set(CPACK_BINARY_RPM "ON")
set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, comgr >= 1.1")
set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, comgr >= 1.1")
set(CPACK_RPM_PACKAGE_OBSOLETES "hip_hcc")
set(CPACK_RPM_PACKAGE_CONFLICTS "hip_hcc")
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
+7 -6
Просмотреть файл
@@ -10,28 +10,29 @@ set(CPACK_PACKAGE_NAME "hip-nvcc")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [NVCC]")
set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
set(CPACK_GENERATOR "TGZ;DEB;RPM")
set(CPACK_BINARY_DEB "ON")
set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), cuda (>= 7.5)")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), cuda (>= 7.5)")
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-nvcc")
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_nvcc")
set(CPACK_BINARY_RPM "ON")
set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, cuda >= 7.5")
set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, cuda >= 7.5")
set(CPACK_RPM_PACKAGE_OBSOLETES "hip_nvcc")
set(CPACK_RPM_PACKAGE_CONFLICTS "hip_nvcc")
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
+7 -6
Просмотреть файл
@@ -33,27 +33,28 @@ set(HCC_PACKAGE_NAME "rocclr")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [ROCClr]")
set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
set(CPACK_GENERATOR "TGZ;DEB;RPM")
set(CPACK_BINARY_DEB "ON")
set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocminfo, hip-base (= ${CPACK_PACKAGE_VERSION}), comgr (>= 1.1), llvm-amdgpu")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocminfo, hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), comgr (>= 1.1), llvm-amdgpu")
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc (= ${CPACK_PACKAGE_VERSION})")
set(CPACK_BINARY_RPM "ON")
set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocminfo, hip-base = ${HIP_BASE_VERSION}, comgr >= 1.1, llvm-amdgpu")
set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocminfo, hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, comgr >= 1.1, llvm-amdgpu")
set(CPACK_RPM_PACKAGE_PROVIDES "hip-hcc = ${HIP_BASE_VERSION}")
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
set(CPACK_SOURCE_GENERATOR "TGZ")
+7 -6
Просмотреть файл
@@ -12,25 +12,26 @@ set(CPACK_PACKAGE_NAME "hip-samples")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [SAMPLES]")
set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
set(CPACK_GENERATOR "TGZ;DEB;RPM")
set(CPACK_BINARY_DEB "ON")
set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})")
set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE})")
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-samples")
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_samples")
set(CPACK_BINARY_RPM "ON")
set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
set(CPACK_RPM_PACKAGE_REQUIRES "hip-rocclr = ${HIP_BASE_VERSION}")
set(CPACK_RPM_PACKAGE_REQUIRES "hip-rocclr = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}")
set(CPACK_RPM_PACKAGE_OBSOLETES "hip_samples")
set(CPACK_RPM_PACKAGE_CONFLICTS "hip_samples")
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
+25 -5
Просмотреть файл
@@ -96,6 +96,14 @@ find_package(amd_comgr REQUIRED CONFIG
message(STATUS "Code Object Manager found at ${amd_comgr_DIR}.")
find_package(LLVM REQUIRED CONFIG
PATHS
/opt/rocm/llvm
PATH_SUFFIXES
lib/cmake/llvm)
message(STATUS "llvm found at ${LLVM_DIR}.")
add_library(hip64 OBJECT
hip_context.cpp
hip_code_object.cpp
@@ -148,10 +156,9 @@ endif()
# Short-Term solution for pre-compiled headers for online compilation
# Enable pre compiled header
if(${ENABLE_HIP_PCH})
execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_gen_pch.sh")
execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_embed_pch.sh")
add_definitions(-DENABLE_HIP_PCH)
if(${__HIP_ENABLE_PCH})
execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_embed_pch.sh ${LLVM_DIR}")
add_definitions(-D__HIP_ENABLE_PCH)
endif()
# Enable profiling API
@@ -216,7 +223,7 @@ add_library(device INTERFACE)
target_link_libraries(device INTERFACE host)
# Short-Term solution for pre-compiled headers for online compilation
if(${ENABLE_HIP_PCH})
if(${__HIP_ENABLE_PCH})
target_link_libraries(amdhip64 PRIVATE ${CMAKE_BINARY_DIR}/hip_pch.o)
endif()
@@ -227,6 +234,18 @@ endif()
# filename.
if(${BUILD_SHARED_LIBS})
target_link_libraries(amdhip64 PRIVATE amdrocclr_static Threads::Threads dl hsa-runtime64::hsa-runtime64)
add_custom_command(TARGET amdhip64 POST_BUILD COMMAND
${CMAKE_COMMAND} -E create_symlink ${PROJECT_BINARY_DIR}/lib/libamdhip64.so.${HIP_LIB_VERSION_STRING}
${PROJECT_BINARY_DIR}/lib/libhip_hcc.so.${HIP_LIB_VERSION_MAJOR})
add_custom_command(TARGET amdhip64 POST_BUILD COMMAND
${CMAKE_COMMAND} -E create_symlink ${PROJECT_BINARY_DIR}/lib/libhip_hcc.so.${HIP_LIB_VERSION_MAJOR}
${PROJECT_BINARY_DIR}/lib/libhip_hcc.so)
add_custom_command(TARGET amdhip64 POST_BUILD COMMAND
${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/.hipInfo ${PROJECT_BINARY_DIR}/lib/.hipInfo)
add_custom_command(TARGET amdhip64 POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_directory
${PROJECT_SOURCE_DIR}/include ${PROJECT_BINARY_DIR}/include)
INSTALL(PROGRAMS $<TARGET_FILE:amdhip64> DESTINATION lib COMPONENT MAIN)
else()
target_link_libraries(amdhip64 PRIVATE Threads::Threads dl hsa-runtime64::hsa-runtime64 amd_comgr)
@@ -244,6 +263,7 @@ else()
INSTALL(PROGRAMS $<TARGET_FILE:amdhip64> DESTINATION lib COMPONENT MAIN)
endif()
INSTALL(TARGETS amdhip64 host device EXPORT hip-targets DESTINATION ${LIB_INSTALL_DIR})
INSTALL(EXPORT hip-targets DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} NAMESPACE hip::)
-25
Просмотреть файл
@@ -202,19 +202,10 @@ hipError_t DynCO::populateDynGlobalVars() {
return hipErrorSharedObjectSymbolNotFound;
}
if (!dev_program->getUndefinedVarFromCodeObj(&undef_var_names)) {
DevLogPrintfError("Could not get undefined Variables for Module: 0x%x \n", module());
return hipErrorSharedObjectSymbolNotFound;
}
for (auto& elem : var_names) {
vars_.insert(std::make_pair(elem, new Var(elem, Var::DeviceVarKind::DVK_Variable, 0, 0, 0, nullptr)));
}
for (auto& elem : undef_var_names) {
vars_.insert(std::make_pair(elem, new Var(elem, Var::DeviceVarKind::DVK_Texture, 0, 0, 0, nullptr)));
}
return hipSuccess;
}
@@ -377,20 +368,4 @@ hipError_t StatCO::getStatGlobalVar(const void* hostVar, int deviceId, hipDevice
*size_ptr = dvar->size();
return hipSuccess;
}
hipError_t StatCO::getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod,
hipDeviceptr_t* dev_ptr, size_t* size_ptr) {
amd::ScopedLock lock(sclock_);
for (auto& elem : vars_) {
if ((elem.second->name() == hostVar)
&& (elem.second->module(deviceId) == hmod)) {
*dev_ptr = elem.second->device_ptr(deviceId);
*size_ptr = elem.second->device_size(deviceId);
return hipSuccess;
}
}
return hipErrorNotFound;
}
}; //namespace: hip
-2
Просмотреть файл
@@ -118,8 +118,6 @@ public:
hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId);
hipError_t getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr,
size_t* size_ptr);
hipError_t getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod,
hipDeviceptr_t* dev_ptr, size_t* size_ptr);
private:
friend class ::PlatformState;
+1 -1
Просмотреть файл
@@ -155,7 +155,7 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device )
::strncpy(deviceProps.name, info.boardName_, 128);
deviceProps.totalGlobalMem = info.globalMemSize_;
deviceProps.sharedMemPerBlock = info.localMemSizePerCU_;
deviceProps.regsPerBlock = info.availableSGPRs_;
deviceProps.regsPerBlock = info.availableRegistersPerCU_;
deviceProps.warpSize = info.wavefrontWidth_;
deviceProps.maxThreadsPerBlock = info.maxWorkGroupSize_;
deviceProps.maxThreadsDim[0] = info.maxWorkItemSizes_[0];
+5 -2
Просмотреть файл
@@ -12,7 +12,7 @@ FatBinaryDeviceInfo::~FatBinaryDeviceInfo() {
}
FatBinaryInfo::FatBinaryInfo(const char* fname, const void* image)
: fdesc_(-1), fsize_(0), image_(image), uri_(std::string()) {
: fdesc_(amd::Os::FDescInit()), fsize_(0), image_(image), uri_(std::string()) {
guarantee(fname || image);
if (fname != nullptr) {
@@ -41,7 +41,7 @@ FatBinaryInfo::~FatBinaryInfo() {
}
fname_ = std::string();
fdesc_ = -1;
fdesc_ = amd::Os::FDescInit();
fsize_ = 0;
image_ = nullptr;
uri_ = std::string();
@@ -64,6 +64,9 @@ hipError_t FatBinaryInfo::ExtractFatBinary(const std::vector<hip::Device*>& devi
if (!amd::Os::GetFileHandle(fname_.c_str(), &fdesc_, &fsize_)) {
return hipErrorFileNotFound;
}
if (fsize_ == 0) {
return hipErrorInvalidKernelFile;
}
// Extract the code object from file
hip_error = CodeObject::ExtractCodeObjectFromFile(fdesc_, fsize_,
+3 -1
Просмотреть файл
@@ -5,7 +5,9 @@
#include "hip_code_object.hpp"
#include "platform/program.hpp"
#ifdef ENABLE_HIP_PCH
#ifdef __HIP_ENABLE_PCH
extern const char __hip_pch[];
extern unsigned __hip_pch_size;
void __hipGetPCH(const char** pch, unsigned int *size) {
*pch = __hip_pch;
*size = __hip_pch_size;
-5
Просмотреть файл
@@ -95,11 +95,6 @@ public:
hipError_t getStatDeviceVar(DeviceVar** dvar, int deviceId);
void resize_dVar(size_t size) { dVar_.resize(size); }
//Accessor for device_ptrs.
std::string name() const { return name_; }
hipModule_t module(int deviceId) const { return nullptr; }
hipDeviceptr_t device_ptr(int deviceId) const { return dVar_[deviceId]->device_ptr(); }
size_t device_size(int deviceId) const { return dVar_[deviceId]->size(); }
FatBinaryInfo** moduleInfo() { return modules_; };
private:
-2
Просмотреть файл
@@ -252,8 +252,6 @@ extern int ihipGetDevice();
extern hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags);
extern amd::Memory* getMemoryObject(const void* ptr, size_t& offset);
extern amd::Memory* getMemoryObjectWithOffset(const void* ptr, const size_t size);
extern bool CL_CALLBACK getSvarInfo(cl_program program, std::string var_name, void** var_addr,
size_t* var_size);
constexpr bool kOptionChangeable = true;
constexpr bool kNewDevProg = false;
+37 -36
Просмотреть файл
@@ -124,7 +124,7 @@ hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
if (*ptr == nullptr) {
size_t free = 0, total =0;
hipMemGetInfo(&free, &total);
LogPrintfError("Allocation failed : Device memory : required :%u | free :%u | total :%u \n", sizeBytes, free, total);
LogPrintfError("Allocation failed : Device memory : required :%zu | free :%zu | total :%zu \n", sizeBytes, free, total);
return hipErrorOutOfMemory;
}
@@ -202,14 +202,14 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
}
} else {
amd::HostQueue* pQueue = &queue;
if (queueDevice != srcMemory->getContext().devices()[0]) {
if ((srcMemory->getContext().devices()[0] == dstMemory->getContext().devices()[0]) &&
(queueDevice != srcMemory->getContext().devices()[0])) {
pQueue = hip::getNullStream(srcMemory->getContext());
amd::Command* cmd = queue.getLastQueuedCommand(true);
if (cmd != nullptr) {
waitList.push_back(cmd);
}
}
command = new amd::CopyMemoryCommand(*pQueue, CL_COMMAND_COPY_BUFFER, waitList,
*srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes);
}
@@ -1850,18 +1850,27 @@ hipError_t ihipMemset3D(hipPitchedPtr pitchedDevPtr,
hipExtent extent,
hipStream_t stream,
bool isAsync = false) {
if (pitchedDevPtr.pitch == extent.width) {
return ihipMemset(pitchedDevPtr.ptr, value, sizeof(int8_t), extent.width * extent.height * extent.depth, stream, isAsync);
}
// Workaround for cases when pitch > row untill fill kernel will be updated to support pitch.
// Fallback to filling one row at a time.
amd::HostQueue* queue = hip::getQueue(stream);
size_t offset = 0;
amd::Memory* memory = getMemoryObject(pitchedDevPtr.ptr, offset);
auto sizeBytes = extent.width * extent.height * extent.depth;
if (memory == nullptr) {
return hipErrorInvalidValue;
}
if (sizeBytes > memory->getSize()) {
return hipErrorInvalidValue;
}
if (pitchedDevPtr.pitch == extent.width) {
return ihipMemset(pitchedDevPtr.ptr, value, sizeof(int8_t), static_cast<size_t>(sizeBytes), stream, isAsync);
}
// Workaround for cases when pitch > row until fill kernel will be updated to support pitch.
// Fall back to filling one row at a time.
amd::HostQueue* queue = hip::getQueue(stream);
amd::Coord3D origin(offset);
amd::Coord3D region(pitchedDevPtr.xsize, pitchedDevPtr.ysize, extent.depth);
amd::BufferRect rect;
@@ -1870,34 +1879,26 @@ hipError_t ihipMemset3D(hipPitchedPtr pitchedDevPtr,
return hipErrorInvalidValue;
}
if (memory != nullptr) {
std::vector<amd::FillMemoryCommand*> commands;
std::vector<amd::FillMemoryCommand*> commands;
for (size_t slice = 0; slice < extent.depth; slice++) {
for (size_t row = 0; row < extent.height; row++) {
const size_t rowOffset = rect.offset(0, row, slice);
amd::FillMemoryCommand* command = new amd::FillMemoryCommand(*queue,
CL_COMMAND_FILL_BUFFER,
amd::Command::EventWaitList{},
*memory->asBuffer(),
&value,
sizeof(int8_t),
amd::Coord3D{rowOffset, 0, 0},
amd::Coord3D{extent.width, 1, 1});
for (size_t slice = 0; slice < extent.depth; slice++) {
for (size_t row = 0; row < extent.height; row++) {
const size_t rowOffset = rect.offset(0, row, slice);
amd::FillMemoryCommand *command = new amd::FillMemoryCommand(*queue,
CL_COMMAND_FILL_BUFFER, amd::Command::EventWaitList { },
*memory->asBuffer(), &value, sizeof(int8_t), amd::Coord3D { rowOffset,
0, 0 }, amd::Coord3D { extent.width, 1, 1 });
command->enqueue();
commands.push_back(command);
}
command->enqueue();
commands.push_back(command);
}
}
for (auto &command: commands) {
if (!isAsync) {
command->awaitCompletion();
}
command->release();
for (auto &command : commands) {
if (!isAsync) {
command->awaitCompletion();
}
} else {
return hipErrorInvalidValue;
command->release();
}
return hipSuccess;
@@ -2038,7 +2039,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void
memset(attributes, 0, sizeof(hipPointerAttribute_t));
if (memObj != nullptr) {
attributes->memoryType = (CL_MEM_SVM_FINE_GRAIN_BUFFER & memObj->getMemFlags())? hipMemoryTypeHost : hipMemoryTypeDevice;
attributes->memoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & memObj->getMemFlags())? hipMemoryTypeHost : hipMemoryTypeDevice;
if (attributes->memoryType == hipMemoryTypeHost) {
attributes->hostPointer = static_cast<char*>(memObj->getSvmPtr()) + offset;
}
+1 -1
Просмотреть файл
@@ -537,7 +537,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
if (result != hipSuccess) {
break;
}
prevGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z;
prevGridSize += globalWorkSizeX * globalWorkSizeY * globalWorkSizeZ;
}
// Sync the execution streams on all devices
+4
Просмотреть файл
@@ -97,6 +97,10 @@ hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2,
uint32_t* linktype, uint32_t* hopcount) {
HIP_INIT_API(hipExtGetLinkTypeAndHopCount, device1, device2, linktype, hopcount);
if (linktype == nullptr || hopcount == nullptr ||
device1 == device2 || device1 < 0 || device2 < 0) {
HIP_RETURN(hipErrorInvalidValue);
}
// Fill out the list of LinkAttributes
std::vector<amd::Device::LinkAttrType> link_attrs;
link_attrs.push_back(std::make_pair(amd::Device::LinkAttribute::kLinkLinkType, 0));
+13 -28
Просмотреть файл
@@ -80,27 +80,6 @@ extern "C" hip::FatBinaryInfo** __hipRegisterFatBinary(const void* data)
return PlatformState::instance().addFatBinary(fbwrapper->binary);
}
bool PlatformState::getShadowVarInfo(std::string var_name, hipModule_t hmod,
void** var_addr, size_t* var_size) {
amd::ScopedLock lock(lock_);
if (hipSuccess == getDynGlobalVar(var_name.c_str(), ihipGetDevice(), hmod, var_addr, var_size)) {
return true;
}
if (hipSuccess == getStatGlobalVarByName(var_name, ihipGetDevice(), hmod, var_addr, var_size)) {
return true;
}
return false;
}
bool CL_CALLBACK getSvarInfo(cl_program program, std::string var_name, void** var_addr,
size_t* var_size) {
return PlatformState::instance().getShadowVarInfo(var_name, reinterpret_cast<hipModule_t>(program),
var_addr, var_size);
}
extern "C" void __hipRegisterFunction(
hip::FatBinaryInfo** modules,
const void* hostFunction,
@@ -686,11 +665,19 @@ static inline std::uint32_t __convert_float_to_half(float a) noexcept {
return s | v;
}
extern "C" __attribute__((weak)) float __gnu_h2f_ieee(unsigned short h){
extern "C"
#if !defined(_MSC_VER)
__attribute__((weak))
#endif
float __gnu_h2f_ieee(unsigned short h){
return __convert_half_to_float((std::uint32_t) h);
}
extern "C" __attribute__((weak)) unsigned short __gnu_f2h_ieee(float f){
extern "C"
#if !defined(_MSC_VER)
__attribute__((weak))
#endif
unsigned short __gnu_f2h_ieee(float f){
return (unsigned short)__convert_float_to_half(f);
}
@@ -765,6 +752,9 @@ hipError_t PlatformState::getDynFunc(hipFunction_t* hfunc, hipModule_t hmod,
DevLogPrintfError("Cannot find the module: 0x%x", hmod);
return hipErrorNotFound;
}
if (0 == strlen(func_name)) {
return hipErrorNotFound;
}
return it->second->getDynFunc(hfunc, func_name);
}
@@ -868,11 +858,6 @@ hipError_t PlatformState::getStatGlobalVar(const void* hostVar, int deviceId, hi
return statCO_.getStatGlobalVar(hostVar, deviceId, dev_ptr, size_ptr);
}
hipError_t PlatformState::getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod,
hipDeviceptr_t* dev_ptr, size_t* size_ptr) {
return statCO_.getStatGlobalVarByName(hostVar, deviceId, hmod, dev_ptr, size_ptr);
}
void PlatformState::setupArgument(const void *arg, size_t size, size_t offset) {
auto& arguments = execStack_.top().arguments_;
-5
Просмотреть файл
@@ -77,11 +77,6 @@ public:
hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId);
hipError_t getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr,
size_t* size_ptr);
hipError_t getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod,
hipDeviceptr_t* dev_ptr, size_t* size_ptr);
bool getShadowVarInfo(std::string var_name, hipModule_t hmod,
void** var_addr, size_t* var_size);
//Exec Functions
void setupArgument(const void *arg, size_t size, size_t offset);
+20
Просмотреть файл
@@ -0,0 +1,20 @@
# Build script for the bit_extract HIP sample.
# cmake_minimum_required() comes first so that policy defaults are established
# before project() configures the toolchain.
cmake_minimum_required(VERSION 3.10)
project(bit_extract)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip; REQUIRED stops configuration with a clear message when the HIP
# package is missing instead of failing later on the hip::host target.
find_package(hip REQUIRED)
# Set compiler and linker
# NOTE(review): the compiler is overridden after project() has already enabled
# CXX; confirm this late override behaves as intended with your generator.
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
# Create the executable
add_executable(bit_extract bit_extract.cpp)
# Link with HIP (PRIVATE: nothing consumes an executable's link interface)
target_link_libraries(bit_extract PRIVATE hip::host)
+3 -7
Просмотреть файл
@@ -9,19 +9,15 @@ HIPCC=$(HIP_PATH)/bin/hipcc
# Show how to use PLATFORM to specify different options for each compiler:
ifeq (${HIP_PLATFORM}, nvcc)
HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20
HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20
endif
EXE=bit_extract
EXE_STATIC=bit_extract_static
$(EXE): bit_extract.cpp
$(HIPCC) $(HIPCC_FLAGS) $< -o $@
$(EXE_STATIC): bit_extract.cpp
$(HIPCC) -use-staticlib $(HIPCC_FLAGS) $< -o $@
all: $(EXE) $(EXE_STATIC)
all: $(EXE)
clean:
rm -f *.o $(EXE) $(EXE_STATIC)
rm -f *.o $(EXE)
+36
Просмотреть файл
@@ -0,0 +1,36 @@
# Build script for the module_api HIP sample: three host executables plus the
# vcpy_kernel.code code object that the "codeobj" target generates for them.
# cmake_minimum_required() comes first so that policy defaults are established
# before project() configures the toolchain.
cmake_minimum_required(VERSION 3.10)
project(module_api)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip; REQUIRED stops configuration with a clear message when the HIP
# package is missing instead of failing later on the hip::host target.
find_package(hip REQUIRED)
# Set compiler and linker
# NOTE(review): the compiler is overridden after project() has already enabled
# CXX; confirm this late override behaves as intended with your generator.
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
# Create the executables
add_executable(runKernel.hip.out runKernel.cpp)
add_executable(launchKernelHcc.hip.out launchKernelHcc.cpp)
add_executable(defaultDriver.hip.out defaultDriver.cpp)
# Generate code object
# The kernel source is addressed through CMAKE_CURRENT_SOURCE_DIR rather than
# "../", so the rule no longer depends on the build directory being a direct
# child of the source directory. The output is still written to the directory
# the command runs in. VERBATIM keeps argument escaping platform-independent.
add_custom_target(
  codeobj
  ALL
  COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ${CMAKE_CURRENT_SOURCE_DIR}/vcpy_kernel.cpp -o vcpy_kernel.code
  COMMENT "codeobj generated"
  VERBATIM
)
# Build the code object before any executable that uses it
add_dependencies(runKernel.hip.out codeobj)
add_dependencies(launchKernelHcc.hip.out codeobj)
add_dependencies(defaultDriver.hip.out codeobj)
# Link with HIP (PRIVATE: nothing consumes an executable's link interface)
target_link_libraries(runKernel.hip.out PRIVATE hip::host)
target_link_libraries(launchKernelHcc.hip.out PRIVATE hip::host)
target_link_libraries(defaultDriver.hip.out PRIVATE hip::host)
+30
Просмотреть файл
@@ -0,0 +1,30 @@
# Build script for the module_api_global HIP sample: one host executable plus
# the vcpy_kernel.code code object that the "codeobj" target generates for it.
# cmake_minimum_required() comes first so that policy defaults are established
# before project() configures the toolchain.
cmake_minimum_required(VERSION 3.10)
# Project name corrected from the misspelled "modile_api_global".
project(module_api_global)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip; REQUIRED stops configuration with a clear message when the HIP
# package is missing instead of failing later on the hip::host target.
find_package(hip REQUIRED)
# Set compiler and linker
# NOTE(review): the compiler is overridden after project() has already enabled
# CXX; confirm this late override behaves as intended with your generator.
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
# Create the executable
add_executable(runKernel.hip.out runKernel.cpp)
# Generate code object
# The kernel source is addressed through CMAKE_CURRENT_SOURCE_DIR rather than
# "../", so the rule no longer depends on the build directory being a direct
# child of the source directory. The output is still written to the directory
# the command runs in. VERBATIM keeps argument escaping platform-independent.
add_custom_target(
  codeobj
  ALL
  COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ${CMAKE_CURRENT_SOURCE_DIR}/vcpy_kernel.cpp -o vcpy_kernel.code
  COMMENT "codeobj generated"
  VERBATIM
)
# Build the code object before the executable that uses it
add_dependencies(runKernel.hip.out codeobj)
# Link with HIP (PRIVATE: nothing consumes an executable's link interface)
target_link_libraries(runKernel.hip.out PRIVATE hip::host)
+21
Просмотреть файл
@@ -0,0 +1,21 @@
#Follow "README.md" to generate square.cpp if it's missing
# Build script for the square HIP sample.
# cmake_minimum_required() comes first so that policy defaults are established
# before project() configures the toolchain.
cmake_minimum_required(VERSION 3.10)
project(square)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip; REQUIRED stops configuration with a clear message when the HIP
# package is missing instead of failing later on the hip::host target.
find_package(hip REQUIRED)
# Set compiler and linker
# NOTE(review): the compiler is overridden after project() has already enabled
# CXX; confirm this late override behaves as intended with your generator.
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
# Create the executable
add_executable(square square.cpp)
# Link with HIP (PRIVATE: nothing consumes an executable's link interface)
target_link_libraries(square PRIVATE hip::host)
+2 -5
Просмотреть файл
@@ -11,7 +11,7 @@ else
SOURCES=square.cpp
endif
all: square.out square.out.static
all: square.out
# Step
square.cpp: square.cu
@@ -20,8 +20,5 @@ square.cpp: square.cu
square.out: $(SOURCES)
$(HIPCC) $(CXXFLAGS) $(SOURCES) -o $@
square.out.static: $(SOURCES)
$(HIPCC) -use-staticlib $(CXXFLAGS) $(SOURCES) -o $@
clean:
rm -f *.o *.out *.out.static square.cpp
rm -f *.o *.out square.cpp
+34 -8
Просмотреть файл
@@ -1,13 +1,39 @@
# Square.md
Simple test which shows how to use hipify-perl to port CUDA code to HIP.
See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example.
Simple test which shows how to use hipify-perl to port CUDA code to HIP.
See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example.
Now it is even simpler and requires no manual modification to the hipified source code - just hipify and compile:
1. Add hip/bin path to the PATH :
<code>export PATH=$PATH:[MYHIP]/bin</code>
- Add hip/bin path to the PATH
2. <code>$ make </code>
Make runs these steps. This can be performed on either CUDA or AMD platform:
<code>hipify-perl square.cu > square.cpp </code> # convert cuda code to hip code
<code>hipcc square.cpp</code> # compile into executable
```
$ export PATH=$PATH:[MYHIP]/bin
```
- Define environment variable
```
$ export HIP_PATH=[MYHIP]
```
- Build executable file
```
$ cd ~/hip/samples/0_Intro/square
$ make
/home/user/hip/bin/hipify-perl square.cu > square.cpp
/home/user/hip/bin/hipcc square.cpp -o square.out
/home/user/hip/bin/hipcc -use-staticlib square.cpp -o square.out.static
```
- Execute file
```
$ ./square.out
info: running on device Navi 14 [Radeon Pro W5500]
info: allocate host mem ( 7.63 MB)
info: allocate device mem ( 7.63 MB)
info: copy Host2Device
info: launch 'vector_square' kernel
info: copy Device2Host
info: check result
PASSED!
```
+20
Просмотреть файл
@@ -0,0 +1,20 @@
# Build script for the hipBusBandwidth HIP sample.
# cmake_minimum_required() must come before project() so that policies are
# established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)
project(hipBusBandwidth)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops configuration with a clear message when it is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER "${HIP_HIPCC_EXECUTABLE}")
set(CMAKE_CXX_LINKER "${HIP_HIPCC_EXECUTABLE}")

# Default to Release, but keep a build type passed with -DCMAKE_BUILD_TYPE=...
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(hipBusBandwidth hipBusBandwidth.cpp ResultDatabase.cpp)

# Link with HIP
target_link_libraries(hipBusBandwidth PRIVATE hip::host)
+217 -229
Просмотреть файл
@@ -12,7 +12,7 @@ enum MallocMode { MallocPinned, MallocUnpinned, MallocRegistered };
bool p_verbose = false;
MallocMode p_malloc_mode = MallocPinned;
int p_numa_ctl = -1;
int p_iterations = 10;
int p_iterations = 0;
int p_beatsperiteration = 1;
int p_device = 0;
int p_detailed = 0;
@@ -89,7 +89,9 @@ hipError_t memcopy(void* dst, const void* src, size_t sizeBytes, enum hipMemcpyK
// Transfer sizes stepped through by every benchmark, in order.
int sizes[] = {-64,  -256, -512, 1,    2,    4,     8,     16,    32,     64,     128,   256,
               512,  1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288};
int nSizes = sizeof(sizes) / sizeof(int);
// iterations to be run for the corresponding sizes, less number as the size increases
int iterations[] = {1000, 1000, 1000, 1000, 500, 500, 500, 500, 500, 200, 200, 200,
                    200,  200,  100,  100,  100, 100, 50,  50,  50,  20,  20};
// The benchmarks index iterations[] with the same index as sizes[]; fail the
// build instead of reading out of bounds if the two tables ever diverge.
static_assert(sizeof(iterations) / sizeof(iterations[0]) == sizeof(sizes) / sizeof(sizes[0]),
              "iterations[] must have exactly one entry per element of sizes[]");
// ****************************************************************************
// Function: RunBenchmark_H2D
@@ -174,53 +176,48 @@ void RunBenchmark_H2D(ResultDatabase& resultDB) {
hipEventCreate(&stop);
CHECK_HIP_ERROR();
// Three passes, forward and backward both
for (int pass = 0; pass < p_iterations; pass++) {
// store the times temporarily to estimate latency
// float times[nSizes];
// Step through sizes forward on even passes and backward on odd
for (int i = 0; i < nSizes; i++) {
int sizeIndex;
if ((pass % 2) == 0)
sizeIndex = i;
else
sizeIndex = (nSizes - 1) - i;
// store the times temporarily to estimate latency
// float times[nSizes];
for (int i = 0; i < nSizes; i++) {
int sizeIndex, iterIndex;
sizeIndex = i;
iterIndex = i;
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
for (int pass = 0; pass < niter; pass++) {
hipEventRecord(start, 0);
for (int j = 0; j < p_beatsperiteration; j++) {
memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice);
}
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
// times[sizeIndex] = t;
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed =
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
char sizeStr[256];
if (p_beatsperiteration > 1) {
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr,
"ms", t);
if (p_onesize) {
break;
}
hipEventRecord(start, 0);
for (int j = 0; j < p_beatsperiteration; j++) {
memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice);
}
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
// times[sizeIndex] = t;
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed =
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
char sizeStr[256];
if (p_beatsperiteration > 1) {
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr, "ms", t);
}
if (p_onesize) {
break;
}
}
if (p_onesize) {
@@ -347,53 +344,50 @@ void RunBenchmark_D2H(ResultDatabase& resultDB) {
hipEventCreate(&stop);
CHECK_HIP_ERROR();
// Three passes, forward and backward both
for (int pass = 0; pass < p_iterations; pass++) {
// store the times temporarily to estimate latency
// float times[nSizes];
// Step through sizes forward on even passes and backward on odd
for (int i = 0; i < nSizes; i++) {
int sizeIndex;
if ((pass % 2) == 0)
sizeIndex = i;
else
sizeIndex = (nSizes - 1) - i;
// store the times temporarily to estimate latency
// float times[nSizes];
for (int i = 0; i < nSizes; i++) {
int sizeIndex, iterIndex;
sizeIndex = i;
iterIndex = i;
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
for (int pass = 0; pass < niter; pass++) {
hipEventRecord(start, 0);
for (int j = 0; j < p_beatsperiteration; j++) {
memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost);
}
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
// times[sizeIndex] = t;
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed =
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
char sizeStr[256];
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
if (p_beatsperiteration > 1) {
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode),
sizeStr, "ms", t);
if (p_onesize) {
break;
}
hipEventRecord(start, 0);
for (int j = 0; j < p_beatsperiteration; j++) {
memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost);
}
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
// times[sizeIndex] = t;
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed =
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
char sizeStr[256];
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
if (p_beatsperiteration > 1) {
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode),
sizeStr, "ms", t);
}
if (p_onesize) {
break;
}
}
if (p_onesize) {
@@ -522,43 +516,43 @@ void RunBenchmark_Bidir(ResultDatabase& resultDB) {
hipStreamCreate(&stream[0]);
hipStreamCreate(&stream[1]);
// Three passes, forward and backward both
for (int pass = 0; pass < p_iterations; pass++) {
// store the times temporarily to estimate latency
// float times[nSizes];
// Step through sizes forward on even passes and backward on odd
for (int i = 0; i < nSizes; i++) {
int sizeIndex;
if ((pass % 2) == 0)
sizeIndex = i;
else
sizeIndex = (nSizes - 1) - i;
// store the times temporarily to estimate latency
// float times[nSizes];
for (int i = 0; i < nSizes; i++) {
int sizeIndex, iterIndex;
sizeIndex = i;
iterIndex = i;
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
for (int pass = 0; pass < niter; pass++) {
hipEventRecord(start, 0);
hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]);
hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]);
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
hipEventRecord(start, 0);
hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]);
hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]);
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t;
char sizeStr[256];
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
resultDB.AddResult(
std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr,
"GB/sec", speed);
resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode),
sizeStr, "ms", t);
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t;
char sizeStr[256];
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
resultDB.AddResult(
std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr,
"GB/sec", speed);
resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode),
sizeStr, "ms", t);
}
if (p_onesize) {
break;
}
}
// Cleanup
@@ -708,66 +702,63 @@ void RunBenchmark_P2P_Unidir(ResultDatabase& resultDB) {
hipEventCreate(&stop);
CHECK_HIP_ERROR();
// Three passes, forward and backward both
for (int pass = 0; pass < p_iterations; pass++) {
// store the times temporarily to estimate latency
// float times[nSizes];
// Step through sizes forward on even passes and backward on odd
for (int i = 0; i < nSizes; i++) {
int sizeIndex;
if ((pass % 2) == 0)
sizeIndex = i;
else
sizeIndex = (nSizes - 1) - i;
// store the times temporarily to estimate latency
// float times[nSizes];
for (int i = 0; i < nSizes; i++) {
int sizeIndex, iterIndex;
sizeIndex = i;
iterIndex = i;
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
for (int pass = 0; pass < niter; pass++) {
hipDeviceSynchronize();
hipDeviceSynchronize();
hipEventRecord(start, 0);
hipEventRecord(start, 0);
for (int j = 0; j < p_beatsperiteration; j++) {
hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice);
}
for (int j = 0; j < p_beatsperiteration; j++) {
hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice);
}
hipEventRecord(stop, 0);
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
// times[sizeIndex] = t;
float t = 0;
hipEventElapsedTime(&t, start, stop);
// times[sizeIndex] = t;
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed =
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
char sizeStr[256];
if (p_beatsperiteration > 1) {
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
p_beatsperiteration);
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
double speed =
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
char sizeStr[256];
if (p_beatsperiteration > 1) {
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
p_beatsperiteration);
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
string cGpu, pGpu;
cGpu = gpuIDToString(currentGpu);
pGpu = gpuIDToString(peerGpu);
string cGpu, pGpu;
cGpu = gpuIDToString(currentGpu);
pGpu = gpuIDToString(peerGpu);
resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) +
"_gpu" + std::string(pGpu),
resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) +
"_gpu" + std::string(pGpu),
sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) +
"_gpu" + std::string(pGpu),
resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) +
"_gpu" + std::string(pGpu),
sizeStr, "ms", t);
if (p_onesize) {
break;
}
}
if (p_onesize) {
break;
}
}
@@ -829,71 +820,68 @@ void RunBenchmark_P2P_Bidir(ResultDatabase& resultDB) {
hipStreamCreate(&stream[0]);
hipStreamCreate(&stream[1]);
// Three passes, forward and backward both
for (int pass = 0; pass < p_iterations; pass++) {
// store the times temporarily to estimate latency
// float times[nSizes];
// Step through sizes forward on even passes and backward on odd
for (int i = 0; i < nSizes; i++) {
int sizeIndex;
if ((pass % 2) == 0)
sizeIndex = i;
else
sizeIndex = (nSizes - 1) - i;
// store the times temporarily to estimate latency
// float times[nSizes];
for (int i = 0; i < nSizes; i++) {
int sizeIndex, iterIndex;
sizeIndex = i;
iterIndex = i;
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
for (int pass = 0; pass < niter; pass++) {
hipDeviceSynchronize();
hipDeviceSynchronize();
hipEventRecord(start, 0);
hipEventRecord(start, 0);
for (int j = 0; j < p_beatsperiteration; j++) {
hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes,
hipMemcpyDeviceToDevice, stream[0]);
hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes,
hipMemcpyDeviceToDevice, stream[1]);
}
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
// times[sizeIndex] = t;
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed =
(double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) /
t;
char sizeStr[256];
if (p_beatsperiteration > 1) {
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
p_beatsperiteration);
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
string cGpu, pGpu;
cGpu = gpuIDToString(currentGpu);
pGpu = gpuIDToString(peerGpu);
resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
std::string(pGpu),
sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
std::string(pGpu),
sizeStr, "ms", t);
if (p_onesize) {
break;
}
for (int j = 0; j < p_beatsperiteration; j++) {
hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes,
hipMemcpyDeviceToDevice, stream[0]);
hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes,
hipMemcpyDeviceToDevice, stream[1]);
}
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
// times[sizeIndex] = t;
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed =
(double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) /
t;
char sizeStr[256];
if (p_beatsperiteration > 1) {
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
p_beatsperiteration);
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
string cGpu, pGpu;
cGpu = gpuIDToString(currentGpu);
pGpu = gpuIDToString(peerGpu);
resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
std::string(pGpu),
sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
std::string(pGpu),
sizeStr, "ms", t);
}
if (p_onesize) {
break;
}
}
if (p_onesize) {
+31
Просмотреть файл
@@ -0,0 +1,31 @@
# Build script for the hipCommander HIP sample.
# cmake_minimum_required() must come before project() so that policies are
# established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)
project(hipCommander)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops configuration with a clear message when it is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER "${HIP_HIPCC_EXECUTABLE}")
set(CMAKE_CXX_LINKER "${HIP_HIPCC_EXECUTABLE}")

# Default to Release, but keep a build type passed with -DCMAKE_BUILD_TYPE=...
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(hipCommander hipCommander.cpp)

# Generate code object.
# The kernel source is addressed through CMAKE_CURRENT_SOURCE_DIR so the rule
# works from any build directory; the output lands in the build directory.
add_custom_target(
  codeobj
  ALL
  COMMAND "${HIP_HIPCC_EXECUTABLE}" --genco "${CMAKE_CURRENT_SOURCE_DIR}/nullkernel.hip.cpp" -o nullkernel.hsaco
  COMMENT "codeobj generated"
  VERBATIM
)

add_dependencies(hipCommander codeobj)

# Link with HIP
target_link_libraries(hipCommander PRIVATE hip::host)

set_property(TARGET hipCommander PROPERTY CXX_STANDARD 11)
+35
Просмотреть файл
@@ -0,0 +1,35 @@
# Build script for the hipDispatchLatency HIP samples.
# cmake_minimum_required() must come before project() so that policies are
# established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)
project(hipDispatchLatency)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops configuration with a clear message when it is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER "${HIP_HIPCC_EXECUTABLE}")
set(CMAKE_CXX_LINKER "${HIP_HIPCC_EXECUTABLE}")

# Default to Release, but keep a build type passed with -DCMAKE_BUILD_TYPE=...
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executables
add_executable(hipDispatchLatency hipDispatchLatency.cpp)
add_executable(hipDispatchEnqueueRateMT hipDispatchEnqueueRateMT.cpp)

# Generate code object.
# The kernel source is addressed through CMAKE_CURRENT_SOURCE_DIR so the rule
# works from any build directory; the output lands in the build directory.
add_custom_target(
  codeobj
  ALL
  COMMAND "${HIP_HIPCC_EXECUTABLE}" --genco "${CMAKE_CURRENT_SOURCE_DIR}/test_kernel.cpp" -o test_kernel.code
  COMMENT "codeobj generated"
  VERBATIM
)

add_dependencies(hipDispatchLatency codeobj)
add_dependencies(hipDispatchEnqueueRateMT codeobj)

# Link with HIP
target_link_libraries(hipDispatchLatency PRIVATE hip::host)
target_link_libraries(hipDispatchEnqueueRateMT PRIVATE hip::host)

set_property(TARGET hipDispatchLatency PROPERTY CXX_STANDARD 11)
set_property(TARGET hipDispatchEnqueueRateMT PROPERTY CXX_STANDARD 11)
+20
Просмотреть файл
@@ -0,0 +1,20 @@
# Build script for the hipInfo HIP sample.
# cmake_minimum_required() must come before project() so that policies are
# established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)
project(hipInfo)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops configuration with a clear message when it is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER "${HIP_HIPCC_EXECUTABLE}")
set(CMAKE_CXX_LINKER "${HIP_HIPCC_EXECUTABLE}")

# Default to Release, but keep a build type passed with -DCMAKE_BUILD_TYPE=...
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(hipInfo hipInfo.cpp)

# Link with HIP
target_link_libraries(hipInfo PRIVATE hip::host)
+20
Просмотреть файл
@@ -0,0 +1,20 @@
# Build script for the MatrixTranspose HIP sample.
# cmake_minimum_required() must come before project() so that policies are
# established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)
project(MatrixTranspose)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops configuration with a clear message when it is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER "${HIP_HIPCC_EXECUTABLE}")
set(CMAKE_CXX_LINKER "${HIP_HIPCC_EXECUTABLE}")

# Default to Release, but keep a build type passed with -DCMAKE_BUILD_TYPE=...
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(MatrixTranspose MatrixTranspose.cpp)

# Link with HIP
target_link_libraries(MatrixTranspose PRIVATE hip::host)
+20
Просмотреть файл
@@ -0,0 +1,20 @@
# Build script for the inline_asm HIP sample.
# cmake_minimum_required() must come before project() so that policies are
# established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)
project(inline_asm)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops configuration with a clear message when it is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER "${HIP_HIPCC_EXECUTABLE}")
set(CMAKE_CXX_LINKER "${HIP_HIPCC_EXECUTABLE}")

# Default to Release, but keep a build type passed with -DCMAKE_BUILD_TYPE=...
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(inline_asm inline_asm.cpp)

# Link with HIP
target_link_libraries(inline_asm PRIVATE hip::host)
+30
Просмотреть файл
@@ -0,0 +1,30 @@
# Build script for the texture2dDrv HIP sample.
# cmake_minimum_required() must come before project() so that policies are
# established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)
project(texture2dDrv)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops configuration with a clear message when it is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER "${HIP_HIPCC_EXECUTABLE}")
set(CMAKE_CXX_LINKER "${HIP_HIPCC_EXECUTABLE}")

# Default to Release, but keep a build type passed with -DCMAKE_BUILD_TYPE=...
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(texture2dDrv texture2dDrv.cpp)

# Generate code object.
# The kernel source is addressed through CMAKE_CURRENT_SOURCE_DIR so the rule
# works from any build directory; the output lands in the build directory.
add_custom_target(
  codeobj
  ALL
  COMMAND "${HIP_HIPCC_EXECUTABLE}" --genco "${CMAKE_CURRENT_SOURCE_DIR}/tex2dKernel.cpp" -o tex2dKernel.code
  COMMENT "codeobj generated"
  VERBATIM
)

add_dependencies(texture2dDrv codeobj)

# Link with HIP
target_link_libraries(texture2dDrv PRIVATE hip::host)
+20
Просмотреть файл
@@ -0,0 +1,20 @@
# Build script for the occupancy HIP sample.
# cmake_minimum_required() must come before project() so that policies are
# established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)
project(occupancy)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops configuration with a clear message when it is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER "${HIP_HIPCC_EXECUTABLE}")
set(CMAKE_CXX_LINKER "${HIP_HIPCC_EXECUTABLE}")

# Default to Release, but keep a build type passed with -DCMAKE_BUILD_TYPE=...
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(occupancy occupancy.cpp)

# Link with HIP
target_link_libraries(occupancy PRIVATE hip::host)
+20
Просмотреть файл
@@ -0,0 +1,20 @@
# Build script for the hipEvent HIP sample.
# cmake_minimum_required() must come before project() so that policies are
# established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)
project(hipEvent)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops configuration with a clear message when it is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER "${HIP_HIPCC_EXECUTABLE}")
set(CMAKE_CXX_LINKER "${HIP_HIPCC_EXECUTABLE}")

# Default to Release, but keep a build type passed with -DCMAKE_BUILD_TYPE=...
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(hipEvent hipEvent.cpp)

# Link with HIP
target_link_libraries(hipEvent PRIVATE hip::host)
+20
Просмотреть файл
@@ -0,0 +1,20 @@
# Build script for the sharedMemory HIP sample.
# cmake_minimum_required() must come before project() so that policies are
# established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)
project(sharedMemory)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops configuration with a clear message when it is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER "${HIP_HIPCC_EXECUTABLE}")
set(CMAKE_CXX_LINKER "${HIP_HIPCC_EXECUTABLE}")

# Default to Release, but keep a build type passed with -DCMAKE_BUILD_TYPE=...
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(sharedMemory sharedMemory.cpp)

# Link with HIP
target_link_libraries(sharedMemory PRIVATE hip::host)
+20
Просмотреть файл
@@ -0,0 +1,20 @@
# Build script for the shfl HIP sample.
# cmake_minimum_required() must come before project() so that policies are
# established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)
project(shfl)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops configuration with a clear message when it is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER "${HIP_HIPCC_EXECUTABLE}")
set(CMAKE_CXX_LINKER "${HIP_HIPCC_EXECUTABLE}")

# Default to Release, but keep a build type passed with -DCMAKE_BUILD_TYPE=...
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Create the executable
add_executable(shfl shfl.cpp)

# Link with HIP
target_link_libraries(shfl PRIVATE hip::host)
+19
Просмотреть файл
@@ -0,0 +1,19 @@
# Build script for the 2dshfl HIP sample.
# cmake_minimum_required() must come before project() so that policies are
# established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)
project(2dshfl)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops configuration with a clear message when it is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER "${HIP_HIPCC_EXECUTABLE}")
set(CMAKE_CXX_LINKER "${HIP_HIPCC_EXECUTABLE}")

# Create the executable
add_executable(2dshfl 2dshfl.cpp)

# Link with HIP
target_link_libraries(2dshfl PRIVATE hip::host)
+19
Просмотреть файл
@@ -0,0 +1,19 @@
# Build script for the dynamic_shared HIP sample.
# cmake_minimum_required() must come before project() so that policies are
# established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)
project(dynamic_shared)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops configuration with a clear message when it is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER "${HIP_HIPCC_EXECUTABLE}")
set(CMAKE_CXX_LINKER "${HIP_HIPCC_EXECUTABLE}")

# Create the executable
add_executable(dynamic_shared dynamic_shared.cpp)

# Link with HIP
target_link_libraries(dynamic_shared PRIVATE hip::host)
+19
Просмотреть файл
@@ -0,0 +1,19 @@
# Build script for the stream HIP sample.
# cmake_minimum_required() must come before project() so that policies are
# established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)
project(stream)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops configuration with a clear message when it is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER "${HIP_HIPCC_EXECUTABLE}")
set(CMAKE_CXX_LINKER "${HIP_HIPCC_EXECUTABLE}")

# Create the executable
add_executable(stream stream.cpp)

# Link with HIP
target_link_libraries(stream PRIVATE hip::host)
+19
Просмотреть файл
@@ -0,0 +1,19 @@
# Build script for the peer2peer HIP sample.
# cmake_minimum_required() must come before project() so that policies are
# established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)
project(peer2peer)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops configuration with a clear message when it is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER "${HIP_HIPCC_EXECUTABLE}")
set(CMAKE_CXX_LINKER "${HIP_HIPCC_EXECUTABLE}")

# Create the executable
add_executable(peer2peer peer2peer.cpp)

# Link with HIP
target_link_libraries(peer2peer PRIVATE hip::host)
+19
Просмотреть файл
@@ -0,0 +1,19 @@
# Build script for the unroll HIP sample.
# cmake_minimum_required() must come before project() so that policies are
# established before the project and its languages are configured.
cmake_minimum_required(VERSION 3.10)
project(unroll)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)

# Find hip; REQUIRED stops configuration with a clear message when it is missing
find_package(hip REQUIRED)

# Set compiler and linker
set(CMAKE_CXX_COMPILER "${HIP_HIPCC_EXECUTABLE}")
set(CMAKE_CXX_LINKER "${HIP_HIPCC_EXECUTABLE}")

# Create the executable
add_executable(unroll unroll.cpp)

# Link with HIP
target_link_libraries(unroll PRIVATE hip::host)
+27
Просмотреть файл
@@ -0,0 +1,27 @@
Build procedure
We provide a Makefile and a CMakeLists.txt so that each sample can be built separately.
1. The Makefile supports the shared lib of the hip-rocclr runtime, and nvcc.
To build a sample, run the following in the sample folder:
make
2. The CMakeLists.txt supports both the shared and the static libs of the hip-rocclr runtime.
To build a sample, run the following in the sample folder:
mkdir build (if build folder is missing)
cd build
cmake ..
make
If you want a debug build, configure with:
cmake -DCMAKE_BUILD_TYPE=Debug ..
Обычный файл → Исполняемый файл
+1
Просмотреть файл
@@ -303,6 +303,7 @@ macro(MAKE_TEST _config exe)
add_test(NAME ${testname} CONFIGURATIONS ${_config} COMMAND ${PROJECT_BINARY_DIR}/${exe} ${ARGN})
endif()
set_tests_properties(${testname} PROPERTIES PASS_REGULAR_EXPRESSION "PASSED" ENVIRONMENT HIP_PATH=${HIP_ROOT_DIR})
set_tests_properties(${testname} PROPERTIES SKIP_RETURN_CODE 127 ENVIRONMENT HIP_PATH=${HIP_ROOT_DIR})
endmacro()
macro(MAKE_NAMED_TEST _config exe testname)
+747
Просмотреть файл
@@ -0,0 +1,747 @@
/*
Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
* TEST: %t
* HIT_END
*/
#include <iostream>
#include <chrono>
#include "test_common.h"
#include <hip/hip_vector_types.h>
#include <hip/math_functions.h>
#include <vector>
#include <string>
#include <map>
// One region to evaluate: a position (x, y) and a width.
// NOTE(review): (x, y) is presumably the centre of the region — confirm at the use site.
struct coordRec {
    double x;
    double y;
    double width;
};

// Regions exercised by the test, in run order.
coordRec coords[] = {
    {0.0, 0.0, 4.0},                                     // Whole set
    {0.0, 0.0, 0.00001},                                 // All black
    {-0.0180789661868, 0.6424294066162, 0.00003824140},  // Hit detail
};

// Number of entries in coords[].
static unsigned int numCoords = sizeof(coords) / sizeof(coords[0]);
// Escape-time kernel evaluated in single precision; each thread computes one
// element of out. It iterates z <- z*z + c with fma, where (x, y) are the
// real/imaginary parts of z and c = (x0, y0), and stores how many steps ran
// before x*x + y*y exceeded 4 or maxIter was reached.
//   out          - receives the iteration count at index tid
//   width        - row length used to split the flat index into column i / row j
//   xPos, yPos   - value of c for i == 0, j == 0
//   xStep, yStep - increment of c per column / per row
//   maxIter      - upper bound on the number of iterations
// T is only the type of the position/step arguments; the iteration itself
// always runs on float.
// NOTE(review): tid is not bounds-checked; presumably the launch covers exactly
// the output buffer — confirm at the call site.
template <typename T>
__global__ void float_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep,
uint maxIter) {
#pragma FP_CONTRACT ON
// Flat global thread index, then its column (i) and row (j)
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % width;
int j = tid / width;
// c for this element, narrowed to float
float x0 = (float)(xPos + xStep*i);
float y0 = (float)(yPos + yStep*j);
// z starts at c
float x = x0;
float y = y0;
uint iter = 0;
float tmp;
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
// tmp keeps the old x because the y update needs it after x is overwritten
tmp = x;
// x <- x*x - y*y + x0
x = fma(-y,y,fma(x,x,x0));
// y <- 2*x_old*y + y0
y = fma(2.0f*tmp,y,y0);
}
out[tid] = iter;
};
// Unrolled single-precision escape-time kernel; each thread computes one
// element of out. The main loop advances z <- z*z + c in batches of 16 steps
// and only tests for escape once per batch; a batch that escapes is discarded
// and replayed one step at a time from the last saved point to get the count.
//   out          - receives the iteration count at index tid
//   width        - row length used to split the flat index into column i / row j
//   xPos, yPos   - value of c for i == 0, j == 0
//   xStep, yStep - increment of c per column / per row
//   maxIter      - upper bound on the number of iterations
// T is only the type of the position/step arguments; the iteration itself
// always runs on float.
// NOTE(review): tid is not bounds-checked; presumably the launch covers exactly
// the output buffer — confirm at the call site.
template <typename T>
__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
T yPos, T xStep, T yStep, uint maxIter) {
#pragma FP_CONTRACT ON
// Flat global thread index, then its column (i) and row (j)
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % width;
int j = tid / width;
// c for this element, narrowed to float
float x0 = (float)(xPos + xStep*(float)i);
float y0 = (float)(yPos + yStep*(float)j);
// z starts at c
float x = x0;
float y = y0;
// FAST selects the loop header without the 'stay' test; the loop then leaves
// through the explicit break at the end of the body instead.
#define FAST
uint iter = 0;
float tmp;
int stay;
// Number of iterations counted so far; this is the value written to out
int ccount = 0;
stay = (x*x+y*y) <= 4.0;
// Last point known to satisfy x*x + y*y <= 4 (initially the start point)
float savx = x;
float savy = y;
#ifdef FAST
for (iter = 0; (iter < maxIter); iter+=16) {
#else
for (iter = 0; stay && (iter < maxIter); iter+=16) {
#endif
// Resume from the last saved point
x = savx;
y = savy;
// Each group below performs two steps: tmp receives the new x of the first
// step, then x and y are produced by the second step. 8 groups = 16 steps.
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Commit the batch only if the point is still inside the radius after all
// 16 steps; otherwise keep the previous saved point and count nothing.
stay = (x*x+y*y) <= 4.0;
savx = (stay ? x : savx);
savy = (stay ? y : savy);
ccount += stay*16;
#ifdef FAST
if (!stay)
break;
#endif
}
// Handle remainder
// Runs only when 'stay' is false here: replay from the saved point one step at
// a time, counting each step whose starting point is inside the radius while
// ccount is below maxIter. iter is reused as a countdown limiting this to 16
// single steps.
if (!stay) {
iter = 16;
do {
x = savx;
y = savy;
stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
tmp = x;
x = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*tmp,y,y0);
ccount += stay;
iter--;
savx = (stay ? x : savx);
savy = (stay ? y : savy);
} while (stay && iter);
}
out[tid] = (uint)ccount;
};
// Escape-time kernel evaluated in double precision; each thread computes one
// element of out. It iterates z <- z*z + c with fma, where (x, y) are the
// real/imaginary parts of z and c = (x0, y0), and stores how many steps ran
// before x*x + y*y exceeded 4 or maxIter was reached.
//   out          - receives the iteration count at index tid
//   width        - row length used to split the flat index into column i / row j
//   xPos, yPos   - value of c for i == 0, j == 0
//   xStep, yStep - increment of c per column / per row
//   maxIter      - upper bound on the number of iterations
// T is only the type of the position/step arguments; the iteration itself
// always runs on double.
// NOTE(review): tid is not bounds-checked; presumably the launch covers exactly
// the output buffer — confirm at the call site.
template <typename T>
__global__ void double_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep,
uint maxIter) {
#pragma FP_CONTRACT ON
// Flat global thread index, then its column (i) and row (j)
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % width;
int j = tid / width;
// c for this element
double x0 = (double)(xPos + xStep*i);
double y0 = (double)(yPos + yStep*j);
// z starts at c
double x = x0;
double y = y0;
uint iter = 0;
double tmp;
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
// tmp keeps the old x because the y update needs it after x is overwritten
tmp = x;
// x <- x*x - y*y + x0
x = fma(-y,y,fma(x,x,x0));
// y <- 2*x_old*y + y0
y = fma(2.0f*tmp,y,y0);
}
out[tid] = iter;
};
// Unrolled double-precision escape-time kernel; each thread computes one
// element of out. The main loop advances z <- z*z + c in batches of 16 steps
// and only tests for escape once per batch; a batch that escapes is discarded
// and replayed one step at a time from the last saved point to get the count.
//   out          - receives the iteration count at index tid
//   width        - row length used to split the flat index into column i / row j
//   xPos, yPos   - value of c for i == 0, j == 0
//   xStep, yStep - increment of c per column / per row
//   maxIter      - upper bound on the number of iterations
// T is only the type of the position/step arguments; the iteration itself
// always runs on double.
// NOTE(review): tid is not bounds-checked; presumably the launch covers exactly
// the output buffer — confirm at the call site.
template <typename T>
__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
T yPos, T xStep, T yStep, uint maxIter) {
#pragma FP_CONTRACT ON
// Flat global thread index, then its column (i) and row (j)
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % width;
int j = tid / width;
// c for this element
double x0 = (double)(xPos + xStep*(double)i);
double y0 = (double)(yPos + yStep*(double)j);
// z starts at c
double x = x0;
double y = y0;
// FAST selects the loop header without the 'stay' test; the loop then leaves
// through the explicit break at the end of the body instead.
#define FAST
uint iter = 0;
double tmp;
int stay;
// Number of iterations counted so far; this is the value written to out
int ccount = 0;
stay = (x*x+y*y) <= 4.0;
// Last point known to satisfy x*x + y*y <= 4 (initially the start point)
double savx = x;
double savy = y;
#ifdef FAST
for (iter = 0; (iter < maxIter); iter+=16)
#else
for (iter = 0; stay && (iter < maxIter); iter+=16)
#endif
{
// Resume from the last saved point
x = savx;
y = savy;
// Each group below performs two steps: tmp receives the new x of the first
// step, then x and y are produced by the second step. 8 groups = 16 steps.
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Commit the batch only if the point is still inside the radius after all
// 16 steps; otherwise keep the previous saved point and count nothing.
stay = (x*x+y*y) <= 4.0;
savx = (stay ? x : savx);
savy = (stay ? y : savy);
ccount += stay*16;
#ifdef FAST
if (!stay)
break;
#endif
}
// Handle remainder
// Runs only when 'stay' is false here: replay from the saved point one step at
// a time, counting each step whose starting point is inside the radius while
// ccount is below maxIter. iter is reused as a countdown limiting this to 16
// single steps.
if (!stay) {
iter = 16;
do {
x = savx;
y = savy;
stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
tmp = x;
x = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*tmp,y,y0);
ccount += stay;
iter--;
savx = (stay ? x : savx);
savy = (stay ? y : savy);
}
while (stay && iter);
}
out[tid] = (uint)ccount;
};
// NOTE(review): FMA_EXPECTEDVALUES_INDEX is not referenced in the visible part
// of this file — confirm whether it is still needed.
static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15;
// Expected results for each kernel run at each coord
// The table has 30 entries; run() compares the summed per-pixel iteration
// count (totalIters) against these values.
unsigned long long expectedIters[] = {
203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull,
120254651ull, 203277748ull, 2147483648ull, 120254651ull, 203315114ull,
2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull,
203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull,
120485704ull, 203280620ull, 2147483648ull, 120485704ull, 203315114ull,
2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull};
// Benchmark harness for the Mandelbrot kernels in this file.
// Typical use (see main): open() a device, choose how many kernels and streams
// to use, call run() once per test case, then printResults().
class hipPerfMandelBrot {
 public:
  hipPerfMandelBrot();
  ~hipPerfMandelBrot();

  // Number of kernel launches (and buffer pairs) per timed pass.
  void setNumKernels(unsigned int num) {
    numKernels = num;
  }
  unsigned int getNumKernels() {
    return numKernels;
  }

  // Number of streams the launches are distributed over.
  // The `streams` array below has room for two.
  void setNumStreams(unsigned int num) {
    numStreams = num;
  }
  unsigned int getNumStreams() {
    return numStreams;
  }

  void open(int deviceID);
  void run(unsigned int testCase, unsigned int deviceId);
  void printResults(void);

  // Pointer-to-member type shared by the launch wrappers, so run() can keep
  // them in an array of function pointers.
  typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos, float yPos,
                                            float xStep, float yStep, uint maxIter,
                                            hipStream_t* streams, int blocks,
                                            int threads_per_block, int kernelCnt);

  // Wrappers: each launches one kernel flavour on streams[kernelCnt % numStreams].
  void float_mad(uint *out, uint width, float xPos, float yPos,
                 float xStep, float yStep, uint maxIter, hipStream_t* streams,
                 int blocks, int threads_per_block, int kernelCnt);
  void float_mandel_unroll(uint *out, uint width, float xPos, float yPos,
                           float xStep, float yStep, uint maxIter, hipStream_t* streams,
                           int blocks, int threads_per_block, int kernelCnt);
  void double_mad(uint *out, uint width, float xPos, float yPos, float xStep,
                  float yStep, uint maxIter, hipStream_t* streams, int blocks,
                  int threads_per_block, int kernelCnt);
  void double_mandel_unroll(uint *out, uint width, float xPos, float yPos, float xStep,
                            float yStep, uint maxIter, hipStream_t* streams, int blocks,
                            int threads_per_block, int kernelCnt);

  hipStream_t streams[2];

 private:
  void setData(void *ptr, unsigned int value);
  void checkData(uint *ptr);

  // The constructor body is empty, so every member gets an in-class default;
  // otherwise a read before the first assignment (e.g. coordIdx in run())
  // would see an indeterminate value.
  unsigned int numKernels = 1;
  unsigned int numStreams = 1;
  // Kernel name -> measured GFLOPS values, filled by run(), dumped by printResults().
  std::map<std::string, std::vector<double>> results;
  unsigned int width_ = 0;    // image is width_ x width_ pixels
  unsigned int bufSize = 0;   // bytes per output buffer
  unsigned int maxIter = 0;   // iteration cap passed to the kernels
  unsigned int coordIdx = 0;  // index into coords[]
  volatile unsigned long long totalIters = 0;  // sum produced by checkData()
  int numCUs = 0;             // multiProcessorCount reported by open()
  static const unsigned int numLoops = 10;     // timed repetitions per run()
};
// Construction and destruction do no work of their own; streams and buffers
// are created and released inside run().
hipPerfMandelBrot::hipPerfMandelBrot() = default;
hipPerfMandelBrot::~hipPerfMandelBrot() = default;
// Selects `deviceId` as the current device, prints a one-line description of
// it, and records its multiprocessor count in numCUs. If no GPU is present the
// test is reported as passed (skipped) instead.
void hipPerfMandelBrot::open(int deviceId) {
  int deviceCount = 0;
  HIPCHECK(hipGetDeviceCount(&deviceCount));
  if (deviceCount < 1) {
    std::cout << "info: didn't find any GPU! skipping the test!\n";
    passed();
    return;
  }

  HIPCHECK(hipSetDevice(deviceId));

  hipDeviceProp_t props = {0};
  HIPCHECK(hipGetDeviceProperties(&props, deviceId));
  numCUs = props.multiProcessorCount;

  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name;
  std::cout << " with " << props.multiProcessorCount << " CUs";
  std::cout << " and device id: " << deviceId << std::endl;
}
// Prints every recorded GFLOPS value, one line per kernel name, then clears
// the stored results so the next batch of run() calls starts empty.
void hipPerfMandelBrot::printResults() {
  int numStreams = getNumStreams();
  std::cout << "\n" <<"Measured perf for kernels in GFLOPS on "
            << numStreams << " streams (s)" << std::endl;
  // Iterate over the map entries directly; no need to look each key up again.
  for (const auto& entry : results) {
    std::cout << "\n" << std::setw(20) << entry.first << " ";
    for (double gflops : entry.second) {
      std::cout << std::setw(10) << gflops << " ";
    }
  }
  results.clear();
  std::cout << std::endl;
}
// Wrappers for the kernel launches
// Launches float_mad_kernel<float> with `blocks` x `threads_per_block` threads.
// The stream is picked round-robin from `streams` using kernelCnt.
// Note that the member width_ is what reaches the kernel; the `width`
// parameter is not used.
void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos, float yPos, float xStep,
                                  float yStep, uint maxIter, hipStream_t* streams,
                                  int blocks, int threads_per_block, int kernelCnt) {
  const int activeStreams = getNumStreams();
  hipStream_t target = streams[kernelCnt % activeStreams];
  const dim3 grid(blocks);
  const dim3 block(threads_per_block);
  hipLaunchKernelGGL(float_mad_kernel<float>, grid, block, 0, target,
                     out, width_, xPos, yPos, xStep, yStep, maxIter);
}
// Launches float_mandel_unroll_kernel<float> with `blocks` x
// `threads_per_block` threads on a stream picked round-robin by kernelCnt.
// The member width_ is passed to the kernel; the `width` parameter is not used.
void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos, float yPos,
                                            float xStep, float yStep, uint maxIter,
                                            hipStream_t * streams, int blocks,
                                            int threads_per_block, int kernelCnt) {
  const int activeStreams = getNumStreams();
  hipStream_t target = streams[kernelCnt % activeStreams];
  const dim3 grid(blocks);
  const dim3 block(threads_per_block);
  hipLaunchKernelGGL(float_mandel_unroll_kernel<float>, grid, block, 0, target,
                     out, width_, xPos, yPos, xStep, yStep, maxIter);
}
// Launches double_mad_kernel<double> with `blocks` x `threads_per_block`
// threads on a stream picked round-robin by kernelCnt. The float arguments
// are widened to double at the kernel boundary. The member width_ is passed
// to the kernel; the `width` parameter is not used.
void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos, float yPos,
                                   float xStep, float yStep, uint maxIter,
                                   hipStream_t * streams, int blocks,
                                   int threads_per_block, int kernelCnt) {
  const int activeStreams = getNumStreams();
  hipStream_t target = streams[kernelCnt % activeStreams];
  const dim3 grid(blocks);
  const dim3 block(threads_per_block);
  hipLaunchKernelGGL(double_mad_kernel<double>, grid, block, 0, target,
                     out, width_, xPos, yPos, xStep, yStep, maxIter);
}
// Launches the unrolled double-precision kernel with `blocks` x
// `threads_per_block` threads on a stream picked round-robin by kernelCnt.
// The float arguments are widened to double at the kernel boundary. The member
// width_ is passed to the kernel; the `width` parameter is not used.
void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos, float yPos,
                                             float xStep, float yStep, uint maxIter,
                                             hipStream_t * streams, int blocks,
                                             int threads_per_block, int kernelCnt) {
  int streamCnt = getNumStreams();
  // Fix: this wrapper previously launched float_mandel_unroll_kernel<double>,
  // so the "double_unroll" measurement did not exercise
  // double_mandel_unroll_kernel at all.
  hipLaunchKernelGGL(double_mandel_unroll_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
                     streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep,
                     maxIter);
}
void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
unsigned int numStreams = getNumStreams();
funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
&hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
// Maximum iteration count
maxIter = 32768;
uint * hPtr[numKernels];
uint * dPtr[numKernels];
// Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
width_ = 256;
bufSize = width_ * width_ * sizeof(uint);
// Create streams for concurrency
for (uint i = 0; i < numStreams; i++) {
HIPCHECK(hipStreamCreate(&streams[i]));
}
// Allocate memory on the host and device
for (uint i = 0; i < numKernels; i++) {
HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
setData(hPtr[i], 0xdeadbeef);
HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
}
// Prepare kernel launch parameters
int threads = (bufSize/sizeof(uint));
int threads_per_block = 64;
int blocks = (threads/threads_per_block) + (threads % threads_per_block);
float xStep = (float)(coords[coordIdx].width / (double)width_);
float yStep = (float)(-coords[coordIdx].width / (double)width_);
float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
// Copy memory asynchronously and concurrently from host to device
for (uint i = 0; i < numKernels; i++) {
HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
}
// Synchronize to make sure all the copies are completed
HIPCHECK(hipStreamSynchronize(0));
int kernelIdx;
if(testCase == 0 || testCase == 5 || testCase == 10) {
kernelIdx = 0;
}
else if(testCase == 1 || testCase == 6 || testCase == 11) {
kernelIdx = 1;
}
else if(testCase == 2 || testCase == 7 || testCase == 12) {
kernelIdx = 2;
}
else if(testCase == 3 || testCase == 8 || testCase == 13){
kernelIdx = 3;
}
double totalTime = 0.0;
for (unsigned int k = 0; k < numLoops; k++) {
coordIdx = testCase % numCoords;
if ((testCase == 0 || testCase == 1 || testCase == 2 ||
testCase == 5 || testCase == 6 || testCase == 7 ||
testCase == 10 || testCase == 11 || testCase == 12)) {
float xStep = (float)(coords[coordIdx].width / (double)width_);
float yStep = (float)(-coords[coordIdx].width / (double)width_);
float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
// Time the kernel execution
auto all_start = std::chrono::steady_clock::now();
for (uint i = 0; i < numKernels; i++) {
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
threads_per_block, i);
}
// Synchronize all the concurrent streams to have completed execution
HIPCHECK(hipStreamSynchronize(0));
auto all_end = std::chrono::steady_clock::now();
std::chrono::duration<double> all_kernel_time = all_end - all_start;
totalTime += all_kernel_time.count();
}
else {
double xStep = coords[coordIdx].width / (double)width_;
double yStep = -coords[coordIdx].width / (double)width_;
double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
// Time the kernel execution
auto all_start = std::chrono::steady_clock::now();
for (uint i = 0; i < numKernels; i++) {
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
threads_per_block, i);
}
// Synchronize all the concurrent streams to have completed execution
HIPCHECK(hipStreamSynchronize(0));
auto all_end = std::chrono::steady_clock::now();
std::chrono::duration<double> all_kernel_time = all_end - all_start;
totalTime += all_kernel_time.count();
}
}
// Copy data back from device to the host
for(uint i = 0; i < numKernels; i++) {
HIPCHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost));
}
for(uint i = 0; i < numKernels; i++) {
checkData(hPtr[i]);
int j =0;
while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) {
j++;
}
if(j==30) {
std::cout << "Incorrect iteration count detected. ";
}
}
// Compute GFLOPS. There are 7 FLOPs per iteration
double perf = ((double)(totalIters*numKernels) * 7 * (double)(1e-09)) /
(totalTime / (double)numLoops);
std::vector<std::string> kernelName = {"float", "float_unroll",
"double", "double_unroll"};
// Print results except for Warm-up kernel
if(testCase!=100) {
results[kernelName[testCase % 4]].push_back(perf);
}
for(uint i = 0 ; i < numStreams; i++) {
HIPCHECK(hipStreamDestroy(streams[i]));
}
// Free host and device memory
for (uint i = 0; i < numKernels; i++) {
HIPCHECK(hipFree(hPtr[i]));
HIPCHECK(hipFree(dPtr[i]));
}
}
// Fills the first width_ * width_ 32-bit words at `ptr` with `value`.
void hipPerfMandelBrot::setData(void *ptr, unsigned int value) {
  unsigned int *words = static_cast<unsigned int *>(ptr);
  unsigned int idx = 0;
  while (idx < width_ * width_) {
    words[idx] = value;
    ++idx;
  }
}
// Sums the width_ * width_ per-pixel iteration counts at `ptr` into
// totalIters. The member is volatile, so it is updated in place on every
// step exactly as before rather than through a local accumulator.
void hipPerfMandelBrot::checkData(uint *ptr) {
  totalIters = 0;
  unsigned int idx = 0;
  while (idx < width_ * width_) {
    totalIters += ptr[idx];
    ++idx;
  }
}
// Drives the benchmark in three phases on device 0: one warm-up run, then
// test cases 0..11 with one kernel on one stream, then the same test cases
// with two kernels on two streams. Results are printed after each of the
// two measured phases.
int main(int argc, char* argv[]) {
  hipPerfMandelBrot mandelbrotCompute;
  int deviceId = 0;
  mandelbrotCompute.open(deviceId);

  // Warmup-kernel - default stream executes serially
  mandelbrotCompute.setNumStreams(1);
  mandelbrotCompute.setNumKernels(1);
  mandelbrotCompute.run(100/*Random number*/, deviceId);

  // run all - sync
  for (int testIdx = 0; testIdx < 12; testIdx++) {
    mandelbrotCompute.setNumStreams(1);
    mandelbrotCompute.setNumKernels(1);
    mandelbrotCompute.run(testIdx, deviceId);
  }
  mandelbrotCompute.printResults();

  // run all - async
  for (int testIdx = 0; testIdx < 12; testIdx++) {
    mandelbrotCompute.setNumStreams(2);
    mandelbrotCompute.setNumKernels(2);
    mandelbrotCompute.run(testIdx, deviceId);
  }
  mandelbrotCompute.printResults();

  passed();
}
+289
Просмотреть файл
@@ -0,0 +1,289 @@
/*
Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
* TEST: %t
* HIT_END
*/
#include <iostream>
#include <chrono>
#include "test_common.h"
// A square view window: run() treats (x, y) as its centre and `width` as its
// side length when deriving the per-pixel start position and step.
struct coordRec {
  double x;
  double y;
  double width;
};

// Coordinate sets the test can render; run() selects one with
// testCase % numCoords.
static coordRec coords[] = {
    {0.0, 0.0, 0.00001},  // All black
};

static unsigned int numCoords = sizeof(coords) / sizeof(coords[0]);
// Single-precision Mandelbrot escape-time kernel, one thread per pixel.
// `tid` is split into a column/row pair using `width`; the iteration count at
// which |z|^2 exceeded 4 (or maxIter if it never did) is stored in out[tid].
__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos, float xStep,
                           float yStep, uint maxIter) {
  const int tid = (blockIdx.x * blockDim.x + threadIdx.x);
  const int col = tid % width;
  const int row = tid / width;
  const float x0 = (float)(xPos + xStep*col);
  const float y0 = (float)(yPos + yStep*row);
  float x = x0;
  float y = y0;
  uint iter = 0;
  while ((x*x + y*y <= 4.0f) && (iter < maxIter)) {
    // Keep the pre-update x: the y update must use the old value.
    const float prevX = x;
    x = fma(-y,y,fma(x,x,x0));
    y = fma(2.0f*prevX,y,y0);
    ++iter;
  }
  out[tid] = iter;
}
// Test harness that launches the mandelbrot kernel on one or more GPUs and
// reports the wall-clock time of the launches (see run()).
class hipPerfDeviceConcurrency {
public:
hipPerfDeviceConcurrency();
~hipPerfDeviceConcurrency();
// Number of GPUs found by open().
void setNumGpus(unsigned int num) {
numDevices = num;
}
unsigned int getNumGpus() {
return numDevices;
}
// Queries the device count and stores it via setNumGpus().
void open(void);
void close(void);
// Runs the kernel on `numGpus` devices; testCase 0 is the warm-up case.
void run(unsigned int testCase, int numGpus);
private:
// Fills a width_ x width_ buffer with `value`.
void setData(void *ptr, unsigned int value);
// Sums a width_ x width_ buffer into totalIters.
void checkData(uint *ptr);
unsigned int numDevices;
// Image side length and buffer size in bytes; both are assigned in run().
unsigned int width_;
unsigned int bufSize;
// Index into coords[], assigned in run().
unsigned int coordIdx;
unsigned long long totalIters = 0;
};
// Construction and destruction do no work of their own; run() creates and
// releases the streams and buffers it needs.
hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() = default;
hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() = default;
// Records how many GPUs are present. If there are none, the test is reported
// as passed (skipped).
void hipPerfDeviceConcurrency::open(void) {
  int deviceCount = 0;
  HIPCHECK(hipGetDeviceCount(&deviceCount));
  setNumGpus(deviceCount);
  if (deviceCount >= 1) {
    return;
  }
  std::cout << "info: didn't find any GPU! skipping the test!\n";
  passed();
}
// Currently a no-op: run() releases the streams and buffers it creates.
void hipPerfDeviceConcurrency::close() {
}
void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
static int deviceId;
uint * hPtr[numGpus];
uint * dPtr[numGpus];
hipStream_t streams[numGpus];
int numCUs[numGpus];
unsigned int maxIter[numGpus];
unsigned long long expectedIters[numGpus];
int threads, threads_per_block, blocks;
float xStep, yStep, xPos, yPos;
for(int i = 0; i < numGpus; i++) {
if(testCase != 0) {
deviceId = i;
}
HIPCHECK(hipSetDevice(deviceId));
hipDeviceProp_t props = {0};
HIPCHECK(hipGetDeviceProperties(&props, i));
if (testCase != 0) {
std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
<< " with " << props.multiProcessorCount << " CUs" << " and device ID: "
<< i << std::endl;
}
numCUs[i] = props.multiProcessorCount;
int clkFrequency = 0;
HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i));
clkFrequency =(unsigned int)clkFrequency/1000;
// Maximum iteration count
// maxIter = 8388608 * (engine_clock / 1000).serial execution
maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128);
maxIter[i] = (maxIter[i] + 15) & ~15;
// Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
width_ = 256;
bufSize = width_ * width_ * sizeof(uint);
// Create streams for concurrency
HIPCHECK(hipStreamCreate(&streams[i]));
// Allocate memory on the host and device
HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
setData(hPtr[i], 0xdeadbeef);
HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
// Prepare kernel launch parameters
threads = (bufSize/sizeof(uint));
threads_per_block = 64;
blocks = (threads/threads_per_block) + (threads % threads_per_block);
coordIdx = testCase % numCoords;
xStep = (float)(coords[coordIdx].width / (double)width_);
yStep = (float)(-coords[coordIdx].width / (double)width_);
xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
// Copy memory from host to device
HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
}
// Time the kernel execution
auto all_start = std::chrono::steady_clock::now();
for(int i = 0; i < numGpus; i++) {
if(testCase != 0) {
deviceId = i;
}
HIPCHECK(hipSetDevice(deviceId));
hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i],
dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]);
}
for(int i = 0; i < numGpus; i++) {
HIPCHECK(hipStreamSynchronize(0));
}
auto all_end = std::chrono::steady_clock::now();
std::chrono::duration<double> all_kernel_time = all_end - all_start;
for(int i = 0; i < numGpus; i++) {
if(testCase != 0) {
deviceId = i;
}
HIPCHECK(hipSetDevice(deviceId));
// Copy data back from device to the host
HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
checkData(hPtr[i]);
expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i];
if (testCase != 0) {
checkData(hPtr[i]);
if(totalIters != expectedIters[i]) {
std::cout << "Incorrect iteration count detected" << std::endl;
}
}
HIPCHECK(hipStreamDestroy(streams[i]));
// Free host and device memory
HIPCHECK(hipFree(hPtr[i]));
HIPCHECK(hipFree(dPtr[i]));
}
if (testCase != 0) {
std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): "
<< all_kernel_time.count() << " (s) " << '\n' << std::endl;
}
if(testCase == 0) {
deviceId++;
}
}
// Fills the first width_ * width_ 32-bit words at `ptr` with `value`.
void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) {
  unsigned int *words = static_cast<unsigned int *>(ptr);
  unsigned int idx = 0;
  while (idx < width_ * width_) {
    words[idx] = value;
    ++idx;
  }
}
// Stores the sum of the width_ * width_ per-pixel iteration counts at `ptr`
// in totalIters.
void hipPerfDeviceConcurrency::checkData(uint *ptr) {
  unsigned long long sum = 0;
  const unsigned int count = width_ * width_;
  for (unsigned int idx = 0; idx != count; ++idx) {
    sum += ptr[idx];
  }
  totalIters = sum;
}
// Warms up every GPU with one single-device run each, then times the kernel
// on one device (test case 1) and on all available devices (test case 2).
int main(int argc, char* argv[]) {
  hipPerfDeviceConcurrency deviceConcurrency;
  deviceConcurrency.open();
  const int gpuCount = deviceConcurrency.getNumGpus();

  // testCase = 0 refers to warmup kernel run
  int testCase = 0;
  // Warm-up kernel on all devices
  int warmed = 0;
  while (warmed < gpuCount) {
    deviceConcurrency.run(testCase, 1);
    ++warmed;
  }

  // Time for kernel on 1 device
  testCase += 1;
  deviceConcurrency.run(testCase, 1);

  // Time for kernel on all available devices
  testCase += 1;
  deviceConcurrency.run(testCase, gpuCount);

  passed();
}
+16 -1
Просмотреть файл
@@ -57,6 +57,15 @@ void matrixTransposeCPUReference(T* output, T* input, const unsigned int width)
}
}
// Per-type constants that the test adds to each element index when it
// generates its input data. The unsigned overloads return one more than the
// largest value of the corresponding signed type.
void getFactor(int& fact) {
  fact = 101;
}
void getFactor(unsigned int& fact) {
  fact = static_cast<unsigned int>(INT32_MAX) + 1u;
}
void getFactor(float& fact) {
  fact = 2.5f;
}
void getFactor(double& fact) {
  fact = 2.5;
}
void getFactor(long& fact) {
  fact = 202L;
}
void getFactor(unsigned long& fact) {
  fact = static_cast<unsigned long>(__LONG_MAX__) + 1ul;
}
void getFactor(long long& fact) {
  fact = 303LL;
}
void getFactor(unsigned long long& fact) {
  fact = static_cast<unsigned long long>(__LONG_LONG_MAX__) + 1ull;
}
template<typename T>
void runTest() {
T* Matrix;
@@ -77,8 +86,10 @@ void runTest() {
cpuTransposeMatrix = (T*)malloc(NUM * sizeof(T));
// initialize the input data
T factor;
getFactor(factor);
for (i = 0; i < NUM; i++) {
Matrix[i] = (T)i * 10l;
Matrix[i] = (T)i + factor;
}
// allocate the memory on the device side
@@ -124,7 +135,11 @@ void runTest() {
// Runs runTest<T>() once for each element type listed below, then reports
// success through passed().
int main() {
runTest<int>();
runTest<float>();
runTest<double>();
runTest<long>();
runTest<long long>();
runTest<unsigned int>();
runTest<unsigned long>();
runTest<unsigned long long>();
passed();
}
+61 -3
Просмотреть файл
@@ -47,13 +47,31 @@ __global__ void shflUpSum(T* a, int size) {
a[threadIdx.x] = val;
}
// Sums a[0..size) with __shfl_xor exchanges. Each thread starts from its own
// element and, for lane masks size/2, size/4, ..., 1, adds the value held by
// the thread whose index differs by that mask. Every thread writes its final
// value back to a[threadIdx.x].
template <typename T>
__global__ void shflXorSum(T* a, int size) {
  T acc = a[threadIdx.x];
  int laneMask = size/2;
  while (laneMask > 0) {
    acc += __shfl_xor(acc, laneMask, size);
    laneMask /= 2;
  }
  a[threadIdx.x] = acc;
}
// Per-type constants that the shuffle tests add to each element index when
// they generate their input data. The unsigned overloads return one more than
// the largest value of the corresponding signed type.
void getFactor(int& fact) {
  fact = 101;
}
void getFactor(unsigned int& fact) {
  fact = static_cast<unsigned int>(INT32_MAX) + 1u;
}
void getFactor(float& fact) {
  fact = 2.5f;
}
void getFactor(double& fact) {
  fact = 2.5;
}
void getFactor(long& fact) {
  fact = 202L;
}
void getFactor(unsigned long& fact) {
  fact = static_cast<unsigned long>(__LONG_MAX__) + 1ul;
}
void getFactor(long long& fact) {
  fact = 303LL;
}
void getFactor(unsigned long long& fact) {
  fact = static_cast<unsigned long long>(__LONG_LONG_MAX__) + 1ull;
}
template <typename T>
void runTestShflUp() {
const int size = 32;
T a[size];
T cpuSum = 0;
T factor; getFactor(factor);
for (int i = 0; i < size; i++) {
a[i] = i;
a[i] = i + factor;
cpuSum += a[i];
}
T* d_a;
@@ -73,8 +91,9 @@ void runTestShflDown() {
const int size = 32;
T a[size];
T cpuSum = 0;
T factor; getFactor(factor);
for (int i = 0; i < size; i++) {
a[i] = i;
a[i] = i + factor;
cpuSum += a[i];
}
T* d_a;
@@ -84,19 +103,58 @@ void runTestShflDown() {
hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault);
if (a[0] != cpuSum) {
hipFree(d_a);
failed("Shfl Up Sum did not match.");
failed("Shfl Down Sum did not match.");
}
hipFree(d_a);
}
// Host-side check for shflXorSum<T>: builds 32 values (index + per-type
// factor), sums them on the CPU, runs the kernel with one block of 32
// threads, and fails the test if element 0 of the result differs from the
// CPU sum.
template <typename T>
void runTestShflXor() {
  const int size = 32;
  const size_t numBytes = sizeof(T) * size;
  T a[size];
  T cpuSum = 0;
  T factor;
  getFactor(factor);
  for (int idx = 0; idx < size; ++idx) {
    a[idx] = idx + factor;
    cpuSum += a[idx];
  }

  T* d_a;
  hipMalloc(&d_a, numBytes);
  hipMemcpy(d_a, &a, numBytes, hipMemcpyDefault);
  hipLaunchKernelGGL(shflXorSum<T>, 1, size, 0, 0, d_a, size);
  hipMemcpy(&a, d_a, numBytes, hipMemcpyDefault);

  // Release the device buffer on both paths before reporting.
  const bool mismatch = (a[0] != cpuSum);
  hipFree(d_a);
  if (mismatch) {
    failed("Shfl Xor Sum did not match.");
  }
}
// Runs the up, down and xor shuffle tests for each element type listed below,
// then reports success through passed().
int main() {
runTestShflUp<int>();
runTestShflUp<float>();
runTestShflUp<double>();
runTestShflUp<long>();
runTestShflUp<long long>();
runTestShflUp<unsigned int>();
runTestShflUp<unsigned long>();
runTestShflUp<unsigned long long>();
runTestShflDown<int>();
runTestShflDown<float>();
runTestShflDown<double>();
runTestShflDown<long>();
runTestShflDown<long long>();
runTestShflDown<unsigned int>();
runTestShflDown<unsigned long>();
runTestShflDown<unsigned long long>();
runTestShflXor<int>();
runTestShflXor<float>();
runTestShflXor<double>();
runTestShflXor<long>();
runTestShflXor<long long>();
runTestShflXor<unsigned int>();
runTestShflXor<unsigned long>();
runTestShflXor<unsigned long long>();
passed();
}
Обычный файл → Исполняемый файл
+3
Просмотреть файл
@@ -395,6 +395,9 @@ int main(int argc, char* argv[]) {
if (gpuCount < 2) {
printf("P2P application requires atleast 2 gpu devices\n");
if (hip_skip_tests_enabled()) {
return hip_skip_retcode();
}
} else {
if (p_tests & 0x100) {
testPeerHostToDevice(false /*useAsyncCopy*/);
+280
Просмотреть файл
@@ -0,0 +1,280 @@
/*
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Test Description:
/*The general idea of the application is to test how Cooperative Groups kernel
launches work when launching too many warps to the target device. This test
first queries the nominal warp size of the target device. It then walks through
block sizes from 1 thread, 1 warp, 2 warps, ... `maximum_warps_in_a_block`. For
each of these, it queries the maximum number of blocks that can fit in each SM.
It then queries the number of SMs on the target device. This will yield a
calculation for the maximum number of blocks that can be co-scheduled on this
device.
The Cooperative Groups API says that users should not launch more than this
many warps (or blocks, etc.) to the target device. This test first tires to
launch 2x as many blcoks, to confirm that the runtime prevents such a launch
by returning a proper error value (`hipErrorCooperativeLaunchTooLarge`).
It then ensures that trying to launch too large of a kernel invocation does
not break the GPU by launching a kernel with exactly the maximum number of
blocks.
Finally, we run the same test for a block size that is larger than the maximum
allowed by the device, to ensure that this case is properly detected by the
runtime and that nothing breaks.*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
#include <hip/hip_runtime.h>
#include <hip/hip_cooperative_groups.h>
#include "test_common.h"
// Fails the test if `errval` is not hipSuccess, or if it disagrees with the
// value reported by hipGetLastError(). `file` and `line` identify the call
// site in the diagnostic output.
static inline void hipCheckAndFail(hipError_t errval,
                                   const char *file, int line) {
  // Must be read first: it is the runtime's own record of the last call.
  const hipError_t lastError = hipGetLastError();

  if (errval != hipSuccess) {
    std::cerr << "hip error: " << hipGetErrorString(errval) << std::endl;
    std::cerr << " Location: " << file << ":" << line << std::endl;
    failed("");
  }

  if (lastError == errval) {
    return;
  }
  std::cerr << "Error: the return value of a function was not the same "
            << "as the value returned by hipGetLastError()" << std::endl;
  std::cerr << " Location: " << file << ":" << line << std::endl;
  std::cerr << " Function returned: " << hipGetErrorString(errval)
            << " (" << errval << ")" << std::endl;
  std::cerr << "hipGetLastError() returned: " << hipGetErrorString(lastError)
            << " (" << lastError << ")" << std::endl;
  failed("");
}
// Calls hipCheckAndFail() with the file and line of the macro's use site.
#define hipCheckErr(errval) \
do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)
// Returns true when `errval` equals `expected_err` and also matches the value
// reported by hipGetLastError(). Otherwise it prints a diagnostic naming the
// call site (`file`:`line`) and returns false. It never aborts the test.
static inline bool hipCheckExpected(hipError_t errval,
                                    hipError_t expected_err, const char *file, int line) {
  // Must be read first: it is the runtime's own record of the last call.
  const hipError_t lastError = hipGetLastError();

  const bool asExpected = (errval == expected_err);
  if (!asExpected) {
    std::cerr << "hip error: " << hipGetErrorString(errval) << std::endl;
    std::cerr << " Location: " << file << ":" << line << std::endl;
    return false;
  }

  const bool consistent = (lastError == errval);
  if (!consistent) {
    std::cerr << "Error: the return value of a function was not the same "
              << "as the value returned by hipGetLastError()" << std::endl;
    std::cerr << " Location: " << file << ":" << line << std::endl;
    std::cerr << " Function returned: " << hipGetErrorString(errval)
              << " (" << errval << ")" << std::endl;
    std::cerr << "hipGetLastError() returned: " << hipGetErrorString(lastError)
              << " (" << lastError << ")" << std::endl;
    return false;
  }
  return true;
}
// Returns true only if device `device_id` reports cooperative-launch support
// through both the device attribute and the device properties. A message
// explaining which check failed is written to stderr otherwise.
static bool cooperative_groups_support(int device_id) {
  // Initialized so the value is defined even before the query fills it in.
  int cooperative_attribute = 0;
  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
                                 hipDeviceAttributeCooperativeLaunch, device_id));
  if (!cooperative_attribute) {
    std::cerr << "Cooperative launch support not available in ";
    std::cerr << "the device attribute for device " << device_id;
    std::cerr << std::endl;
    return false;
  }

  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  if (device_properties.cooperativeLaunch == 0) {
    std::cerr << "Cooperative group support not available in ";
    std::cerr << "device properties." << std::endl;
    return false;
  }

  return true;
}
// Minimal kernel used as the launch target: each thread adds the current
// clock64() value to its own element of `array`.
__global__ void test_kernel(long long *array) {
  const unsigned int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
  long long *slot = &array[globalIdx];
  *slot += clock64();
}
int main(int argc, char** argv) {
hipError_t err;
int device_num, FailFlag = 0;
// Alocate the host input buffer, and two device-focused buffers that we
// will use for our test.
unsigned int *dev_array[2];
HIPCHECK(hipGetDeviceCount(&device_num));
for (int dev = 0; dev < device_num; ++dev) {
/*************************************************************************/
/* Test whether target device supports cooperative groups ****************/
HIPCHECK(hipSetDevice(dev));
if (!cooperative_groups_support(dev)) {
std::cout << "Skipping the test with Pass result.\n";
passed();
}
/*************************************************************************/
/* Create the streams we will use in this test. **************************/
hipStream_t streams[2];
for (int i = 0; i < 2; i++) {
HIPCHECK(hipStreamCreate(&streams[i]));
}
/*************************************************************************/
/* We will try to launch more waves than the GPU can fit. ***************/
hipDeviceProp_t device_properties;
HIPCHECK(hipGetDeviceProperties(&device_properties, dev));
int warp_size = device_properties.warpSize;
int num_sms = device_properties.multiProcessorCount;
int max_num_threads = device_properties.maxThreadsPerBlock;
// Check single-thread block, all numbers of warps, then too-large block
for (int block_size = 0; block_size <= (max_num_threads + warp_size);
block_size += warp_size) {
if (block_size == 0) {
block_size = 1;
}
int max_blocks_per_sm;
// Calculate the device occupancy to know how many blocks can be run.
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
&max_blocks_per_sm, test_kernel, block_size, 0,
hipOccupancyDefault));
if ((block_size > max_num_threads) && (max_blocks_per_sm != 0)) {
std::cerr << "ERROR! Occupancy API indicated that we can have >0 ";
std::cerr << "blocks in a kernel when the block size is too large ";
std::cerr << "to work on the device." << std::endl;
std::cerr << "This is incorrect, and could possibly lead users ";
std::cerr << "to try to launch kernels that will fail." << std::endl;
//failed("");
FailFlag = 1;
break;
}
int desired_blocks = max_blocks_per_sm * num_sms;
bool expect_fail = false;
if (desired_blocks == 0) {
desired_blocks = 1;
expect_fail = true;
}
/**********************************************************************/
/* Set up data to pass into the kernel ********************************/
for (int i = 0; i < 2; i++) {
int test_size;
// Case where we expect to fail at launch.
if (i == 0) {
test_size = 2 * desired_blocks;
} else {
test_size = desired_blocks;
}
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]),
test_size * block_size * sizeof(long long)));
HIPCHECK(hipMemsetAsync(dev_array[i], 0,
test_size * block_size * sizeof(long long),
streams[i]));
}
HIPCHECK(hipDeviceSynchronize());
/***********************************************************************/
/* Launch the kernels **************************************************/
void *coop_params[2][1];
for (int i = 0; i < 2; i++) {
coop_params[i][0] = reinterpret_cast<void*>(&dev_array[i]);
}
err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
2 * desired_blocks, block_size,
coop_params[0], 0, streams[0]);
hipError_t expect_to_see;
if (expect_fail) {
expect_to_see = hipErrorInvalidConfiguration;
} else {
expect_to_see = hipErrorCooperativeLaunchTooLarge;
}
if (!hipCheckExpected(err, expect_to_see, __FILE__, __LINE__)) {
std::cerr << "ERROR! Tried to launch a cooperative kernel with ";
std::cerr << "too many warps." << std::endl;
std::cerr << "This SHOULD have failed with the error ";
std::cerr << hipGetErrorString(expect_to_see);
std::cerr << " (" << expect_to_see << ")." << std::endl;
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
std::cerr << " (" << err << ")" << std::endl;
FailFlag = 1;
break;
}
HIPCHECK(hipDeviceSynchronize());
err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
desired_blocks, block_size,
coop_params[1], 0, streams[1]);
if (expect_fail) {
expect_to_see = hipErrorInvalidConfiguration;
} else {
expect_to_see = hipSuccess;
}
if (!hipCheckExpected(err, expect_to_see, __FILE__, __LINE__)) {
std::cerr << "ERROR! Tried to launch a cooperative kernel ";
std::cerr << "with a normal size, but a block size of ";
std::cerr << desired_blocks << std::endl;
std::cerr << "This SHOULD have returned ";
std::cerr << hipGetErrorString(expect_to_see);
std::cerr << " (" << expect_to_see << ")." << std::endl;
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
std::cerr << " (" << err << ")" << std::endl;
FailFlag = 1;
break;
}
HIPCHECK(hipDeviceSynchronize());
if (block_size == 1) {
block_size = 0;
}
for (int m = 0; m < 2; ++m) {
HIPCHECK(hipFree(dev_array[m]));
}
}
for (int m = 0; m < 2; ++m) {
HIPCHECK(hipStreamDestroy(streams[m]));
}
if (FailFlag == 1) {
for (int m = 0; m < 2; ++m) {
HIPCHECK(hipFree(dev_array[m]));
}
failed("");
}
}
passed();
}
+283
Просмотреть файл
@@ -0,0 +1,283 @@
/*
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Test Description:
/*
The general idea of the application is to test how Cooperative Groups kernel
launches to a stream interact with other kernels being launched to different
streams.
For example: the HIP runtime will force cooperative kernel launches to run
serially, even if they are launched to different streams. However,
cooperative kernel launches can run in parallel with regular kernels that
are launched to other streams. This limitation is so that the cooperative
kernels do not conflict with one another for resources and potentially
deadlock the system.
As such, this benchmark tests three situations:
1. Launching a cooperative kernel by itself to stream[0]
2. Launching two cooperative kernels in parallel to stream[0] and stream[1]
3. Launching two cooperative kernels in parallel to stream[0] and stream[1]
and launching a third non-cooperative kernel to stream[2]
We time how long it takes to run each of these benchmarks and print it as
the output of the benchmark. The kernels themselves are just useless time-
wasting code so that the kernel takes a meaningful amount of time on the
GPU before it exits. We only launch a single wavefront for each kernel, so
any serialization should not be because of GPU occupancy concerns.
If test #2 takes roughly twice as long as #1, that implies that cooperative
kernels are properly serialized with each other by the runtime.
If test #3 takes the same amount of time as test #2, that implies that
regular kernels can properly run in parallel with cooperative kernels.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
* TEST: %t
* HIT_END
*/
#include <chrono>
#include <hip/hip_runtime.h>
#include <hip/hip_cooperative_groups.h>
#include "test_common.h"
// Checks the outcome of a HIP call and reports a test failure through
// failed("") in two situations:
//   1. `errval` is anything other than hipSuccess, or
//   2. `errval` differs from the value returned by hipGetLastError().
//
// errval - return value of the HIP call being checked
// file   - source file of the call site (normally __FILE__)
// line   - source line of the call site (normally __LINE__)
static inline void hipCheckAndFail(hipError_t errval,
                                   const char *file, int line) {
  // Read the runtime's last-error value before doing any other work.
  const hipError_t recorded = hipGetLastError();

  if (errval != hipSuccess) {
    std::cerr << "hip error: " << hipGetErrorString(errval) << std::endl
              << "Location: " << file << ":" << line << std::endl;
    failed("");
  }

  if (recorded != errval) {
    std::cerr << "Error: the return value of a function was not the same "
              << "as the value returned by hipGetLastError()" << std::endl
              << "Location: " << file << ":" << line << std::endl
              << "Function returned: " << hipGetErrorString(errval)
              << " (" << errval << ")" << std::endl
              << "hipGetLastError() returned: " << hipGetErrorString(recorded)
              << " (" << recorded << ")" << std::endl;
    failed("");
  }
}

// Convenience wrapper that supplies the call site's file and line.
#define hipCheckErr(errval) \
  do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)
// Reports whether device `device_id` advertises cooperative-launch support.
//
// Two sources are consulted, in order: the
// hipDeviceAttributeCooperativeLaunch device attribute, and the
// cooperativeLaunch field of hipDeviceProp_t. A diagnostic is written to
// std::cerr for the first check that reports no support.
//
// Returns 1 when both sources report support, 0 otherwise.
static int cooperative_groups_support(int device_id) {
  // Initialised so the check below never reads an indeterminate value.
  int cooperative_attribute = 0;
  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
                                 hipDeviceAttributeCooperativeLaunch,
                                 device_id));
  if (!cooperative_attribute) {
    std::cerr << "Cooperative launch support not available in ";
    std::cerr << "the device attribute for device " << device_id;
    std::cerr << std::endl;
    return 0;
  }

  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  if (device_properties.cooperativeLaunch == 0) {
    std::cerr << "Cooperative group support not available in ";
    std::cerr << "device properties." << std::endl;
    return 0;
  }
  return 1;
}
// Time-wasting kernel: each thread repeatedly spins on the device clock and
// then adds a clock sample into its own slot of `array`.
//
// loops - number of spin/accumulate iterations performed by each thread
// array - accumulator indexed by blockIdx.x * blockDim.x + threadIdx.x;
//         assumes the buffer has one element per launched thread
//         (TODO confirm against the launch site)
__global__ void test_kernel(uint32_t loops, unsigned long long *array) {
  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
  // The index uses the same unsigned type as `loops`, so the comparison does
  // not mix signed and unsigned operands.
  for (uint32_t i = 0; i < loops; i++) {
    long long start_clock = clock64();
    // Spin until the clock value has advanced by 1000000 from the sample.
    while (clock64() < (start_clock + 1000000)) {}
    array[rank] += clock64();
  }
}
// Test driver. For every HIP device it times three scenarios:
//   1. one cooperative kernel on streams[0];
//   2. two cooperative kernels on streams[0] and streams[1];
//   3. the two cooperative kernels plus a regular kernel on streams[2].
// The test fails when scenario 2 takes less than 1.8x scenario 1, or when
// scenario 3 takes more than 1.1x scenario 2.
//
// argc/argv are accepted but not read.
int main(int argc, char** argv) {
  hipError_t err;
  /*************************************************************************/
  int device_num = 0, loops = 1000, FailFlag = 0;
  /* Streams used by this test, one per kernel launched in scenario 3. *****/
  hipStream_t streams[3];
  // One device buffer per stream; each kernel accumulates into its own.
  unsigned long long *dev_array[3];
  HIPCHECK(hipGetDeviceCount(&device_num));
  for (int dev = 0; dev < device_num; ++dev) {
    /***********************************************************************/
    /* Test whether target device supports cooperative groups **************/
    HIPCHECK(hipSetDevice(dev));
    if (!cooperative_groups_support(dev)) {
      std::cout << "Skipping the test with Pass result.\n";
      passed();
    }
    /***********************************************************************/
    /* Query the device; a single block of warp_size threads is launched ***/
    hipDeviceProp_t device_properties;
    HIPCHECK(hipGetDeviceProperties(&device_properties, dev));
    int warp_size = device_properties.warpSize;
    int num_sms = device_properties.multiProcessorCount;
    int desired_blocks = 1;
    std::cout << "Device: " << dev << std::endl;
    std::cout << "Device name: " << device_properties.name << std::endl;
    int max_blocks_per_sm;
    // Calculate the device occupancy to know how many blocks can be run.
    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
                                                          test_kernel,
                                                          warp_size, 0));
    if (desired_blocks > max_blocks_per_sm * num_sms) {
      std::cerr << "The requested number of blocks will not fit on the GPU";
      std::cerr << std::endl;
      std::cerr << "You requested " << desired_blocks << " but we can only ";
      std::cerr << "fit " << (max_blocks_per_sm * num_sms) << std::endl;
      failed("");
    }
    /***********************************************************************/
    for (int i = 0; i < 3; i++) {
      HIPCHECK(hipStreamCreate(&streams[i]));
    }
    /***********************************************************************/
    /* Set up data to pass into the kernel *********************************/
    for (int i = 0; i < 3; i++) {
      HIPCHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]),
                         warp_size * sizeof(long long)));
      HIPCHECK(hipMemsetAsync(dev_array[i], 0, warp_size * sizeof(long long),
                              streams[i]));
    }
    HIPCHECK(hipDeviceSynchronize());
    /***********************************************************************/
    /* Launch the kernels **************************************************/
    // Argument arrays for hipLaunchCooperativeKernel: {&loops, &buffer}.
    void *coop_params[3][2];
    for (int i = 0; i < 3; i++) {
      coop_params[i][0] = reinterpret_cast<void*>(&loops);
      coop_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
    }
    // Intervals are measured with steady_clock: it is monotonic, so a
    // wall-clock adjustment during the run cannot distort the ratios below.
    std::cout << "Launching a single cooperative kernel..." << std::endl;
    auto single_start = std::chrono::steady_clock::now();
    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
                                        desired_blocks, warp_size,
                                        coop_params[0], 0, streams[0]));
    HIPCHECK(hipDeviceSynchronize());
    auto single_end = std::chrono::steady_clock::now();

    std::cout << "Launching 2 cooperative kernels to different streams...";
    std::cout << std::endl;
    auto double_start = std::chrono::steady_clock::now();
    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
                                        desired_blocks, warp_size,
                                        coop_params[0], 0, streams[0]));
    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
                                        desired_blocks, warp_size,
                                        coop_params[1], 0, streams[1]));
    HIPCHECK(hipDeviceSynchronize());
    auto double_end = std::chrono::steady_clock::now();

    std::cout << "Launching 2 cooperative kernels and 1 normal kernel...";
    std::cout << std::endl;
    auto triple_start = std::chrono::steady_clock::now();
    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
                                        desired_blocks, warp_size,
                                        coop_params[0], 0, streams[0]));
    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
                                        desired_blocks, warp_size,
                                        coop_params[1], 0, streams[1]));
    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size),
                       0, streams[2], loops, dev_array[2]);
    // The regular launch returns no status, so query it explicitly.
    err = hipGetLastError();
    hipCheckErr(err);
    HIPCHECK(hipDeviceSynchronize());
    auto triple_end = std::chrono::steady_clock::now();

    std::chrono::duration<double> single_kernel_time =
        (single_end - single_start);
    std::chrono::duration<double> double_kernel_time =
        (double_end - double_start);
    std::chrono::duration<double> triple_kernel_time =
        (triple_end - triple_start);
    std::cout << "A single kernel took:" << std::endl;
    std::cout << " " << single_kernel_time.count();
    std::cout << " seconds" << std::endl;
    std::cout << std::endl;
    std::cout << "Two cooperative kernels that could run together took:";
    std::cout << std::endl;
    std::cout << " " << double_kernel_time.count();
    std::cout << " seconds" << std::endl;
    std::cout << std::endl;
    std::cout << "Two coop kernels and a third regular kernel took:";
    std::cout << std::endl << " ";
    std::cout << triple_kernel_time.count();
    std::cout << " seconds" << std::endl;
    std::cout << "Testing whether these times make sense.." << std::endl;

    // Only the first failing check is reported, as before.
    if (double_kernel_time < 1.8 * single_kernel_time) {
      // Two cooperative kernels should take roughly twice as long as one.
      std::cerr << "ERROR!" << std::endl;
      std::cerr << "Two cooperative kernels launched at the same ";
      std::cerr << "time did not take roughly twice as long as a single ";
      std::cerr << "cooperative kernel." << std::endl;
      std::cerr << "Were they truly serialized?" << std::endl;
      FailFlag = 1;
    } else if (triple_kernel_time > 1.1 * double_kernel_time) {
      // Three kernels together should take roughly as long as the two
      // cooperative kernels.
      std::cerr << "ERROR!" << std::endl;
      std::cerr << "Launching a normal kernel in parallel with two ";
      std::cerr << "back-to-back cooperative kernels still ended up taking ";
      std::cerr << "more than 10% longer than the two cooperative kernels ";
      std::cerr << "alone." << std::endl;
      std::cerr << "Is the normal kernel being serialized with the ";
      std::cerr << "cooperative kernels on different streams?" << std::endl;
      FailFlag = 1;
    }

    // Release this device's resources on both the success and failure paths.
    for (int k = 0; k < 3; ++k) {
      HIPCHECK(hipFree(dev_array[k]));
      HIPCHECK(hipStreamDestroy(streams[k]));
    }
    if (FailFlag == 1) {
      break;
    }
  }
  if (FailFlag == 1) {
    failed("");
  }
  passed();
}
+303
Просмотреть файл
@@ -0,0 +1,303 @@
/*
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Test Description:
/*The general idea of the application is to create a buffer of width N. N is
set by the `width` variable in main() (it is not read from the command line),
and two buffers of N unsigned integers must fit onto the target GPU at the
same time.
We then launch a fixed number of warps to the GPU. This number is calculated
to fill the GPU with as many warps as can simultaneously run on the GPU.
The threads in these warps then walk over two arrays. First, values from
A[offset] are added into B[offset]. After all of A is added into all of B
in this element-wise manner, all of the waves barrier with one another.
After the barrier, the waves start adding values from B[mirror_offset] into
A[offset]. Mirror offset means that the thread that is writing into A[7] is
reading from B[N - 1 - 7], i.e. the element 7 before the last value. That
element was probably written by a different thread before the barrier.
After going through this loop a certain number of times, the kernel ends and
we read the arrays back out and recalculate this algorithm serially on the
CPU. We compare the serial version to the version that has inter-thread data
sharing and barriers and ensure they result in the same answer.
If they do have the same answer, then we can pretty confidently say that
writing from thread X and then hitting a barrier allows thread Y to see the
values.*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
#include <hip/hip_runtime.h>
#include <hip/hip_cooperative_groups.h>
#include "test_common.h"
// Checks the outcome of a HIP call.
//   - If `errval` is not hipSuccess, prints the error and calls exit(errval).
//   - Otherwise, if `errval` differs from the value hipGetLastError()
//     returned, prints both values and calls failed("").
//
// errval - return value of the HIP call being checked
// file   - source file of the call site (normally __FILE__)
// line   - source line of the call site (normally __LINE__)
static inline void hipCheckAndFail(hipError_t errval,
                                   const char *file, int line) {
  // Read the runtime's last-error value before doing any other work.
  const hipError_t recorded = hipGetLastError();

  if (errval != hipSuccess) {
    std::cerr << "hip error: " << hipGetErrorString(errval) << std::endl
              << " Location: " << file << ":" << line << std::endl;
    exit(errval);
  }

  if (recorded != errval) {
    std::cerr << "Error: the return value of a function was not the same "
              << "as the value returned by hipGetLastError()" << std::endl
              << " Location: " << file << ":" << line << std::endl
              << " Function returned: " << hipGetErrorString(errval)
              << " (" << errval << ")" << std::endl
              << "hipGetLastError() returned: " << hipGetErrorString(recorded)
              << " (" << recorded << ")" << std::endl;
    failed("");
  }
}

// Convenience wrapper that supplies the call site's file and line.
#define hipCheckErr(errval)\
  do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)
// Reports whether device `device_id` advertises cooperative-launch support.
//
// Two sources are consulted, in order: the
// hipDeviceAttributeCooperativeLaunch device attribute, and the
// cooperativeLaunch field of hipDeviceProp_t. A diagnostic is written to
// std::cerr for the first check that reports no support.
//
// Returns 1 when both sources report support, 0 otherwise.
static int cooperative_groups_support(int device_id) {
  // Initialised so the check below never reads an indeterminate value.
  int cooperative_attribute = 0;
  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
                                 hipDeviceAttributeCooperativeLaunch,
                                 device_id));
  if (!cooperative_attribute) {
    std::cerr << "Cooperative launch support not available in ";
    std::cerr << "the device attribute for device " << device_id;
    std::cerr << std::endl;
    return 0;
  }

  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  if (device_properties.cooperativeLaunch == 0) {
    std::cerr << "Cooperative group support not available in ";
    std::cerr << "device properties." << std::endl;
    return 0;
  }
  return 1;
}
// Recomputes the kernel's algorithm serially on the host and compares the
// result with the arrays read back from the GPU.
//
// loops        - number of add/mirror-add rounds to replay
// host_input   - initial contents of the first array; updated IN PLACE and
//                holds the host-computed first array on return
// first_array  - first array as read back from the GPU
// second_array - second array as read back from the GPU
// array_len    - number of elements in each of the arrays above
//
// Returns 0 when both GPU arrays match the host computation. Returns -1 on
// the first mismatch (details go to std::cerr) or when the host scratch
// buffer cannot be allocated. The scratch buffer is released on every path.
static int verify_coop_arrays(unsigned int loops, unsigned int *host_input,
                              unsigned int *first_array,
                              unsigned int *second_array,
                              unsigned int array_len) {
  unsigned int *host_first_array = host_input;
  // Zero-initialised reference copy of the second array.
  unsigned int *host_second_array =
      (unsigned int*)calloc(array_len, sizeof(unsigned int));
  // calloc may legitimately return NULL for a zero-length request.
  if (!host_second_array && array_len != 0) {
    std::cerr << "Unable to allocate the host reference buffer." << std::endl;
    return -1;
  }

  for (unsigned int i = 0; i < loops; i++) {
    // Element-wise add of the first array into the second.
    for (unsigned int offset = 0; offset < array_len; offset++) {
      host_second_array[offset] += host_first_array[offset];
    }
    // Add the mirrored element of the second array back into the first.
    for (unsigned int offset = 0; offset < array_len; offset++) {
      unsigned int swizzle_offset = array_len - offset - 1;
      host_first_array[offset] += host_second_array[swizzle_offset];
    }
  }

  int result = 0;
  for (unsigned int i = 0; i < array_len; i++) {
    if (host_first_array[i] != first_array[i]) {
      std::cerr << "Test failure!" << std::endl;
      std::cerr << " host_first_array[" << i << "] contains the ";
      std::cerr << "value " << host_first_array[i] << std::endl;
      std::cerr << " GPU first_array[" << i << "] contains the ";
      std::cerr << "value " << first_array[i] << std::endl;
      result = -1;
      break;
    }
    if (host_second_array[i] != second_array[i]) {
      std::cerr << "Test failure!" << std::endl;
      std::cerr << " host_second_array[" << i << "] contains the ";
      std::cerr << "value " << host_second_array[i] << std::endl;
      std::cerr << " GPU second_array[" << i << "] contains the ";
      std::cerr << "value " << second_array[i] << std::endl;
      result = -1;
      break;
    }
  }

  if (result == 0) {
    std::cout << "Coop test appears to work properly!" << std::endl;
  }
  // Single exit point: the scratch buffer is freed on mismatch as well.
  free(host_second_array);
  return result;
}
// Cooperative kernel that ping-pongs data between two arrays, with a
// grid-wide sync after each phase.
//
// first_array  - array A, `array_len` elements
// second_array - array B, `array_len` elements
// loops        - number of two-phase rounds to run
// array_len    - number of elements in each array
//
// Each thread starts at its rank within the grid and advances by the total
// number of threads in the grid until it runs past `array_len`.
__global__ void
coop_kernel(unsigned int *first_array, unsigned int *second_array,
            unsigned int loops, unsigned int array_len) {
  namespace cg = cooperative_groups;
  cg::grid_group whole_grid = cg::this_grid();
  const unsigned int my_rank = whole_grid.thread_rank();
  const unsigned int stride = whole_grid.size();

  for (int round = 0; round < loops; ++round) {
    // Phase 1: B[idx] += A[idx] for every element this thread owns.
    int idx = my_rank;
    while (idx < array_len) {
      second_array[idx] += first_array[idx];
      idx += stride;
    }
    whole_grid.sync();

    // Phase 2: A[idx] += B[mirror of idx]. The mirrored element is
    // array_len - idx - 1, so values cross between threads.
    idx = my_rank;
    while (idx < array_len) {
      const unsigned int mirror = array_len - idx - 1;
      first_array[idx] += second_array[mirror];
      idx += stride;
    }
    whole_grid.sync();
  }
}
// Test driver. For every HIP device it fills array A with 0..width-1, runs
// coop_kernel over A and a zeroed array B, reads both back, and compares
// them with a serial host replay (verify_coop_arrays). The test passes only
// if every device verifies correctly.
//
// argc/argv are accepted but not read.
int main(int argc, char** argv) {
  /*************************************************************************/
  /* Test parameters (fixed; nothing is read from the command line) ********/
  int device_num = 0, loops = 2, width = 4096, flag = 0;
  HIPCHECK(hipGetDeviceCount(&device_num));
  for (int dev = 0; dev < device_num; ++dev) {
    std::cout << "Device number: " << dev << std::endl;
    std::cout << "Loops: " << loops << std::endl;
    std::cout << "Width: " << width << std::endl;
    /***********************************************************************/
    /* Test whether target device supports cooperative groups **************/
    HIPCHECK(hipSetDevice(dev));
    if (!cooperative_groups_support(dev)) {
      std::cout << "Skipping the test with Pass result.\n";
      passed();
    }
    /***********************************************************************/
    /* We will launch enough waves to fill up all of the GPU ***************/
    hipDeviceProp_t device_properties;
    HIPCHECK(hipGetDeviceProperties(&device_properties, dev));
    int warp_size = device_properties.warpSize;
    int num_sms = device_properties.multiProcessorCount;
    std::cout << "Device name: " << device_properties.name << std::endl;
    std::cout << std::endl;
    // Calculate the device occupancy to know how many blocks can be run.
    int max_blocks_per_sm;
    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
                                                          coop_kernel,
                                                          warp_size, 0));
    int total_blocks = max_blocks_per_sm * num_sms;
    /***********************************************************************/
    /* Create the streams we will use in this test. ************************/
    hipStream_t streams[2];
    for (int i = 0; i < 2; i++) {
      HIPCHECK(hipStreamCreate(&streams[i]));
    }
    /***********************************************************************/
    /* Set up data to pass into the kernel *********************************/
    // Allocate the host input buffer, and two device buffers that we will
    // use for our test.
    unsigned int *input_buffer =
        (unsigned int*)calloc(width, sizeof(unsigned int));
    if (!input_buffer) {
      std::cerr << "Unable to allocate the host input buffer." << std::endl;
      failed("");
    }
    for (int i = 0; i < width; i++) {
      input_buffer[i] = i;
    }
    unsigned int *first_dev_array;
    HIPCHECK(hipMalloc(reinterpret_cast<void**>(&first_dev_array),
                       width * sizeof(unsigned int)));
    HIPCHECK(hipMemcpyAsync(first_dev_array, input_buffer,
                            width * sizeof(unsigned int),
                            hipMemcpyHostToDevice, streams[0]));
    unsigned int *second_dev_array;
    HIPCHECK(hipMalloc(reinterpret_cast<void**>(&second_dev_array),
                       width * sizeof(unsigned int)));
    HIPCHECK(hipMemsetAsync(second_dev_array, 0, width * sizeof(unsigned int),
                            streams[0]));
    /***********************************************************************/
    /* Launch the kernels **************************************************/
    std::cout << "Launching a cooperative kernel with " << total_blocks;
    std::cout << " thread blocks, each with " << warp_size << " threads";
    std::cout << std::endl;
    // Kernel arguments, in coop_kernel's parameter order.
    void *coop_params[4];
    coop_params[0] = reinterpret_cast<void*>(&first_dev_array);
    coop_params[1] = reinterpret_cast<void*>(&second_dev_array);
    coop_params[2] = reinterpret_cast<void*>(&loops);
    coop_params[3] = reinterpret_cast<void*>(&width);
    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(coop_kernel),
                                        total_blocks, warp_size, coop_params,
                                        0, streams[0]));
    /***********************************************************************/
    /* Read back the buffers and print out their data **********************/
    unsigned int *first_array =
        (unsigned int*)calloc(width, sizeof(unsigned int));
    unsigned int *second_array =
        (unsigned int*)calloc(width, sizeof(unsigned int));
    if (!first_array || !second_array) {
      std::cerr << "Unable to allocate the host read-back buffers.";
      std::cerr << std::endl;
      failed("");
    }
    HIPCHECK(hipMemcpyAsync(first_array, first_dev_array,
                            width * sizeof(unsigned int),
                            hipMemcpyDeviceToHost, streams[0]));
    HIPCHECK(hipMemcpyAsync(second_array, second_dev_array,
                            width * sizeof(unsigned int),
                            hipMemcpyDeviceToHost, streams[0]));
    std::cout << "Waiting for cooperative work to finish..." << std::endl;
    std::cout << std::flush;
    HIPCHECK(hipStreamSynchronize(streams[0]));
    int ret_val = 0;
    std::cout << "Attemping to verify buffers." << std::endl;
    std::cout << std::flush;
    // input_buffer is passed as the host-side working copy.
    ret_val = verify_coop_arrays(loops, input_buffer, first_array,
                                 second_array, width);
    if (!ret_val) {
      std::cout << "It appears that inter-thread data sharing at ";
      std::cout << "grid_group sync points works properly!" << std::endl;
    } else {
      // Remember the failure but keep testing the remaining devices.
      flag = 1;
    }
    for (int k = 0; k < 2; ++k) {
      HIPCHECK(hipStreamDestroy(streams[k]));
    }
    HIPCHECK(hipFree(first_dev_array));
    HIPCHECK(hipFree(second_dev_array));
    free(input_buffer);
    free(first_array);
    free(second_array);
  }
  if (!flag) {
    passed();
  } else {
    failed("");
  }
}
+6 -2
Просмотреть файл
@@ -22,7 +22,7 @@ THE SOFTWARE.
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
@@ -139,7 +139,11 @@ int main()
if (!deviceProperties.cooperativeLaunch) {
std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
passed();
if (hip_skip_tests_enabled()) {
return hip_skip_retcode();
} else {
passed();
}
return 0;
}
+6 -2
Просмотреть файл
@@ -22,7 +22,7 @@ THE SOFTWARE.
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
@@ -139,7 +139,11 @@ int main()
if (!deviceProperties.cooperativeLaunch) {
std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
passed();
if (hip_skip_tests_enabled()) {
return hip_skip_retcode();
} else {
passed();
}
return 0;
}
+6 -2
Просмотреть файл
@@ -22,7 +22,7 @@ THE SOFTWARE.
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
@@ -139,7 +139,11 @@ int main()
if (!deviceProperties.cooperativeLaunch) {
std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
passed();
if (hip_skip_tests_enabled()) {
return hip_skip_retcode();
} else {
passed();
}
return 0;
}
+18 -4
Просмотреть файл
@@ -22,7 +22,7 @@ THE SOFTWARE.
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
@@ -34,6 +34,8 @@ THE SOFTWARE.
#include <climits>
#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
#define ASSERT_LE(lhs, rhs) assert(lhs <= rhs)
#define ASSERT_GE(lhs, rhs) assert(lhs >= rhs)
using namespace cooperative_groups;
@@ -193,15 +195,27 @@ static void test_cg_multi_grid_group_type(int blockSize)
}
// Validate results
int gridsSeen[MaxGPUs];
for (int i = 0; i < nGpu; ++i) {
for (int j = 0; j < 2 * blockSize; ++j) {
//ASSERT_EQUAL(numGridsTestH[i][j], nGpu);
//ASSERT_EQUAL(gridRankTestH[i][j], i);
ASSERT_EQUAL(numGridsTestH[i][j], nGpu);
ASSERT_GE(gridRankTestH[i][j], 0);
ASSERT_LE(gridRankTestH[i][j], nGpu-1);
ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
ASSERT_EQUAL(thdRankTestH[i][j], (i * 2 * blockSize) + j);
int gridRank = gridRankTestH[i][j];
ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
ASSERT_EQUAL(isValidTestH[i][j], 1);
}
ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize);
// Validate uniqueness property of grid rank
gridsSeen[i] = gridRankTestH[i][0];
for (int k = 0; k < i; ++k) {
if (gridsSeen[k] == gridsSeen[i]) {
assert (false && "Grid rank in multi-gpu setup should be unique");
}
}
}
ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
@@ -22,7 +22,7 @@ THE SOFTWARE.
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
@@ -34,11 +34,14 @@ THE SOFTWARE.
#include <climits>
#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
#define ASSERT_LE(lhs, rhs) assert(lhs <= rhs)
#define ASSERT_GE(lhs, rhs) assert(lhs >= rhs)
using namespace cooperative_groups;
static __global__
void kernel_cg_multi_grid_group_type_via_base_type(int *sizeTestD,
int* gridRankTestD,
int *thdRankTestD,
int *isValidTestD,
int *syncTestD,
@@ -51,6 +54,7 @@ void kernel_cg_multi_grid_group_type_via_base_type(int *sizeTestD,
sizeTestD[gIdx] = tg.size();
// Test thread_rank
gridRankTestD[gIdx] = this_multi_grid().grid_rank();
thdRankTestD[gIdx] = tg.thread_rank();
// Test is_valid
@@ -110,6 +114,7 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
// Allocate host and device memory
int nBytes = sizeof(int) * 2 * blockSize;
int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
int *syncTestD[MaxGPUs], *syncResultD;
@@ -117,11 +122,13 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
ASSERT_EQUAL(hipSetDevice(i), hipSuccess);
ASSERT_EQUAL(hipMalloc(&sizeTestD[i], nBytes), hipSuccess);
ASSERT_EQUAL(hipMalloc(&gridRankTestD[i], nBytes), hipSuccess);
ASSERT_EQUAL(hipMalloc(&thdRankTestD[i], nBytes), hipSuccess);
ASSERT_EQUAL(hipMalloc(&isValidTestD[i], nBytes), hipSuccess);
ASSERT_EQUAL(hipMalloc(&syncTestD[i], nBytes), hipSuccess);
ASSERT_EQUAL(hipHostMalloc(&sizeTestH[i], nBytes), hipSuccess);
ASSERT_EQUAL(hipHostMalloc(&gridRankTestH[i], nBytes), hipSuccess);
ASSERT_EQUAL(hipHostMalloc(&thdRankTestH[i], nBytes), hipSuccess);
ASSERT_EQUAL(hipHostMalloc(&isValidTestH[i], nBytes), hipSuccess);
@@ -135,17 +142,18 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
}
// Launch Kernel
constexpr int NumKernelArgs = 5;
constexpr int NumKernelArgs = 6;
hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
void* args[MaxGPUs * NumKernelArgs];
for (int i = 0; i < nGpu; i++) {
ASSERT_EQUAL(hipSetDevice(i), hipSuccess);
args[i * NumKernelArgs ] = &sizeTestD[i];
args[i * NumKernelArgs + 1] = &thdRankTestD[i];
args[i * NumKernelArgs + 2] = &isValidTestD[i];
args[i * NumKernelArgs + 3] = &syncTestD[i];
args[i * NumKernelArgs + 4] = &syncResultD;
args[i * NumKernelArgs + 1] = &gridRankTestD[i];
args[i * NumKernelArgs + 2] = &thdRankTestD[i];
args[i * NumKernelArgs + 3] = &isValidTestD[i];
args[i * NumKernelArgs + 4] = &syncTestD[i];
args[i * NumKernelArgs + 5] = &syncResultD;
launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_base_type);
launchParamsList[i].gridDim = 2;
@@ -164,6 +172,8 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
ASSERT_EQUAL(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost),
hipSuccess);
ASSERT_EQUAL(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost),
hipSuccess);
ASSERT_EQUAL(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost),
hipSuccess);
ASSERT_EQUAL(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost),
@@ -173,13 +183,26 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
}
// Validate results
int gridsSeen[MaxGPUs];
for (int i = 0; i < nGpu; ++i) {
for (int j = 0; j < 2 * blockSize; ++j) {
ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
ASSERT_EQUAL(thdRankTestH[i][j], (i * 2 * blockSize) + j);
ASSERT_GE(gridRankTestH[i][j], 0);
ASSERT_LE(gridRankTestH[i][j], nGpu-1);
ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
int gridRank = gridRankTestH[i][j];
ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
ASSERT_EQUAL(isValidTestH[i][j], 1);
}
ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize);
// Validate uniqueness property of grid rank
gridsSeen[i] = gridRankTestH[i][0];
for (int k = 0; k < i; ++k) {
if (gridsSeen[k] == gridsSeen[i]) {
assert (false && "Grid rank in multi-gpu setup should be unique");
}
}
}
ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
@@ -189,6 +212,7 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
ASSERT_EQUAL(hipSetDevice(i), hipSuccess);
ASSERT_EQUAL(hipFree(sizeTestD[i]), hipSuccess);
ASSERT_EQUAL(hipFree(gridRankTestD[i]), hipSuccess);
ASSERT_EQUAL(hipFree(thdRankTestD[i]), hipSuccess);
ASSERT_EQUAL(hipFree(isValidTestD[i]), hipSuccess);
ASSERT_EQUAL(hipFree(syncTestD[i]), hipSuccess);
@@ -197,6 +221,7 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
ASSERT_EQUAL(hipFree(syncResultD), hipSuccess);
ASSERT_EQUAL(hipHostFree(sizeTestH[i]), hipSuccess);
ASSERT_EQUAL(hipHostFree(gridRankTestH[i]), hipSuccess);
ASSERT_EQUAL(hipHostFree(thdRankTestH[i]), hipSuccess);
ASSERT_EQUAL(hipHostFree(isValidTestH[i]), hipSuccess);
@@ -22,7 +22,7 @@ THE SOFTWARE.
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
@@ -34,11 +34,14 @@ THE SOFTWARE.
#include <climits>
#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
#define ASSERT_LE(lhs, rhs) assert(lhs <= rhs)
#define ASSERT_GE(lhs, rhs) assert(lhs >= rhs)
using namespace cooperative_groups;
static __global__
void kernel_cg_multi_grid_group_type_via_public_api(int *sizeTestD,
int* gridRankTestD,
int *thdRankTestD,
int *isValidTestD,
int *syncTestD,
@@ -51,6 +54,7 @@ void kernel_cg_multi_grid_group_type_via_public_api(int *sizeTestD,
sizeTestD[gIdx] = group_size(mg);
// Test thread_rank api
gridRankTestD[gIdx] = this_multi_grid().grid_rank();
thdRankTestD[gIdx] = thread_rank(mg);
// Test is_valid api
@@ -110,6 +114,7 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
// Allocate host and device memory
int nBytes = sizeof(int) * 2 * blockSize;
int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
int *syncTestD[MaxGPUs], *syncResultD;
@@ -117,11 +122,13 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
ASSERT_EQUAL(hipSetDevice(i), hipSuccess);
ASSERT_EQUAL(hipMalloc(&sizeTestD[i], nBytes), hipSuccess);
ASSERT_EQUAL(hipMalloc(&gridRankTestD[i], nBytes), hipSuccess);
ASSERT_EQUAL(hipMalloc(&thdRankTestD[i], nBytes), hipSuccess);
ASSERT_EQUAL(hipMalloc(&isValidTestD[i], nBytes), hipSuccess);
ASSERT_EQUAL(hipMalloc(&syncTestD[i], nBytes), hipSuccess);
ASSERT_EQUAL(hipHostMalloc(&sizeTestH[i], nBytes), hipSuccess);
ASSERT_EQUAL(hipHostMalloc(&gridRankTestH[i], nBytes), hipSuccess);
ASSERT_EQUAL(hipHostMalloc(&thdRankTestH[i], nBytes), hipSuccess);
ASSERT_EQUAL(hipHostMalloc(&isValidTestH[i], nBytes), hipSuccess);
@@ -135,17 +142,18 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
}
// Launch Kernel
constexpr int NumKernelArgs = 5;
constexpr int NumKernelArgs = 6;
hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
void* args[MaxGPUs * NumKernelArgs];
for (int i = 0; i < nGpu; i++) {
ASSERT_EQUAL(hipSetDevice(i), hipSuccess);
args[i * NumKernelArgs ] = &sizeTestD[i];
args[i * NumKernelArgs + 1] = &thdRankTestD[i];
args[i * NumKernelArgs + 2] = &isValidTestD[i];
args[i * NumKernelArgs + 3] = &syncTestD[i];
args[i * NumKernelArgs + 4] = &syncResultD;
args[i * NumKernelArgs + 1] = &gridRankTestD[i];
args[i * NumKernelArgs + 2] = &thdRankTestD[i];
args[i * NumKernelArgs + 3] = &isValidTestD[i];
args[i * NumKernelArgs + 4] = &syncTestD[i];
args[i * NumKernelArgs + 5] = &syncResultD;
launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_public_api);
launchParamsList[i].gridDim = 2;
@@ -164,6 +172,8 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
ASSERT_EQUAL(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost),
hipSuccess);
ASSERT_EQUAL(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost),
hipSuccess);
ASSERT_EQUAL(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost),
hipSuccess);
ASSERT_EQUAL(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost),
@@ -173,13 +183,26 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
}
// Validate results
int gridsSeen[MaxGPUs];
for (int i = 0; i < nGpu; ++i) {
for (int j = 0; j < 2 * blockSize; ++j) {
ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
ASSERT_EQUAL(thdRankTestH[i][j], (i * 2 * blockSize) + j);
ASSERT_GE(gridRankTestH[i][j], 0);
ASSERT_LE(gridRankTestH[i][j], nGpu-1);
ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
int gridRank = gridRankTestH[i][j];
ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
ASSERT_EQUAL(isValidTestH[i][j], 1);
}
ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize);
// Validate uniqueness property of grid rank
gridsSeen[i] = gridRankTestH[i][0];
for (int k = 0; k < i; ++k) {
if (gridsSeen[k] == gridsSeen[i]) {
assert (false && "Grid rank in multi-gpu setup should be unique");
}
}
}
ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
@@ -189,6 +212,7 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
ASSERT_EQUAL(hipSetDevice(i), hipSuccess);
ASSERT_EQUAL(hipFree(sizeTestD[i]), hipSuccess);
ASSERT_EQUAL(hipFree(gridRankTestD[i]), hipSuccess);
ASSERT_EQUAL(hipFree(thdRankTestD[i]), hipSuccess);
ASSERT_EQUAL(hipFree(isValidTestD[i]), hipSuccess);
ASSERT_EQUAL(hipFree(syncTestD[i]), hipSuccess);
@@ -197,6 +221,7 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
ASSERT_EQUAL(hipFree(syncResultD), hipSuccess);
ASSERT_EQUAL(hipHostFree(sizeTestH[i]), hipSuccess);
ASSERT_EQUAL(hipHostFree(gridRankTestH[i]), hipSuccess);
ASSERT_EQUAL(hipHostFree(thdRankTestH[i]), hipSuccess);
ASSERT_EQUAL(hipHostFree(isValidTestH[i]), hipSuccess);
+11 -1
Просмотреть файл
@@ -22,7 +22,7 @@ THE SOFTWARE.
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
@@ -166,6 +166,16 @@ int main()
ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
if (!deviceProperties.cooperativeLaunch) {
std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
if (hip_skip_tests_enabled()) {
return hip_skip_retcode();
} else {
passed();
}
return 0;
}
// Test block sizes which are powers of 2
int i = 0;
while (true) {
+11 -1
Просмотреть файл
@@ -22,7 +22,7 @@ THE SOFTWARE.
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
@@ -135,6 +135,16 @@ int main()
ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
if (!deviceProperties.cooperativeLaunch) {
std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
if (hip_skip_tests_enabled()) {
return hip_skip_retcode();
} else {
passed();
}
return 0;
}
// Test block sizes which are powers of 2
int i = 0;
while (true) {
+11 -1
Просмотреть файл
@@ -22,7 +22,7 @@ THE SOFTWARE.
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
@@ -135,6 +135,16 @@ int main()
ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
if (!deviceProperties.cooperativeLaunch) {
std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
if (hip_skip_tests_enabled()) {
return hip_skip_retcode();
} else {
passed();
}
return 0;
}
// Test block sizes which are powers of 2
int i = 0;
while (true) {
@@ -20,7 +20,7 @@ THE SOFTWARE.
// Simple test for hipLaunchCooperativeKernelMultiDevice API.
/* HIT_START
* BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60
* TEST: %t
* HIT_END
*/
@@ -22,15 +22,14 @@ THE SOFTWARE.
// Simple test for hipLaunchCooperativeKernel API.
/* HIT_START
* BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
* TEST: %t
* HIT_END
*/
#include "hip/hip_runtime.h"
#include "hip/hip_runtime_api.h"
#include "hip/hcc_detail/device_library_decls.h"
#include "hip/hcc_detail/hip_cooperative_groups.h"
#include "hip/hip_cooperative_groups.h"
#include <iostream>
#include <chrono>
#include "test_common.h"
@@ -129,7 +128,7 @@ int main() {
params[3] = (void*)&dC;
std::cout << "Testing with grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n";
HIPCHECK(hipLaunchCooperativeKernel(test_gws, dimGrid, dimBlock, params, dimBlock.x * sizeof(long), stream));
HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_gws), dimGrid, dimBlock, params, dimBlock.x * sizeof(long), stream));
HIPCHECK(hipMemcpy(init, dC, sizeof(long), hipMemcpyDeviceToHost));
+568
Просмотреть файл
@@ -0,0 +1,568 @@
/*
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Test Description:
/*The general idea of the application is to test how Cooperative Groups kernel
launches work when launching too many warps to multiple target devices. This
tests the following failure modes for hipLaunchCooperativeKernelMultiDevice:
1) Do not launch more warps to any device than can fit on that device
2) All device targets for the multi-device launch function must be different
3) All streams must be explicit (non-NULL)
4) The kernels sent in must be identical between devices
5) The grid and block sizes must be identical between devices
6) The block dimensions must be non-zero
7) The dynamic shared memory size must be identical between devices.
This test ensures that the proper error conditions are returned, even if the
target kernel does not actually use any of the cooperative groups features.
Note that tests 4, 5, and 7 only hold on Nvidia GPUs. AMD GPUs running ROCm
do not have these constraints. As such, the test checks to see whether they
should fail or succeed and compares this to what actually happens.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
#include <hip/hip_runtime.h>
#include <hip/hip_cooperative_groups.h>
#include "test_common.h"
// Validates the result of a HIP call made at `file`:`line`.
//
// Two conditions are treated as failures, each reported on stderr and then
// handed to failed(""):
//   1. `errval` is anything other than hipSuccess;
//   2. `errval` differs from the value hipGetLastError() reports.
// hipGetLastError() is queried before anything else so that the comparison in
// (2) uses the state as it was on entry to this function.
static inline void hipCheckAndFail(hipError_t errval,
                                   const char *file, int line) {
  const hipError_t runtime_err = hipGetLastError();

  const bool call_failed = (errval != hipSuccess);
  if (call_failed) {
    std::cerr << "hip error: " << hipGetErrorString(errval) << std::endl
              << " Location: " << file << ":" << line << std::endl;
    failed("");
  }

  const bool runtime_disagrees = (runtime_err != errval);
  if (runtime_disagrees) {
    std::cerr << "Error: the return value of a function was not the same "
              << "as the value returned by hipGetLastError()" << std::endl
              << " Location: " << file << ":" << line << std::endl
              << " Function returned: " << hipGetErrorString(errval)
              << " (" << errval << ")" << std::endl
              << "hipGetLastError() returned: "
              << hipGetErrorString(runtime_err)
              << " (" << runtime_err << ")" << std::endl;
    failed("");
  }
}
// Convenience wrapper around hipCheckAndFail() that supplies the call site's
// file name and line number. The do { ... } while (0) wrapper makes the macro
// expand to a single statement, so it is safe inside unbraced if/else bodies.
#define hipCheckErr(errval) \
do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)
// Reports whether device `device_id` can take part in this test.
//
// Cooperative launch and multi-device cooperative launch support are each
// checked twice: once through hipDeviceGetAttribute() and once through the
// matching fields of hipGetDeviceProperties(). All four must be non-zero.
//
// Returns 1 when every check passes. Otherwise prints the first missing
// capability to stderr and returns 0. A failing HIP query is handled by
// HIPCHECK.
static int cooperative_groups_support(int device_id) {
  int cooperative_attribute = 0;
  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
                                 hipDeviceAttributeCooperativeLaunch,
                                 device_id));
  if (!cooperative_attribute) {
    std::cerr << "Cooperative launch support not available in ";
    std::cerr << "the device attribute for device " << device_id;
    std::cerr << std::endl;
    return 0;
  }

  int multi_gpu_cooperative_attribute = 0;
  HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute,
                                 hipDeviceAttributeCooperativeMultiDeviceLaunch,
                                 device_id));
  if (!multi_gpu_cooperative_attribute) {
    std::cerr << "Multi-GPU cooperative launch support not available in ";
    std::cerr << "the device attribute for device " << device_id;
    std::cerr << std::endl;
    return 0;
  }

  // Cross-check the same capabilities as reported by the properties struct.
  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  if (device_properties.cooperativeLaunch == 0) {
    std::cerr << "Cooperative group support not available in ";
    std::cerr << "device properties." << std::endl;
    return 0;
  }
  if (device_properties.cooperativeMultiDeviceLaunch == 0) {
    std::cerr << "Multi-GPU cooperative group support not available in ";
    std::cerr << "device properties." << std::endl;
    return 0;
  }
  return 1;
}
// Returns 1 when device `device_id` reports that a multi-device cooperative
// launch may use a different kernel function per device, 0 otherwise.
//
// The capability must be reported both by the
// hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc attribute and by the
// cooperativeMultiDeviceUnmatchedFunc field of the device properties.
static int support_for_separate_kernels(int device_id) {
  int separate_kernel_supported = 0;
  HIPCHECK(hipDeviceGetAttribute(
      &separate_kernel_supported,
      hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc,
      device_id));
  if (!separate_kernel_supported) {
    return 0;
  }

  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  return (device_properties.cooperativeMultiDeviceUnmatchedFunc != 0) ? 1 : 0;
}
// Returns 1 when device `device_id` reports that a multi-device cooperative
// launch may use a different grid dimension per device, 0 otherwise.
//
// The capability must be reported both by the
// hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim attribute and by
// the cooperativeMultiDeviceUnmatchedGridDim field of the device properties.
static int support_for_separate_grid_sizes(int device_id) {
  int separate_sizes_supported = 0;
  HIPCHECK(hipDeviceGetAttribute(
      &separate_sizes_supported,
      hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim,
      device_id));
  if (!separate_sizes_supported) {
    return 0;
  }

  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  return (device_properties.cooperativeMultiDeviceUnmatchedGridDim != 0) ? 1
                                                                         : 0;
}
// Returns 1 when device `device_id` reports that a multi-device cooperative
// launch may use a different block dimension per device, 0 otherwise.
//
// The capability must be reported both by the
// hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim attribute and by
// the cooperativeMultiDeviceUnmatchedBlockDim field of the device properties.
static int support_for_separate_block_dims(int device_id) {
  int separate_sizes_supported = 0;
  HIPCHECK(hipDeviceGetAttribute(
      &separate_sizes_supported,
      hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim,
      device_id));
  if (!separate_sizes_supported) {
    return 0;
  }

  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  return (device_properties.cooperativeMultiDeviceUnmatchedBlockDim != 0) ? 1
                                                                          : 0;
}
// Returns 1 when device `device_id` reports that a multi-device cooperative
// launch may use a different dynamic shared memory size per device, 0
// otherwise.
//
// The capability must be reported both by the
// hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem attribute and by
// the cooperativeMultiDeviceUnmatchedSharedMem field of the device properties.
static int support_for_separate_shared_sizes(int device_id) {
  int separate_sizes_supported = 0;
  HIPCHECK(hipDeviceGetAttribute(
      &separate_sizes_supported,
      hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem,
      device_id));
  if (!separate_sizes_supported) {
    return 0;
  }

  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  return (device_properties.cooperativeMultiDeviceUnmatchedSharedMem != 0) ? 1
                                                                           : 0;
}
// Trivial kernel used as the launch target: every thread adds the current
// clock64() reading to the element of `array` selected by its flat index
// (blockIdx.x * blockDim.x + threadIdx.x). It uses no cooperative groups
// features.
__global__ void test_kernel(long long *array) {
  const unsigned int slot = threadIdx.x + blockDim.x * blockIdx.x;
  array[slot] = array[slot] + clock64();
}
// Same body as test_kernel, but a distinct function: main() assigns it to one
// device only, to exercise a multi-device launch with mismatched kernels.
// Every thread adds the current clock64() reading to the element of `array`
// selected by its flat index (blockIdx.x * blockDim.x + threadIdx.x).
__global__ void second_test_kernel(long long *array) {
  const unsigned int slot = threadIdx.x + blockDim.x * blockIdx.x;
  array[slot] = array[slot] + clock64();
}
int main(int argc, char** argv) {
hipError_t err;
/*************************************************************************/
/* Parse the command line parameters *************************************/
// Arguments to pull out of the command line.
int device_num, FailFlag = 0;
HIPCHECK(hipGetDeviceCount(&device_num));
if (device_num < 2) {
std::cout << "This test requires atleast two gpus but the system has ";
std::cout << " only "<< device_num <<std::endl;
std::cout << "The test is skipping with Pass result" << std::endl;
passed();
}
for (int dev = 0; dev < (device_num-1); ++dev) {
std::cout << "First device number: " << dev << std::endl;
std::cout << "Second device number: " << (dev + 1) << std::endl;
/*************************************************************************/
/* Test whether target devices support cooperative groups ****************/
for (int i = 0; i < 2; i++) {
if (!cooperative_groups_support((dev + i))) {
std::cout << "Skipping the test with Pass result.\n";
passed();
}
}
/*************************************************************************/
/* We will try to launch more waves than the GPUs can fit. ***************/
int warp_sizes[2];
int num_sms[2];
hipDeviceProp_t device_properties[2];
int warp_size = INT_MAX;
int num_sm = INT_MAX;
for (int i = 0; i < 2; i++) {
HIPCHECK(hipGetDeviceProperties(&device_properties[i], (dev + i)));
warp_sizes[i] = device_properties[i].warpSize;
if (warp_sizes[i] < warp_size) {
warp_size = warp_sizes[i];
}
num_sms[i] = device_properties[i].multiProcessorCount;
if (num_sms[i] < num_sm) {
num_sm = num_sms[i];
}
std::cout << "Device " << (dev + i);
std::cout << " name: " << device_properties[i].name << std::endl;
}
std::cout << std::endl;
// Calculate the device occupancy to know how many blocks can be run.
int max_blocks_per_sm_arr[2];
int max_blocks_per_sm = INT_MAX;
for (int i = 0; i < 2; i++) {
HIPCHECK(hipSetDevice((dev + i)));
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm_arr[i], test_kernel, warp_size, 0));
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
max_blocks_per_sm = max_blocks_per_sm_arr[i];
}
}
int desired_blocks = max_blocks_per_sm * num_sm;
/*************************************************************************/
/* Create the streams we will use in this test. **************************/
hipStream_t streams[2];
for (int i = 0; i < 2; i++) {
HIPCHECK(hipSetDevice((dev + i)));
HIPCHECK(hipStreamCreate(&streams[i]));
}
/*************************************************************************/
/* Set up data to pass into the kernel ***********************************/
// Alocate the host input buffer, and two device-focused buffers per GPU
// that we will use for our test.
unsigned int *good_dev_array[2];
unsigned int *bad_dev_array[2];
for (int i = 0; i < 2; i++) {
int good_size = desired_blocks * warp_size * sizeof(long long);
int bad_size = 2 * desired_blocks * warp_size * sizeof(long long);
HIPCHECK(hipSetDevice((dev + i)));
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&good_dev_array[i]),
good_size));
HIPCHECK(hipMemsetAsync(good_dev_array[i], 0, good_size, streams[i]));
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&bad_dev_array[i]),
bad_size));
HIPCHECK(hipMemsetAsync(bad_dev_array[i], 0, bad_size, streams[i]));
}
HIPCHECK(hipDeviceSynchronize());
/*************************************************************************/
/* Launch the kernels ****************************************************/
std::cout << "Launching a multi-GPU cooperative kernel with too many ";
std::cout << "warps..." << std::endl;
void *dev_params[2][1];
hipLaunchParams md_params[2];
for (int i = 0; i < 2; i++) {
dev_params[i][0] = reinterpret_cast<void*>(&bad_dev_array[i]);
md_params[i].func = reinterpret_cast<void*>(test_kernel);
md_params[i].gridDim = 2 * desired_blocks;
md_params[i].blockDim = warp_size;
md_params[i].sharedMem = 0;
md_params[i].stream = streams[i];
md_params[i].args = dev_params[i];
}
err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
if (err != hipErrorCooperativeLaunchTooLarge) {
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
std::cerr << "with too many warps." << std::endl;
std::cerr << "This SHOULD have failed with the error ";
std::cerr << "hipErrorCooperativeLaunchTooLarge (";
std::cerr << hipErrorCooperativeLaunchTooLarge << ")." << std::endl;
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
std::cerr << " (" << err << ")" << std::endl;
FailFlag = 1;
} else {
std::cout << "\tProperly saw this return ";
std::cout << "hipErrorCooperativeLaunchTooLarge" << std::endl;
}
HIPCHECK(hipDeviceSynchronize());
std::cout << "Launching a multi-GPU cooperative kernel to the same ";
std::cout << "device twice..." << std::endl;
for (int i = 0; i < 2; i++) {
dev_params[i][0] = reinterpret_cast<void*>(&good_dev_array[i]);
md_params[i].gridDim = desired_blocks;
md_params[i].stream = streams[0];
}
err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
if (err != hipErrorInvalidDevice) {
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
std::cerr << "to the same device twice." << std::endl;
std::cerr << "This SHOULD have failed with the error ";
std::cerr << "hipErrorInvalidDevice (";
std::cerr << hipErrorInvalidDevice << ")." << std::endl;
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
std::cerr << " (" << err << ")" << std::endl;
FailFlag = 1;
} else {
std::cout << "\tProperly saw this return ";
std::cout << "hipErrorInvalidDevice" << std::endl;
}
HIPCHECK(hipDeviceSynchronize());
std::cout << "Launching a multi-GPU cooperative kernel to the NULL ";
std::cout << "stream" << std::endl;
for (int i = 0; i < 2; i++) {
md_params[i].stream = NULL;
}
err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
if (err != hipErrorInvalidResourceHandle) {
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
std::cerr << "to the NULL stream." << std::endl;
std::cerr << "This SHOULD have failed with the error ";
std::cerr << "hipErrorInvalidResourceHandle (";
std::cerr << hipErrorInvalidResourceHandle << ")." << std::endl;
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
std::cerr << " (" << err << ")" << std::endl;
FailFlag = 1;
} else {
std::cout << "\tProperly saw this return ";
std::cout << "hipErrorInvalidResourceHandle" << std::endl;
}
HIPCHECK(hipDeviceSynchronize());
std::cout << "Launching a multi-GPU cooperative kernel with two ";
std::cout << "different kernels." << std::endl;
bool supports_sep_kernels = true;
for (int i = 0; i < 2; i++) {
md_params[i].stream = streams[i];
if (!support_for_separate_kernels((dev + i))) {
supports_sep_kernels = false;
}
}
md_params[1].func = reinterpret_cast<void*>(second_test_kernel);
err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
if ((supports_sep_kernels && err != hipSuccess) ||
(!supports_sep_kernels && err != hipErrorInvalidValue)) {
if (supports_sep_kernels) {
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
std::cerr << "with two different kernels." << std::endl;
std::cerr << "This SHOULD have succeeded with hipSuccess (";
std::cerr << hipSuccess << ")." << std::endl;
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
std::cerr << " (" << err << ")" << std::endl;
} else {
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
std::cerr << "with two different kernels." << std::endl;
std::cerr << "This SHOULD have failed with the error ";
std::cerr << "hipErrorInvalidValue (";
std::cerr << hipErrorInvalidValue << ")." << std::endl;
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
std::cerr << " (" << err << ")" << std::endl;
}
FailFlag = 1;
} else {
std::cout << "\tProperly saw this return ";
if (supports_sep_kernels) {
std::cout << "hipSuccess" << std::endl;
} else {
std::cout << "hipErrorInvalidValue" << std::endl;
}
}
HIPCHECK(hipDeviceSynchronize());
std::cout << "Launching a multi-GPU cooperative kernel with two ";
std::cout << "different grid sizes." << std::endl;
bool supports_sep_sizes = true;
for (int i = 0; i < 2; i++) {
md_params[i].func = reinterpret_cast<void*>(test_kernel);
md_params[i].gridDim = i+1;
if (!support_for_separate_grid_sizes((dev + i))) {
supports_sep_sizes = false;
}
}
err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
if ((supports_sep_sizes && err != hipSuccess) ||
(!supports_sep_sizes && err == hipErrorInvalidValue)) {
if (supports_sep_sizes) {
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
std::cerr << "with two different grid sizes." << std::endl;
std::cerr << "This SHOULD have succeeded with hipSuccess (";
std::cerr << hipSuccess << ")." << std::endl;
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
std::cerr << " (" << err << ")" << std::endl;
} else {
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
std::cerr << "with two different grid sizes." << std::endl;
std::cerr << "This SHOULD have failed with the error ";
std::cerr << "hipErrorInvalidValue (";
std::cerr << hipErrorInvalidValue << ")." << std::endl;
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
std::cerr << " (" << err << ")" << std::endl;
FailFlag = 1;
}
} else {
std::cout << "\tProperly saw this return ";
if (supports_sep_kernels) {
std::cout << "hipSuccess" << std::endl;
} else {
std::cout << "hipErrorInvalidValue" << std::endl;
}
}
HIPCHECK(hipDeviceSynchronize());
std::cout << "Launching a multi-GPU cooperative kernel with two ";
std::cout << "different block dimensions." << std::endl;
supports_sep_sizes = true;
for (int i = 0; i < 2; i++) {
md_params[i].gridDim = desired_blocks;
md_params[i].blockDim = i+1;
if (!support_for_separate_block_dims((dev + i))) {
supports_sep_sizes = false;
}
}
err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
if ((supports_sep_sizes && err != hipSuccess) ||
(!supports_sep_sizes && err == hipErrorInvalidValue)) {
if (supports_sep_sizes) {
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
std::cerr << "with two different block dimensions." << std::endl;
std::cerr << "This SHOULD have succeeded with hipSuccess (";
std::cerr << hipSuccess << ")." << std::endl;
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
std::cerr << " (" << err << ")" << std::endl;
} else {
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
std::cerr << "with two different block dimensions." << std::endl;
std::cerr << "This SHOULD have failed with the error ";
std::cerr << "hipErrorInvalidValue (";
std::cerr << hipErrorInvalidValue << ")." << std::endl;
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
std::cerr << " (" << err << ")" << std::endl;
FailFlag = 1;
}
} else {
std::cout << "\tProperly saw this return ";
if (supports_sep_kernels) {
std::cout << "hipSuccess" << std::endl;
} else {
std::cout << "hipErrorInvalidValue" << std::endl;
}
}
HIPCHECK(hipDeviceSynchronize());
std::cout << "Launching a multi-GPU cooperative kernel with block ";
std::cout << "dimensions of zero." << std::endl;
for (int i = 0; i < 2; i++) {
md_params[i].blockDim = 0;
}
err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
if (err != hipErrorInvalidConfiguration) {
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
std::cerr << "with block dimensions of zero." << std::endl;
std::cerr << "This SHOULD have failed with the error ";
std::cerr << "hipErrorInvalidConfiguration (";
std::cerr << hipErrorInvalidConfiguration << ")." << std::endl;
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
std::cerr << " (" << err << ")" << std::endl;
FailFlag = 1;
} else {
std::cout << "\tProperly saw this return ";
std::cout << "hipErrorInvalidConfiguration" << std::endl;
}
HIPCHECK(hipDeviceSynchronize());
std::cout << "Launching a multi-GPU cooperative kernel with two ";
std::cout << "different shared memory sizes." << std::endl;
supports_sep_sizes = true;
for (int i = 0; i < 2; i++) {
md_params[i].blockDim = warp_size;
md_params[i].sharedMem = i;
if (!support_for_separate_shared_sizes((dev + i))) {
supports_sep_sizes = false;
}
}
err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
if ((supports_sep_sizes && err != hipSuccess) ||
(!supports_sep_sizes && err == hipErrorInvalidValue)) {
if (supports_sep_sizes) {
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
std::cerr << "with two different shared memory sizes." << std::endl;
std::cerr << "This SHOULD have succeeded with hipSuccess (";
std::cerr << hipSuccess << ")." << std::endl;
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
std::cerr << " (" << err << ")" << std::endl;
} else {
std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
std::cerr << "with two different shared memory sizes." << std::endl;
std::cerr << "This SHOULD have failed with the error ";
std::cerr << "hipErrorInvalidValue (";
std::cerr << hipErrorInvalidValue << ")." << std::endl;
std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
std::cerr << " (" << err << ")" << std::endl;
FailFlag = 1;
}
} else {
std::cout << "\tProperly saw this return ";
if (supports_sep_kernels) {
std::cout << "hipSuccess" << std::endl;
} else {
std::cout << "hipErrorInvalidValue" << std::endl;
}
}
HIPCHECK(hipDeviceSynchronize());
std::cout << "Launching a multi-GPU cooperative kernel with maximum ";
std::cout << "number of warps..." << std::endl;
for (int i = 0; i < 2; i++) {
md_params[i].sharedMem = 0;
}
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
std::cout << "\tProperly launched." << std::endl;
HIPCHECK(hipDeviceSynchronize());
for (int m = 0; m < 2; ++m) {
HIPCHECK(hipFree(good_dev_array[m]));
HIPCHECK(hipFree(bad_dev_array[m]));
HIPCHECK(hipStreamDestroy(streams[m]));
}
if (FailFlag == 1) {
break;
}
}
if (FailFlag == 1) {
failed("");
} else {
passed();
}
}
+581
Просмотреть файл
@@ -0,0 +1,581 @@
/*
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Test Description:
/*The general idea of the application is to test how multi-GPU Cooperative
Groups kernel launches to a stream interact with other things that may be
simultaneously running in the same streams.
The HIP specification says that a multi-GPU cooperative launch will wait
until all of the streams it's using finish their work. Only then will the
cooperative kernel be launched to all of the devices. Then no other work
can take place in any of the streams until all of the multi-GPU
cooperative work is done.
However, there are flags that allow you to disable each of these
serialization points: hipCooperativeLaunchMultiDeviceNoPreSync and
hipCooperativeLaunchMultiDeviceNoPostSync.
As such, this benchmark tests the following five situations launching
to two GPUs (and thus two streams):
1. Normal multi-GPU cooperative kernel:
This should result in the following pattern:
Stream 0: Cooperative
Stream 1: Cooperative
2. Regular kernel launches and multi-GPU cooperative kernel launches
with the default flags, resulting in the following pattern:
Stream 0: Regular --> Cooperative
Stream 1: --> Cooperative --> Regular
3. Regular kernel launches and multi-GPU cooperative kernel launches
that turn off "pre-sync". This should allow a cooperative kernel
to launch even if work is already in a stream pointing to
another GPU.
This should result in the following pattern:
Stream 0: Regular --> Cooperative
Stream 1: Cooperative --> Regular
4. Regular kernel launches and multi-GPU cooperative kernel launches
that turn off "post-sync". This should allow a new kernel to enter
a GPU even if another GPU still has a cooperative kernel on it.
This should result in the following pattern:
Stream 0: Regular --> Cooperative
Stream 1: --> Cooperative--> Regular
5. Regular kernel launches and multi-GPU cooperative kernel launches
that turn off both pre- and post-sync. This should allow any of
the kernels to launch to their GPU regardless of the status of
other kernels in other multi-GPU stream groups.
This should result in the following pattern:
Stream 0: Regular --> Cooperative
Stream 1: Cooperative --> Regular
We time how long it takes to run each of these benchmarks and print it as
the output of the benchmark. The kernels themselves are just useless time-
wasting code so that the kernel takes a meaningful amount of time on the
GPU before it exits. We only launch a single wavefront for each kernel, so
any serialization should not be because of GPU occupancy concerns.
If tests 2, 3, and 4 take roughly 3x as long as #1, that implies that
cooperative kernels are serialized as expected.
If test #5 takes roughly twice as long as #1, that implies that the
overlap-allowing flags work as expected.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60
* TEST: %t
* HIT_END
*/
#include <chrono>
#include <hip/hip_runtime.h>
#include <hip/hip_cooperative_groups.h>
#include "test_common.h"
// Validates the result of a HIP call made at `file`:`line`.
//
// Two conditions are reported through failed(""):
//   1. `errval` is anything other than hipSuccess;
//   2. `errval` differs from what hipGetLastError() reported when this
//      function was entered.
// Diagnostics are written to std::cerr before failed("") is invoked.
static inline void hipCheckAndFail(hipError_t errval,
                                   const char *file, int line) {
  // Query the runtime's last error before anything else, exactly once.
  const hipError_t runtime_err = hipGetLastError();

  if (errval != hipSuccess) {
    std::cerr << "hip error: " << hipGetErrorString(errval) << std::endl
              << " Location: " << file << ":" << line << std::endl;
    failed("");
  }

  if (runtime_err != errval) {
    std::cerr << "Error: the return value of a function was not the same "
              << "as the value returned by hipGetLastError()" << std::endl
              << " Location: " << file << ":" << line << std::endl
              << " Function returned: " << hipGetErrorString(errval)
              << " (" << errval << ")" << std::endl
              << "hipGetLastError() returned: "
              << hipGetErrorString(runtime_err)
              << " (" << runtime_err << ")" << std::endl;
    failed("");
  }
}

// Convenience wrapper that forwards the call site's file and line number.
#define hipCheckErr(errval) \
do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)
// Reports whether device `device_id` advertises both single-device and
// multi-device cooperative launch support.
//
// Support is checked twice: once through hipDeviceGetAttribute() and once
// through the fields of hipDeviceProp_t. Any missing capability prints an
// explanation to std::cerr.
//
// Returns 1 when every check passes, 0 otherwise. HIP API errors are handled
// by HIPCHECK.
static int cooperative_groups_support(int device_id) {
  int cooperative_attribute = 0;
  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
      hipDeviceAttributeCooperativeLaunch, device_id));
  if (!cooperative_attribute) {
    std::cerr << "Cooperative launch support not available in ";
    std::cerr << "the device attribute for device " << device_id;
    std::cerr << std::endl;
    return 0;
  }

  int multi_gpu_cooperative_attribute = 0;
  HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute,
      hipDeviceAttributeCooperativeMultiDeviceLaunch, device_id));
  if (!multi_gpu_cooperative_attribute) {
    std::cerr << "Multi-GPU cooperative launch support not available in ";
    std::cerr << "the device attribute for device " << device_id;
    std::cerr << std::endl;
    return 0;
  }

  // Cross-check the same capabilities as reported by the device properties.
  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  if (device_properties.cooperativeLaunch == 0) {
    std::cerr << "Cooperative group support not available in ";
    std::cerr << "device properties." << std::endl;
    return 0;
  }
  if (device_properties.cooperativeMultiDeviceLaunch == 0) {
    std::cerr << "Multi-GPU cooperative group support not available in ";
    std::cerr << "device properties." << std::endl;
    return 0;
  }
  return 1;
}
// Time-wasting kernel used for the multi-GPU cooperative launches.
//
// The grid whose multi-grid rank equals `fast_gpu` returns immediately. Every
// other grid runs `loops` iterations, each of which spins until clock64() has
// advanced by 1000000 and then adds the current clock64() value into this
// thread's slot of `array`.
__global__ void test_coop_kernel(unsigned int loops, long long *array,
                                 int fast_gpu) {
  cooperative_groups::multi_grid_group multi_grid =
      cooperative_groups::this_multi_grid();
  unsigned int slot = blockIdx.x * blockDim.x + threadIdx.x;

  // The designated "fast" grid does no work at all.
  if (multi_grid.grid_rank() == fast_gpu) {
    return;
  }

  for (int pass = 0; pass < loops; ++pass) {
    long long spin_until = clock64() + 1000000;
    while (clock64() < spin_until) {
      // busy-wait
    }
    array[slot] += clock64();
  }
}
// Time-wasting kernel used for the regular (non-cooperative) launches.
//
// Runs `loops` iterations; each one spins until clock64() has advanced by
// 1000000 and then adds the current clock64() value into this thread's slot
// of `array`.
__global__ void test_kernel(uint32_t loops, unsigned long long *array) {
  unsigned int slot = blockIdx.x * blockDim.x + threadIdx.x;

  for (int pass = 0; pass < loops; ++pass) {
    long long spin_until = clock64() + 1000000;
    while (clock64() < spin_until) {
      // busy-wait
    }
    array[slot] += clock64();
  }
}
int main(int argc, char** argv) {
hipError_t err;
int device_num, FailFlag = 0;
uint32_t loops = 2000;
uint32_t fast_loops = 1;
int32_t fast_gpu = -1;
HIPCHECK(hipGetDeviceCount(&device_num));
if (device_num < 2) {
std::cout << "This test requires atleast two gpus but the system has ";
std::cout << " only "<< device_num <<std::endl;
std::cout << "The test is skipping with Pass result" << std::endl;
passed();
}
for (int dev = 0; dev < (device_num-1); ++dev) {
std::cout << "First device number: " << dev << std::endl;
std::cout << "Second device number: " << (dev + 1) << std::endl;
std::cout << "Loops: " << loops << std::endl;
/*************************************************************************/
/* Test whether target devices support cooperative groups ****************/
for (int i = 0; i < 2; i++) {
if (!cooperative_groups_support(dev + i)) {
std::cout << "Skipping the test with Pass result.\n";
passed();
}
}
/*************************************************************************/
/* We will launch enough waves to fill up all of the GPU *****************/
int warp_sizes[2];
int num_sms[2];
hipDeviceProp_t device_properties[2];
int warp_size = INT_MAX;
int num_sm = INT_MAX;
for (int i = 0; i < 2; i++) {
HIPCHECK(hipGetDeviceProperties(&device_properties[i], (dev + i)));
warp_sizes[i] = device_properties[i].warpSize;
if (warp_sizes[i] < warp_size) {
warp_size = warp_sizes[i];
}
num_sms[i] = device_properties[i].multiProcessorCount;
if (num_sms[i] < num_sm) {
num_sm = num_sms[i];
}
std::cout << "Device " << (i + 1);
std::cout << " name: " << device_properties[i].name << std::endl;
}
std::cout << std::endl;
// Calculate the device occupancy to know how many blocks can be run.
int max_blocks_per_sm_arr[2];
int max_blocks_per_sm = INT_MAX;
for (int i = 0; i < 2; i++) {
HIPCHECK(hipSetDevice(dev + i));
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm_arr[i], test_kernel, warp_size, 0));
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
max_blocks_per_sm = max_blocks_per_sm_arr[i];
}
}
int desired_blocks = 1;
if (desired_blocks > max_blocks_per_sm * num_sm) {
std::cerr << "The requested number of blocks will not fit on the GPU";
std::cerr << std::endl;
std::cerr << "You requested " << desired_blocks << " but we can only ";
std::cerr << "fit " << (max_blocks_per_sm * num_sm) << std::endl;
failed("");
}
/*************************************************************************/
/* Create the streams we will use in this test. **************************/
hipStream_t streams[2];
for (int i = 0; i < 2; i++) {
HIPCHECK(hipSetDevice(dev + i));
HIPCHECK(hipStreamCreate(&streams[i]));
}
/*************************************************************************/
/* Set up data to pass into the kernelx **********************************/
// Alocate the host input buffer, and two device-focused buffers that we
// will use for our test.
unsigned long long *dev_array[2];
for (int i = 0; i < 2; i++) {
int good_size = desired_blocks * warp_size * sizeof(long long);
HIPCHECK(hipSetDevice(dev + i));
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]), good_size));
HIPCHECK(hipMemsetAsync(dev_array[i], 0, good_size, streams[i]));
}
for (int i = 0; i < 2; i++) {
HIPCHECK(hipSetDevice(dev + i));
HIPCHECK(hipDeviceSynchronize());
}
/*************************************************************************/
/* Launch the kernels ****************************************************/
void *dev_params[2][3];
hipLaunchParams md_params[2];
std::chrono::time_point<std::chrono::system_clock> start_time[6];
std::chrono::time_point<std::chrono::system_clock> end_time[6];
std::cout << "Test 0: Launching a multi-GPU cooperative kernel...\n";
std::cout << "This should result in the following pattern:" << std::endl;
std::cout << "GPU " << dev << ": Long Coop Kernel" << std::endl;
std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel" << std::endl;
for (int i = 0; i < 2; i++) {
dev_params[i][0] = reinterpret_cast<void*>(&loops);
dev_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
dev_params[i][2] = reinterpret_cast<void*>(&fast_gpu);
md_params[i].func = reinterpret_cast<void*>(test_coop_kernel);
md_params[i].gridDim = desired_blocks;
md_params[i].blockDim = warp_size;
md_params[i].sharedMem = 0;
md_params[i].stream = streams[i];
md_params[i].args = dev_params[i];
}
start_time[0] = std::chrono::system_clock::now();
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
for (int i = 0; i < 2; i++) {
HIPCHECK(hipSetDevice(dev + i));
HIPCHECK(hipDeviceSynchronize());
}
end_time[0] = std::chrono::system_clock::now();
std::cout << std::endl;
std::cout << "Test 1: Launching a multi-GPU cooperative kernel with the ";
std::cout << "following pattern:" << std::endl;
std::cout << "GPU " << dev << ": Standard Kernel --> Long Coop Kernel\n";
std::cout << "GPU " << (dev + 1) << ": --> Coop ";
std::cout << "--> Standard Kernel\n";
fast_gpu = 1;
start_time[1] = std::chrono::system_clock::now();
HIPCHECK(hipSetDevice(dev));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
streams[0], loops, dev_array[0]);
HIPCHECK(hipGetLastError());
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
HIPCHECK(hipSetDevice(dev + 1));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
streams[1], loops, dev_array[1]);
HIPCHECK(hipGetLastError());
for (int i = 0; i < 2; i++) {
HIPCHECK(hipSetDevice(dev + i));
HIPCHECK(hipDeviceSynchronize());
}
end_time[1] = std::chrono::system_clock::now();
fast_gpu = -1;
std::cout << std::endl;
std::cout << "Test 2: Launching a multi-GPU cooperative kernel with the ";
std::cout << "following pattern:" << std::endl;
std::cout << "GPU " << dev << ": Standard Kernel --> Coop" << std::endl;
std::cout << "GPU " << (dev + 1) << ": --> Long Coop";
std::cout << " Kernel --> ";
std::cout << "Standard Kernel\n";
fast_gpu = 0;
start_time[2] = std::chrono::system_clock::now();
HIPCHECK(hipSetDevice(dev));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
streams[0], loops, dev_array[0]);
HIPCHECK(hipGetLastError());
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
HIPCHECK(hipSetDevice(dev + 1));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
streams[1], loops, dev_array[1]);
HIPCHECK(hipGetLastError());
for (int i = 0; i < 2; i++) {
HIPCHECK(hipSetDevice(dev + i));
HIPCHECK(hipDeviceSynchronize());
}
end_time[2] = std::chrono::system_clock::now();
fast_gpu = -1;
std::cout << std::endl;
std::cout << "Test 3: Launching a multi-GPU cooperative kernel with the ";
std::cout << "ability to overlap regular and cooperative kernels ";
std::cout << "only at the beginning." << std::endl;
std::cout << "This should result in the following pattern:" << std::endl;
std::cout << "GPU " << dev << ": Standard Kernel --> Coop" << std::endl;
std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel --> Standard";
std::cout<< " Kernel\n";
fast_gpu = 0;
start_time[3] = std::chrono::system_clock::now();
HIPCHECK(hipSetDevice(dev));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
streams[0], loops, dev_array[0]);
HIPCHECK(hipGetLastError());
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
hipCooperativeLaunchMultiDeviceNoPreSync));
HIPCHECK(hipSetDevice(dev + 1));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
streams[1], loops, dev_array[1]);
HIPCHECK(hipGetLastError());
for (int i = 0; i < 2; i++) {
HIPCHECK(hipSetDevice(dev + i));
HIPCHECK(hipDeviceSynchronize());
}
end_time[3] = std::chrono::system_clock::now();
fast_gpu = -1;
std::cout << std::endl;
std::cout << "Test 4: Launching a multi-GPU cooperative kernel with the ";
std::cout << "ability to overlap regular and cooperative kernels ";
std::cout << "only at the end." << std::endl;
std::cout << "This should result in the following pattern:" << std::endl;
std::cout << "GPU " << dev << ": Standard Kernel --> Long Coop Kernel\n";
std::cout << "GPU " << (dev + 1) << ": --> Coop --> ";
std::cout << "Standard Kernel\n";
fast_gpu = 1;
start_time[4] = std::chrono::system_clock::now();
HIPCHECK(hipSetDevice(dev));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
streams[0], loops, dev_array[0]);
HIPCHECK(hipGetLastError());
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
hipCooperativeLaunchMultiDeviceNoPostSync));
HIPCHECK(hipSetDevice(dev + 1));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
streams[1], loops, dev_array[1]);
for (int i = 0; i < 2; i++) {
HIPCHECK(hipSetDevice(dev + i));
HIPCHECK(hipDeviceSynchronize());
}
end_time[4] = std::chrono::system_clock::now();
fast_gpu = -1;
std::cout << std::endl;
std::cout << "Test 5: Launching a multi-GPU cooperative kernel with the ";
std::cout << "ability to overlap regular and cooperative kernels";
std::cout << std::endl;
std::cout << "This should result in the following pattern:" << std::endl;
std::cout << "GPU " << dev << ": Standard Kernel --> Long Coop Kernel\n";
std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel --> Standard";
std::cout << " Kernel\n";
start_time[5] = std::chrono::system_clock::now();
HIPCHECK(hipSetDevice(dev));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
streams[0], loops, dev_array[0]);
HIPCHECK(hipGetLastError());
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
hipCooperativeLaunchMultiDeviceNoPreSync |
hipCooperativeLaunchMultiDeviceNoPostSync));
HIPCHECK(hipSetDevice(dev + 1));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
streams[1], loops, dev_array[1]);
HIPCHECK(hipGetLastError());
for (int i = 0; i < 2; i++) {
HIPCHECK(hipSetDevice(dev + i));
HIPCHECK(hipDeviceSynchronize());
}
end_time[5] = std::chrono::system_clock::now();
std::chrono::duration<double> single_kernel_time =
(end_time[0] - start_time[0]);
std::chrono::duration<double> serialized_gpu0_time =
(end_time[1] - start_time[1]);
std::chrono::duration<double> serialized_gpu1_time =
(end_time[2] - start_time[2]);
std::chrono::duration<double> pre_overlapped_time =
(end_time[3] - start_time[3]);
std::chrono::duration<double> post_overlapped_time =
(end_time[4] - start_time[4]);
std::chrono::duration<double> overlapped_time =
(end_time[5] - start_time[5]);
std::cout << "Test 0: A single kernel on both GPUs took:" << std::endl;
std::cout << " " << single_kernel_time.count();
std::cout << " seconds" << std::endl;
std::cout << std::endl;
std::cout << "Test 1: Serialized set of three kernels with GPU0";
std::cout << " being long took:";
std::cout << " " << serialized_gpu0_time.count();
std::cout << " seconds" << std::endl;
std::cerr << "Expect between " << (2.7 * single_kernel_time.count());
std::cerr << " and ";
std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n";
std::cout << std::endl;
std::cout << "Test 2: Serialized set of three kernels with GPU1";
std::cout << " being long took:" << std::endl;
std::cout << " " << serialized_gpu1_time.count();
std::cout << " seconds" << std::endl;
std::cerr << "Expect between " << (2.7 * single_kernel_time.count());
std::cerr << " and ";
std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n";
std::cout << std::endl;
std::cout << "Test 3: Multiple kernels with pre-overlap allowed took:\n";
std::cout << " " << pre_overlapped_time.count();
std::cout << " seconds" << std::endl;
std::cerr << "Expect between " << (1.7 * single_kernel_time.count());
std::cerr << " and ";
std::cerr << (2.3 * single_kernel_time.count()) << " seconds.\n";
std::cout << std::endl;
std::cout << "Test 4: Multiple kernels with post-overlap allowed took:\n";
std::cout << " " << post_overlapped_time.count();
std::cout << " seconds" << std::endl;
std::cerr << "Expect between " << (1.7 * single_kernel_time.count());
std::cerr << " and ";
std::cerr << (2.3 * single_kernel_time.count()) << " seconds.";
std::cout << std::endl;
std::cout << "Test 5: Multiple kernels with overlap allowed took:\n";
std::cout << " " << overlapped_time.count();
std::cout << " seconds" << std::endl;
std::cerr << "Expect between " << (1.8 * single_kernel_time.count());
std::cerr << " and ";
std::cerr << (2.2 * single_kernel_time.count()) << " seconds.\n";
// Test that fully not-overlapped kernels take roughly 3x as long as one
// cooperative kernel.
if (serialized_gpu0_time > 3.3 * single_kernel_time ||
serialized_gpu0_time < 2.7 * single_kernel_time) {
std::cerr << "ERROR!" << std::endl;
std::cerr << "Test 1, the first case where all kernels should be ";
std::cerr << "serialized, had a runtime that was very different ";
std::cerr << "than what was expected." << std::endl;
std::cerr << "Was " << serialized_gpu0_time.count() << " seconds.\n";
std::cerr << "Expected between ";
std::cerr << (2.7 * single_kernel_time.count()) << " and ";
std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n";
std::cerr << "Were they truly serialized?" << std::endl;
FailFlag = 1;
}
// Test that fully not-overlapped kernels take roughly 3x as long as one
// cooperative kernel.
if (serialized_gpu1_time > 3.3 * single_kernel_time ||
serialized_gpu1_time < 2.7 * single_kernel_time) {
std::cerr << "ERROR!" << std::endl;
std::cerr << "Test 2, the second case where all kernels should be ";
std::cerr << "serialized, had a runtime that was very different ";
std::cerr << "than what was expected." << std::endl;
std::cerr << "Was " << serialized_gpu1_time.count();
std::cerr << " seconds." << std::endl;
std::cerr << "Expected between ";
std::cerr << (2.7 * single_kernel_time.count()) << " and ";
std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n";
std::cerr << "Were they truly serialized?" << std::endl;
FailFlag = 1;
}
// Test that kernels that can overlap only before the cooperative kernel
// launches kernels take roughly the same time (in this case)
if (pre_overlapped_time > 2.3 * single_kernel_time ||
pre_overlapped_time < 1.7 * single_kernel_time) {
std::cerr << "ERROR!" << std::endl;
std::cerr << "Test 3, the case where the last kernel is serialized, had ";
std::cerr << "a runtime that was very different than what was ";
std::cerr << "expected." << std::endl;
std::cerr << "Was " << pre_overlapped_time.count() << " seconds.\n";
std::cerr << "Expected between ";
std::cerr << (1.7 * single_kernel_time.count()) << " and ";
std::cerr << (2.3 * single_kernel_time.count()) << " seconds.\n";
FailFlag = 1;
}
// Test that kernels that can overlap only after the cooperative kernel
// launches kernels take roughly the same time (in this case)
if (post_overlapped_time > 2.3 * single_kernel_time ||
post_overlapped_time < 1.7 * single_kernel_time) {
std::cerr << "ERROR!" << std::endl;
std::cerr << "Teste 4, the case where the first kernel is ";
std::cerr << "serialized, had a runtime that was very different ";
std::cerr << "than what was expected." << std::endl;
std::cerr << "Was " << post_overlapped_time.count() << " seconds.\n";
std::cerr << "Expected between ";
std::cerr << (1.7 * single_kernel_time.count()) << " and ";
std::cerr << (2.3 * single_kernel_time.count()) << " seconds.\n";
FailFlag = 1;
}
// Test that, with the right flags on the kernel launch, that we prevent
// incomplete launches from serializing the cooperative launch streams.
if (overlapped_time > 2.2 * single_kernel_time ||
overlapped_time < 1.8 * single_kernel_time) {
std::cerr << "ERROR!" << std::endl;
std::cerr << "Test 5, the case where normal and cooperative kernel ";
std::cerr << "launches should overlap, does not appear to have done so.";
std::cerr << std::endl;
std::cerr << "Was " << overlapped_time.count() << " seconds.\n";
std::cerr << "Expected between ";
std::cerr << (1.8 * single_kernel_time.count()) << " and ";
std::cerr << (2.2 * single_kernel_time.count()) << " seconds.\n";
std::cerr << "Is the normal kernel being serialized with the ";
std::cerr << "cooperative kernels on different streams?" << std::endl;
FailFlag = 1;
}
for (int k = 0; k < 2; ++k) {
HIPCHECK(hipFree(dev_array[k]));
HIPCHECK(hipStreamDestroy(streams[k]));
}
if (FailFlag == 1) {
break;
}
}
if (FailFlag == 1) {
failed("");
} else {
passed();
}
}
+374
Просмотреть файл
@@ -0,0 +1,374 @@
/*
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Test Description:
/*The general idea of the application is to launch N warps to all GPUs detected
in the HIP system. N is a command-line parameter, but the user should set N
small enough that all warps can be on each of the GPUs at the same time.
All of the warps do a "work loop". Within the work loop, every warp
atomically increments a global variable that is shared between both of the
target GPUs. The value returned from this atomic increment entirely depends
on the order the warps from the GPUs arrive at the atomic instruction. Each
warp then stores the result into a global array based on its warp ID.
We also add a sleep/wait loop into the code so that the last warp runs much
slower than everyone else. As such, it should store much larger values than
all the other warps.
If there is no barrier within the loop, then warp 0 will likely get to the
global variable the first time while all the other warps have each
incremented it many times. If the barrier properly works, then each warp
will increment the variable once per time through the loop, and all threads
will sleep on the barrier waiting for the last warp to finally catch up.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -rdc=true -gencode arch=compute_60,code=sm_60
* TEST: %t
* HIT_END
*/
#include <hip/hip_runtime.h>
#include <hip/hip_cooperative_groups.h>
#include "test_common.h"
// Reports whether device `device_id` advertises both single-device and
// multi-device cooperative launch support.
//
// Support is checked twice: once through hipDeviceGetAttribute() and once
// through the fields of hipDeviceProp_t. Any missing capability prints an
// explanation to std::cerr.
//
// Returns 1 when every check passes, 0 otherwise. HIP API errors are handled
// by HIPCHECK.
static int cooperative_groups_support(int device_id) {
  int cooperative_attribute = 0;
  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
      hipDeviceAttributeCooperativeLaunch, device_id));
  if (!cooperative_attribute) {
    std::cerr << "Cooperative launch support not available in ";
    std::cerr << "the device attribute for device " << device_id;
    std::cerr << std::endl;
    return 0;
  }

  int multi_gpu_cooperative_attribute = 0;
  HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute,
      hipDeviceAttributeCooperativeMultiDeviceLaunch, device_id));
  if (!multi_gpu_cooperative_attribute) {
    std::cerr << "Multi-GPU cooperative launch support not available in ";
    std::cerr << "the device attribute for device " << device_id;
    std::cerr << std::endl;
    return 0;
  }

  // Cross-check the same capabilities as reported by the device properties.
  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  if (device_properties.cooperativeLaunch == 0) {
    std::cerr << "Cooperative group support not available in ";
    std::cerr << "device properties." << std::endl;
    return 0;
  }
  if (device_properties.cooperativeMultiDeviceLaunch == 0) {
    std::cerr << "Multi-GPU cooperative group support not available in ";
    std::cerr << "device properties." << std::endl;
    return 0;
  }
  return 1;
}
// Checks one device's output buffer for evidence of a failed grid barrier.
//
// `host_buffer` is read as `loops` consecutive groups of `warps` entries.
// No entry in group k (counting from 0) may exceed (k + 1) * warps * num_devs.
//
// Returns 0 (after printing a success line to std::cout) when every entry is
// within its bound, or -1 (after describing the first offending entry on
// std::cerr) otherwise.
static int verify_barrier_buffer(unsigned int loops, unsigned int warps,
                                 unsigned int *host_buffer,
                                 unsigned int num_devs) {
  const unsigned int bound_step = warps * num_devs;
  unsigned int bound = 0;   // upper limit for the group being examined
  unsigned int entry = 0;   // flat index into host_buffer

  for (unsigned int group = 0; group < loops; ++group) {
    bound += bound_step;
    for (unsigned int w = 0; w < warps; ++w, ++entry) {
      const unsigned int observed = host_buffer[entry];
      if (observed <= bound) {
        continue;
      }
      std::cerr << "Barrier failure!" << std::endl
                << " Buffer entry " << entry
                << " contains the value " << observed
                << " but it should not be more than "
                << bound << std::endl;
      return -1;
    }
  }

  std::cout << "\tBarriers work properly!" << std::endl;
  return 0;
}
// Checks one entry of the multi-GPU shared array against the value it should
// hold after `loops` iterations.
//
// The expected value starts at 0 and is updated once per iteration: +2 on
// even iterations, *2 on odd iterations. The expected value is printed to
// std::cout.
//
// Returns 0 when `array_val` matches, -1 (with a diagnostic on std::cerr)
// otherwise.
//
// NOTE(review): test_kernel alternates on `i % mgrid.num_grids()` while this
// function alternates on `i % 2`. The two agree for the loops = 2 used by
// main(); confirm before raising the loop count on systems with more than
// two devices.
static int verify_multi_gpu_buffer(unsigned int loops, unsigned int array_val) {
  unsigned int desired_val = 0;
  // Unsigned index: matches the type of `loops`, avoiding a signed/unsigned
  // comparison.
  for (unsigned int i = 0; i < loops; i++) {
    if (i % 2 == 0) {
      desired_val += 2;
    } else {
      desired_val *= 2;
    }
  }
  std::cout << "Desired value is " << desired_val << std::endl;

  if (array_val != desired_val) {
    std::cerr << "ERROR! Multi-grid barrier does not appear to work.";
    std::cerr << std::endl;
    std::cerr << "Expected the multi-GPUs to work together to produce ";
    std::cerr << "the value " << desired_val << std::endl;
    std::cerr << "However, the entry returned from the multi-GPU ";
    std::cerr << "kernel was " << array_val << std::endl;
    return -1;
  }
  std::cout << "\tMulti-GPU barriers appear to work here." << std::endl;
  return 0;
}
// Barrier-test kernel, launched on every device as part of one multi-device
// cooperative launch.
//
//   atomic_val    counter that thread 0 of each block increments with
//                 atomicInc once per iteration
//   global_array  one entry per grid; updated by the last thread of each grid
//   array         output buffer; each block stores one atomicInc result per
//                 iteration, at index blockIdx.x + iteration * gridDim.x
//   loops         number of iterations to run
//
// Each iteration has two phases: a per-grid phase ended by grid.sync() and a
// multi-grid phase ended by mgrid.sync(). In each phase one thread is delayed
// on purpose so that a broken barrier shows up in the results.
__global__ void
test_kernel(unsigned int *atomic_val, unsigned int *global_array,
unsigned int *array, uint32_t loops) {
cooperative_groups::grid_group grid = cooperative_groups::this_grid();
cooperative_groups::multi_grid_group mgrid =
cooperative_groups::this_multi_grid();
unsigned rank = grid.thread_rank();
unsigned global_rank = mgrid.thread_rank();
// Index of this block's output slot for the current iteration.
int offset = blockIdx.x;
for (int i = 0; i < loops; i++) {
// Make the last thread run way behind everyone else.
// If the grid barrier below fails, then the other threads may hit the
// atomicInc instruction many times before the last thread ever gets
// to it.
// As such, without the barrier, the last array entry will eventually
// contain a very large value, defined by however many times the other
// wavefronts make it through this loop.
// If the barrier works, then it will likely contain some number
// near "total number of blocks". It will be the last wavefront to
// reach the atomicInc, but everyone will have only hit the atomic once.
if (rank == (grid.size() - 1)) {
long long start_clock = clock64();
while (clock64() < (start_clock+1000000)) {}
}
// One thread per block records the counter value returned by atomicInc.
if (threadIdx.x == 0) {
array[offset] = atomicInc(atomic_val, UINT_MAX);
}
grid.sync();
// Make the last thread in the entire multi-grid run way behind
// everyone else.
// If the mgrid barrier below fails, then the global_array entries
// will end up being out of sync, because the intermingling of adds
// and multiplies will not be aligned between the GPUs.
if (global_rank == (mgrid.size() - 1)) {
long long start_clock = clock64();
while (clock64() < (start_clock+100000000)) {}
}
// When i is a multiple of the number of grids, add 2 to this grid's own
// array entry.
// Otherwise, double the entry at (grid_rank + i) % num_grids, which is the
// partner's entry when there are exactly two grids.
unsigned grid_rank = mgrid.grid_rank();
unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids();
// Only the last thread of each grid touches global_array.
if (rank == (grid.size() - 1)) {
if (i % mgrid.num_grids() == 0) {
global_array[grid_rank] += 2;
} else {
global_array[inter_gpu_offset] *= 2;
}
}
mgrid.sync();
// Move on to this block's output slot for the next iteration.
offset += gridDim.x;
}
}
int main(int argc, char** argv) {
hipError_t err;
int num_devices = 0;
uint32_t loops = 2;
uint32_t warps = 10;
uint32_t block_size = 1;
std::cout << "Loops: " << loops << std::endl;
std::cout << "Warps: " << warps << std::endl;
std::cout << "Block size: " << block_size << std::endl;
HIPCHECK(hipGetDeviceCount(&num_devices));
if (num_devices < 2) {
std::cout << "Not enough GPUs to run test." << std::endl;
std::cout << "We require at least 2 GPUs, but only found ";
std::cout << num_devices << std::endl;
std::cout << "Skipping the test with PASSED result\n";
passed();
}
uint32_t device_num[num_devices];
/*************************************************************************/
/* Test whether target device supports cooperative groups ****************/
for (int i = 0; i < num_devices; i++) {
device_num[i] = i;
if (!cooperative_groups_support(device_num[i])) {
std::cout << "Skipping the test with Pass result.\n";
passed();
}
}
/*************************************************************************/
/* Test whether the requested size will fit on the GPU *******************/
int warp_sizes[num_devices];
int num_sms[num_devices];
hipDeviceProp_t device_properties[num_devices];
int warp_size = INT_MAX;
int num_sm = INT_MAX;
for (int i = 0; i < num_devices; i++) {
HIPCHECK(hipGetDeviceProperties(&device_properties[i], device_num[i]));
warp_sizes[i] = device_properties[i].warpSize;
if (warp_sizes[i] < warp_size) {
warp_size = warp_sizes[i];
}
num_sms[i] = device_properties[i].multiProcessorCount;
if (num_sms[i] < num_sm) {
num_sm = num_sms[i];
}
std::cout << "Device " << (i + 1);
std::cout << " name: " << device_properties[i].name << std::endl;
}
std::cout << std::endl;
int num_threads_in_block = block_size * warp_size;
// Calculate the device occupancy to know how many blocks can be run.
int max_blocks_per_sm_arr[num_devices];
int max_blocks_per_sm = INT_MAX;
for (int i = 0; i < num_devices; i++) {
HIPCHECK(hipSetDevice(device_num[i]));
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block, 0));
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
max_blocks_per_sm = max_blocks_per_sm_arr[i];
}
}
int requested_blocks = warps / block_size;
if (requested_blocks > max_blocks_per_sm * num_sm) {
std::cerr << "Requesting to run " << requested_blocks << " blocks, ";
std::cerr << "but we can only guarantee to simultaneously run ";
std::cerr << (max_blocks_per_sm * num_sm) << std::endl;
failed("");
}
/*************************************************************************/
/* Set up data to pass into the kernel ***********************************/
// Each block will output a single value per loop.
uint32_t total_buffer_len = requested_blocks*loops;
// Alocate the buffer that will hold the kernel's output, and which will
// also be used to globally synchronize during GWS initialization
unsigned int *host_buffer[num_devices];
unsigned int *kernel_buffer[num_devices];
unsigned int *kernel_atomic[num_devices];
hipStream_t streams[num_devices];
for (int i = 0; i < num_devices; i++) {
host_buffer[i] = (unsigned int*)calloc(total_buffer_len,
sizeof(unsigned int));
HIPCHECK(hipSetDevice(device_num[i]));
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_buffer[i]),
total_buffer_len * sizeof(unsigned int)));
HIPCHECK(hipMemcpy(kernel_buffer[i], host_buffer[i],
total_buffer_len * sizeof(unsigned int),
hipMemcpyHostToDevice));
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_atomic[i]),
sizeof(unsigned int)));
HIPCHECK(hipMemset(kernel_atomic[i], 0, sizeof(unsigned int)));
HIPCHECK(hipStreamCreate(&streams[i]));
}
// Single kernel atomic shared between both devices; put it on the host
unsigned int* global_array;
HIPCHECK(hipHostMalloc(reinterpret_cast<void**>(&global_array),
num_devices * sizeof(unsigned int), 0));
HIPCHECK(hipMemset(global_array, 0, num_devices * sizeof(unsigned int)));
/*************************************************************************/
/* Launch the kernels ****************************************************/
std::cout << "Launching a kernel with " << warps << " warps ";
std::cout << "in " << requested_blocks << " thread blocks.";
std::cout << std::endl;
void *dev_params[num_devices][4];
hipLaunchParams md_params[num_devices];
for (int i = 0; i < num_devices; i++) {
dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
dev_params[i][1] = reinterpret_cast<void*>(&global_array);
dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
dev_params[i][3] = reinterpret_cast<void*>(&loops);
md_params[i].func = reinterpret_cast<void*>(test_kernel);
md_params[i].gridDim = requested_blocks;
md_params[i].blockDim = num_threads_in_block;
md_params[i].sharedMem = 0;
md_params[i].stream = streams[i];
md_params[i].args = dev_params[i];
}
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, num_devices, 0));
HIPCHECK(hipDeviceSynchronize());
/*************************************************************************/
/* Read back the buffers and print out its data **************************/
for (int dev = 0; dev < num_devices; dev++) {
HIPCHECK(hipMemcpy(host_buffer[dev], kernel_buffer[dev],
total_buffer_len * sizeof(unsigned int),
hipMemcpyDeviceToHost));
}
for (unsigned int i = 0; i < loops; i++) {
for (int dev = 0; dev < num_devices; dev++) {
std::cout << "+++++++++++++++++ Device " << dev;
std::cout << "+++++++++++++++++" << std::endl;
for (unsigned int j = 0; j < requested_blocks; j++) {
std::cout << "Buffer entry " << (i*warps+j);
std::cout << " (written by warp " << j << ")";
std::cout << " is " << host_buffer[dev][i*requested_blocks+j];
std::cout << std::endl;
}
}
std::cout << "==========================\n";
}
for (unsigned int dev = 0; dev < num_devices; dev++) {
std::cout << "Testing output from device " << dev << std::endl;
int local_ret_val = verify_barrier_buffer(loops, requested_blocks,
host_buffer[dev], num_devices);
if (local_ret_val) {
failed("");
}
}
std::cout << std::endl << "The multi-GPU shared updates contain:\n";
for (int i = 0; i < num_devices; i++) {
std::cout << "Entry " << i << ": ";
std::cout << global_array[i] << std::endl;
}
int flag = 0;
for (int dev = 0; dev < num_devices; dev++) {
std::cout << "Testing multi-GPU output for entry " << dev << std::endl;
int local_ret_val = verify_multi_gpu_buffer(loops, global_array[dev]);
if (local_ret_val) {
flag = 1;
}
}
for (int k = 0; k < num_devices; ++k) {
HIPCHECK(hipFree(kernel_buffer[k]));
HIPCHECK(hipFree(kernel_atomic[k]));
HIPCHECK(hipStreamDestroy(streams[k]));
free(host_buffer[k]);
}
if (flag == 1) {
failed("");
} else {
passed();
}
}
+233
Просмотреть файл
@@ -0,0 +1,233 @@
/*
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Test Description:
/*The general idea of the application is to launch N warps. N is a command-line
parameter, but the user should set N small enough that all warps can be on
the GPU at the same time.
All of the warps do a "work loop". Within the work loop, every warp
atomically increments a global variable. The value returned from this atomic
increment entirely depends on the order the threads arrive at the atomic
instruction. Each warp then stores the result into a global array based on its
warp ID.
We also add a sleep/wait loop into the code so that the last warp runs much
slower than everyone else. As such, it should store much larger values than
all the other warps.
If there is no barrier within the loop, then the last warp will likely get to
the global variable the first time after all the other warps have each
incremented it many times. If the barrier properly works, then each warp
will increment the variable once per time through the loop, and all threads
will sleep on the barrier waiting for the last warp to finally catch up.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
#include <hip/hip_runtime.h>
#include <hip/hip_cooperative_groups.h>
#include "test_common.h"
// Reports whether device `device_id` advertises cooperative-launch support.
//
// Two sources are consulted: the hipDeviceAttributeCooperativeLaunch device
// attribute and the cooperativeLaunch field of hipDeviceProp_t. A diagnostic
// is written to std::cerr when either of them reports no support.
//
// Returns 1 when both report support and 0 otherwise. The HIP API calls are
// wrapped in HIPCHECK (from test_common.h), which owns their error handling.
static int cooperative_groups_support(int device_id) {
  // Initialised so the value is well defined even if the query leaves it
  // untouched.
  int cooperative_attribute = 0;
  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
                                 hipDeviceAttributeCooperativeLaunch,
                                 device_id));
  if (!cooperative_attribute) {
    std::cerr << "Cooperative launch support not available in ";
    std::cerr << "the device attribute for device " << device_id;
    std::cerr << std::endl;
    return 0;
  }

  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  if (device_properties.cooperativeLaunch == 0) {
    std::cerr << "Cooperative group support not available in ";
    std::cerr << "device properties." << std::endl;
    return 0;
  }
  return 1;
}
// Validates the values the kernel recorded for every loop iteration.
//
// `host_buffer` holds `loops` consecutive groups of `warps` entries. An entry
// belonging to iteration k (0-based) is accepted as long as it does not exceed
// (k + 1) * warps; a larger value means a warp got ahead of the barrier.
//
// Returns 0 when every entry is within its bound, -1 on the first violation
// (after reporting it on std::cerr).
static int verify_barrier_buffer(unsigned int loops, unsigned int warps,
                                 unsigned int *host_buffer) {
  for (unsigned int iter = 0; iter < loops; ++iter) {
    // Largest value any entry of this iteration is allowed to hold.
    const unsigned int upper_bound = (iter + 1) * warps;
    for (unsigned int lane = 0; lane < warps; ++lane) {
      const unsigned int slot = iter * warps + lane;
      const unsigned int observed = host_buffer[slot];
      if (observed <= upper_bound) {
        continue;
      }
      std::cerr << "Barrier failure!" << std::endl;
      std::cerr << "    Buffer entry " << slot;
      std::cerr << " contains the value " << observed;
      std::cerr << " but it should not be more than ";
      std::cerr << upper_bound << std::endl;
      return -1;
    }
  }
  std::cout << "Barriers work properly!" << std::endl;
  return 0;
}
// Cooperative kernel used to check that grid.sync() really is a grid-wide
// barrier.
//
//   atomic_val - counter incremented with atomicInc once per block per
//                iteration (by the thread with threadIdx.x == 0).
//   array      - receives the value returned by atomicInc; a block writes
//                slot blockIdx.x on the first iteration and advances by
//                gridDim.x on every following one.
//   loops      - number of iterations of the work loop.
__global__ void
test_kernel(unsigned int *atomic_val, unsigned int *array,
            unsigned int loops) {
  cooperative_groups::grid_group whole_grid = cooperative_groups::this_grid();
  unsigned my_rank = whole_grid.thread_rank();
  // The highest-ranked thread of the grid is deliberately slowed down below.
  const bool is_straggler = (my_rank == (whole_grid.size() - 1));
  int slot = blockIdx.x;
  for (unsigned int iter = 0; iter < loops; ++iter) {
    // Hold the straggler back. With a broken barrier the other threads could
    // reach atomicInc many times before it arrives, so its array entry would
    // end up holding a very large value. With a working barrier every block
    // hits the atomic exactly once per iteration and the entry stays near
    // "total number of blocks".
    if (is_straggler) {
      long long wait_until = clock64();
      wait_until += 1000000;
      while (clock64() < wait_until) {}
    }
    if (threadIdx.x == 0) {
      array[slot] = atomicInc(atomic_val, UINT_MAX);
    }
    whole_grid.sync();
    slot += gridDim.x;
  }
}
// Runs the grid-barrier test on every HIP device in the system.
//
// For each device the test checks cooperative-launch support, makes sure the
// requested number of blocks can be resident at once, launches test_kernel
// cooperatively, copies the result buffer back and validates it with
// verify_barrier_buffer.
//
// The verdict is issued once, after all devices have been exercised.
// NOTE(review): passed()/failed() come from test_common.h and appear to
// terminate the process; calling passed() inside the device loop would then
// stop the test after the first device, so the result is accumulated instead.
int main(int argc, char** argv) {
  int device_num = 0;
  uint32_t loops = 2;
  uint32_t warps = 10;
  uint32_t block_size = 1;
  // Becomes true as soon as one device produces an invalid buffer.
  bool any_failure = false;
  HIPCHECK(hipGetDeviceCount(&device_num));
  for (int dev = 0; dev < device_num; ++dev) {
    std::cout << "Device number: " << dev << std::endl;
    std::cout << "Loops: " << loops << std::endl;
    std::cout << "Warps: " << warps << std::endl;
    std::cout << "Block size: " << block_size << std::endl;
    /*************************************************************************/
    /* Test whether target device supports cooperative groups ****************/
    HIPCHECK(hipSetDevice(dev));
    if (!cooperative_groups_support(dev)) {
      // Skip only this device; the remaining devices are still tested and an
      // earlier failure is not overwritten by an early pass.
      std::cout << "Skipping the test with Pass result.\n";
      continue;
    }
    /*************************************************************************/
    /* Test whether the requested size will fit on the GPU *******************/
    int warp_size;
    int num_sms;
    int max_blocks_per_sm;
    hipDeviceProp_t device_properties;
    HIPCHECK(hipGetDeviceProperties(&device_properties, dev));
    warp_size = device_properties.warpSize;
    num_sms = device_properties.multiProcessorCount;
    std::cout << "Device name: " << device_properties.name << std::endl;
    std::cout << std::endl;
    int num_threads_in_block = block_size * warp_size;
    // Calculate the device occupancy to know how many blocks can be run.
    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
        test_kernel, num_threads_in_block, 0));
    int requested_blocks = warps / block_size;
    if (requested_blocks > max_blocks_per_sm * num_sms) {
      std::cerr << "Requesting to run " << requested_blocks << " blocks, ";
      std::cerr << "but we can only guarantee to simultaneously run ";
      std::cerr << (max_blocks_per_sm * num_sms) << std::endl;
      failed("");
    }
    /*************************************************************************/
    /* Set up data to pass into the kernel ***********************************/
    // Each block will output a single value per loop.
    uint32_t total_buffer_len = requested_blocks*loops;
    // Allocate the zero-filled host buffer that receives the kernel's output;
    // it is also used to initialise the device buffer.
    unsigned int *host_buffer = (unsigned int*)calloc(total_buffer_len,
                                                      sizeof(unsigned int));
    if (host_buffer == NULL) {
      failed("Unable to allocate the host buffer");
    }
    unsigned int *kernel_buffer;
    HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_buffer),
                       total_buffer_len * sizeof(unsigned int)));
    HIPCHECK(hipMemcpy(kernel_buffer, host_buffer,
                       total_buffer_len * sizeof(unsigned int),
                       hipMemcpyHostToDevice));
    unsigned int *kernel_atomic;
    HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_atomic),
                       sizeof(unsigned int)));
    HIPCHECK(hipMemset(kernel_atomic, 0, sizeof(unsigned int)));
    /*************************************************************************/
    /* Launch the kernel *****************************************************/
    std::cout << "Launching a kernel with " << warps << " warps ";
    std::cout << "in " << requested_blocks << " thread blocks.";
    std::cout << std::endl;
    // Argument slots must match test_kernel's parameter order.
    void *params[3];
    params[0] = reinterpret_cast<void*>(&kernel_atomic);
    params[1] = reinterpret_cast<void*>(&kernel_buffer);
    params[2] = reinterpret_cast<void*>(&loops);
    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
                                        requested_blocks,
                                        num_threads_in_block, params, 0, NULL));
    /*************************************************************************/
    /* Read back the buffer and print out its data****************************/
    HIPCHECK(hipMemcpy(host_buffer, kernel_buffer,
                       total_buffer_len * sizeof(unsigned int),
                       hipMemcpyDeviceToHost));
    for (unsigned int i = 0; i < loops; i++) {
      for (unsigned int j = 0; j < requested_blocks; j++) {
        std::cout << "Buffer entry " << (i*warps+j);
        std::cout << " (written by warp " << j << ")";
        std::cout << " is " << host_buffer[i * requested_blocks + j];
        std::cout << std::endl;
      }
      std::cout << "==========================\n";
    }
    int ret_val = verify_barrier_buffer(loops, requested_blocks, host_buffer);
    // Release everything owned by this iteration before moving on.
    HIPCHECK(hipFree(kernel_buffer));
    HIPCHECK(hipFree(kernel_atomic));
    free(host_buffer);
    if (ret_val == -1) {
      any_failure = true;
    }
  }
  if (any_failure) {
    failed("");
  } else {
    passed();
  }
}
+374
Просмотреть файл
@@ -0,0 +1,374 @@
/*
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Test Description:
/*The general idea of the application is to launch N warps to each of two GPUs.
N is a command-line parameter, but the user should set N small enough that all
warps can be on each of the GPUs at the same time.
All of the warps do a "work loop". Within the work loop, every warp
atomically increments a global variable that is shared between both of the
target GPUs. The value returned from this atomic increment entirely depends
on the order the warps from the GPUs arrive at the atomic instruction. Each
warp then stores the result into a global array based on its warp ID.
We also add a sleep/wait loop into the code so that the last warp runs much
slower than everyone else. As such, it should store much larger values than
all the other warps.
If there is no barrier within the loop, then warp 0 will likely get to the
global variable the first time while all the other warps have each
incremented it many times. If the barrier properly works, then each warp
will increment the variable once per time through the loop, and all threads
will sleep on the barrier waiting for the last warp to finally catch up.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -rdc=true -gencode arch=compute_60,code=sm_60
* TEST: %t
* HIT_END
*/
#include <hip/hip_runtime.h>
#include <hip/hip_cooperative_groups.h>
#include "test_common.h"
// Reports whether device `device_id` advertises both single-device and
// multi-device cooperative-launch support.
//
// Four indicators are checked in order: the
// hipDeviceAttributeCooperativeLaunch and
// hipDeviceAttributeCooperativeMultiDeviceLaunch attributes, then the
// cooperativeLaunch and cooperativeMultiDeviceLaunch fields of
// hipDeviceProp_t. The first one that reports no support produces a
// diagnostic on std::cerr and ends the check.
//
// Returns 1 when all four report support and 0 otherwise. The HIP API calls
// are wrapped in HIPCHECK (from test_common.h), which owns their error
// handling.
static int cooperative_groups_support(int device_id) {
  // Initialised so the values are well defined even if a query leaves them
  // untouched.
  int cooperative_attribute = 0;
  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
                                 hipDeviceAttributeCooperativeLaunch,
                                 device_id));
  if (!cooperative_attribute) {
    std::cerr << "Cooperative launch support not available in ";
    std::cerr << "the device attribute for device " << device_id;
    std::cerr << std::endl;
    return 0;
  }

  int multi_gpu_cooperative_attribute = 0;
  HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute,
                                 hipDeviceAttributeCooperativeMultiDeviceLaunch,
                                 device_id));
  if (!multi_gpu_cooperative_attribute) {
    std::cerr << "Multi-GPU cooperative launch support not available in ";
    std::cerr << "the device attribute for device " << device_id;
    std::cerr << std::endl;
    return 0;
  }

  hipDeviceProp_t device_properties;
  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
  if (device_properties.cooperativeLaunch == 0) {
    std::cerr << "Cooperative group support not available in ";
    std::cerr << "device properties." << std::endl;
    return 0;
  }
  if (device_properties.cooperativeMultiDeviceLaunch == 0) {
    std::cerr << "Multi-GPU cooperative group support not available in ";
    std::cerr << "device properties." << std::endl;
    return 0;
  }
  return 1;
}
// Validates the values one device recorded for every loop iteration.
//
// `host_buffer` holds `loops` consecutive groups of `warps` entries. An entry
// belonging to iteration k (0-based) is accepted as long as it does not exceed
// (k + 1) * warps * num_devs; a larger value means a warp got ahead of the
// barrier.
//
// Returns 0 when every entry is within its bound, -1 on the first violation
// (after reporting it on std::cerr).
static int verify_barrier_buffer(unsigned int loops, unsigned int warps,
                                 unsigned int *host_buffer,
                                 unsigned int num_devs) {
  // Amount by which the permitted maximum grows per iteration.
  const unsigned int per_iteration = warps * num_devs;
  for (unsigned int iter = 0; iter < loops; ++iter) {
    const unsigned int upper_bound = (iter + 1) * per_iteration;
    for (unsigned int lane = 0; lane < warps; ++lane) {
      const unsigned int slot = iter * warps + lane;
      const unsigned int observed = host_buffer[slot];
      if (observed <= upper_bound) {
        continue;
      }
      std::cerr << "Barrier failure!" << std::endl;
      std::cerr << "    Buffer entry " << slot;
      std::cerr << " contains the value " << observed;
      std::cerr << " but it should not be more than ";
      std::cerr << upper_bound << std::endl;
      return -1;
    }
  }
  std::cout << "\tBarriers work properly!" << std::endl;
  return 0;
}
// Checks one entry of the array shared between the GPUs.
//
// The expected value is rebuilt on the host by replaying the update pattern:
// starting from 0, add 2 on even steps and double on odd steps, for `loops`
// steps. The expected value is always printed.
//
// Returns 0 when `array_val` equals the expected value, -1 otherwise (after
// reporting the mismatch on std::cerr).
static int verify_multi_gpu_buffer(unsigned int loops, unsigned int array_val) {
  unsigned int expected = 0;
  for (unsigned int step = 0; step < loops; ++step) {
    expected = (step % 2 == 0) ? (expected + 2) : (expected * 2);
  }
  std::cout << "Desired value is " << expected << std::endl;

  if (array_val == expected) {
    std::cout << "\tMulti-GPU barriers appear to work here." << std::endl;
    return 0;
  }

  std::cerr << "ERROR! Multi-grid barrier does not appear to work.";
  std::cerr << std::endl;
  std::cerr << "Expected the multi-GPUs to work together to produce ";
  std::cerr << "the value " << expected << std::endl;
  std::cerr << "However, the entry returned from the multi-GPU ";
  std::cerr << "kernel was " << array_val << std::endl;
  return -1;
}
// Cooperative multi-device kernel that exercises both the per-grid barrier
// (grid.sync) and the multi-grid barrier (mgrid.sync).
//
// Parameters:
//   atomic_val   - counter incremented with atomicInc once per block per
//                  iteration (by the thread with threadIdx.x == 0).
//   global_array - indexed by grid rank; updated once per iteration by the
//                  last thread of each grid using a plain (non-atomic)
//                  read-modify-write.
//   array        - receives the value returned by atomicInc; a block writes
//                  slot blockIdx.x on the first iteration and advances by
//                  gridDim.x on every following one.
//   loops        - number of iterations of the work loop.
__global__ void
test_kernel(unsigned int *atomic_val, unsigned int *global_array,
unsigned int *array, uint32_t loops) {
cooperative_groups::grid_group grid = cooperative_groups::this_grid();
cooperative_groups::multi_grid_group mgrid =
cooperative_groups::this_multi_grid();
// Rank of this thread within its own grid and within the whole multi-grid.
unsigned rank = grid.thread_rank();
unsigned global_rank = mgrid.thread_rank();
// Index into `array` for this block; advanced by gridDim.x per iteration.
int offset = blockIdx.x;
for (int i = 0; i < loops; i++) {
// Make the last thread run way behind everyone else.
// If the grid barrier below fails, then the other threads may hit the
// atomicInc instruction many times before the last thread ever gets
// to it.
// As such, without the barrier, the last array entry will eventually
// contain a very large value, defined by however many times the other
// wavefronts make it through this loop.
// If the barrier works, then it will likely contain some number
// near "total number of blocks". It will be the last wavefront to
// reach the atomicInc, but everyone will have only hit the atomic once.
if (rank == (grid.size() - 1)) {
long long start_clock = clock64();
while (clock64() < (start_clock + 1000000)) {}
}
// One thread per block records the counter value it observed.
if (threadIdx.x == 0) {
array[offset] = atomicInc(atomic_val, UINT_MAX);
}
grid.sync();
// Make the last thread in the entire multi-grid run way behind
// everyone else.
// If the mgrid barrier below fails, then the two global_array entries
// will end up being out of sync, because the intermingling of adds
// and multiplies will not be aligned between the two GPUs.
if (global_rank == (mgrid.size() - 1)) {
long long start_clock = clock64();
while (clock64() < (start_clock + 100000000)) {}
}
// During even iterations, add into your own array entry
// During odd iterations, add into your partner's array entry
// NOTE(review): the branch below tests i % mgrid.num_grids(), which matches
// the even/odd wording above only when exactly two grids take part, and the
// "odd" case doubles the partner's entry rather than adding to it.
unsigned grid_rank = mgrid.grid_rank();
unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids();
// Only the last thread of each grid touches global_array.
if (rank == (grid.size() - 1)) {
if (i % mgrid.num_grids() == 0) {
global_array[grid_rank] += 2;
} else {
global_array[inter_gpu_offset] *= 2;
}
}
mgrid.sync();
offset += gridDim.x;
}
}
int main(int argc, char** argv) {
hipError_t err;
int device_num = 0, flag = 0;
uint32_t loops = 2;
uint32_t warps = 10;
uint32_t block_size = 1;
HIPCHECK(hipGetDeviceCount(&device_num));
if (device_num < 2) {
std::cout << "This test needs atleast two gpus but found only";
std::cout << device_num << std::endl;
std::cout << "Hence skipping the test with pass result\n";
passed();
}
for (int d = 0; d < (device_num - 1); ++d) {
std::cout << "First device number: " << d << std::endl;
std::cout << "Second device number: " << (d + 1) << std::endl;
std::cout << "Loops: " << loops << std::endl;
std::cout << "Warps: " << warps << std::endl;
std::cout << "Block size: " << block_size << std::endl;
/*************************************************************************/
/* Test whether target device supports cooperative groups ****************/
for (int i = 0; i < 2; i++) {
if (!cooperative_groups_support((d + i))) {
std::cout << "Skipping the test with Pass result.\n";
passed();
}
}
/*************************************************************************/
/* Test whether the requested size will fit on the GPU *******************/
int warp_sizes[2];
int num_sms[2];
hipDeviceProp_t device_properties[2];
int warp_size = INT_MAX;
int num_sm = INT_MAX;
for (int i = 0; i < 2; i++) {
HIPCHECK(hipGetDeviceProperties(&device_properties[i], (d + i)));
warp_sizes[i] = device_properties[i].warpSize;
if (warp_sizes[i] < warp_size) {
warp_size = warp_sizes[i];
}
num_sms[i] = device_properties[i].multiProcessorCount;
if (num_sms[i] < num_sm) {
num_sm = num_sms[i];
}
std::cout << "Device " << (d + i);
std::cout << " name: " << device_properties[i].name << std::endl;
}
std::cout << std::endl;
int num_threads_in_block = block_size * warp_size;
// Calculate the device occupancy to know how many blocks can be run.
int max_blocks_per_sm_arr[2];
int max_blocks_per_sm = INT_MAX;
for (int i = 0; i < 2; i++) {
HIPCHECK(hipSetDevice((d + i)));
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block,
0));
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
max_blocks_per_sm = max_blocks_per_sm_arr[i];
}
}
int requested_blocks = warps / block_size;
if (requested_blocks > max_blocks_per_sm * num_sm) {
std::cerr << "Requesting to run " << requested_blocks << " blocks, ";
std::cerr << "but we can only guarantee to simultaneously run ";
std::cerr << (max_blocks_per_sm * num_sm) << std::endl;
failed("");
}
/*************************************************************************/
/* Set up data to pass into the kernel ***********************************/
// Each block will output a single value per loop.
uint32_t total_buffer_len = requested_blocks*loops;
// Alocate the buffer that will hold the kernel's output, and which will
// also be used to globally synchronize during GWS initialization
unsigned int *host_buffer[2];
unsigned int *kernel_buffer[2];
unsigned int *kernel_atomic[2];
hipStream_t streams[2];
for (int i = 0; i < 2; i++) {
host_buffer[i] = (unsigned int*)calloc(total_buffer_len,
sizeof(unsigned int));
HIPCHECK(hipSetDevice((d + i)));
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_buffer[i]),
total_buffer_len * sizeof(unsigned int)));
HIPCHECK(hipMemcpy(kernel_buffer[i], host_buffer[i],
total_buffer_len * sizeof(unsigned int), hipMemcpyHostToDevice));
HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_atomic[i]),
sizeof(unsigned int)));
HIPCHECK(hipMemset(kernel_atomic[i], 0, sizeof(unsigned int)));
HIPCHECK(hipStreamCreate(&streams[i]));
}
// Single kernel atomic shared between both devices; put it on the host
unsigned int* global_array;
HIPCHECK(hipHostMalloc(reinterpret_cast<void**>(&global_array),
2 * sizeof(unsigned int), 0));
HIPCHECK(hipMemset(global_array, 0, 2 * sizeof(unsigned int)));
/*************************************************************************/
/* Launch the kernels ****************************************************/
std::cout << "Launching a kernel with " << warps << " warps ";
std::cout << "in " << requested_blocks << " thread blocks.";
std::cout << std::endl;
void *dev_params[2][4];
hipLaunchParams md_params[2];
for (int i = 0; i < 2; i++) {
dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
dev_params[i][1] = reinterpret_cast<void*>(&global_array);
dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
dev_params[i][3] = reinterpret_cast<void*>(&loops);
md_params[i].func = reinterpret_cast<void*>(test_kernel);
md_params[i].gridDim = requested_blocks;
md_params[i].blockDim = num_threads_in_block;
md_params[i].sharedMem = 0;
md_params[i].stream = streams[i];
md_params[i].args = dev_params[i];
}
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
HIPCHECK(hipDeviceSynchronize());
/*************************************************************************/
/* Read back the buffers and print out its data **************************/
for (int dev = 0; dev < 2; dev++) {
HIPCHECK(hipMemcpy(host_buffer[d + dev], kernel_buffer[d + dev],
total_buffer_len * sizeof(unsigned int),
hipMemcpyDeviceToHost));
}
for (unsigned int i = 0; i < loops; i++) {
for (int dev = 0; dev < 2; dev++) {
std::cout << "+++++++++++++++++ Device " << (d + dev);
std::cout << "+++++++++++++++++" << std::endl;
for (unsigned int j = 0; j < requested_blocks; j++) {
std::cout << "Buffer entry " << (i * warps + j);
std::cout << " (written by warp " << j << ")";
std::cout << " is " << host_buffer[dev][i * requested_blocks + j];
std::cout << std::endl;
}
}
std::cout << "==========================\n";
}
for (unsigned int dev = 0; dev < 2; dev++) {
std::cout << "Testing output from device " << (d + dev) << std::endl;
int local_ret_val = verify_barrier_buffer(loops, requested_blocks,
host_buffer[dev], 2);
if (local_ret_val == -1) {
flag = 1;
}
}
std::cout << std::endl << "The multi-GPU shared updates contain:";
std::cout << std::endl;
for (int i = 0; i < 2; i++) {
std::cout << "Entry " << i << ": ";
std::cout << global_array[i] << std::endl;
}
for (int dev = 0; dev < 2; dev++) {
std::cout << "Testing multi-GPU output for entry " << (d + dev);
std::cout << std::endl;
int local_ret_val = verify_multi_gpu_buffer(loops, global_array[dev]);
if (local_ret_val) {
flag = 1;
}
}
for (int k = 0; k < 2; ++k) {
HIPCHECK(hipFree(kernel_buffer[k]));
HIPCHECK(hipFree(kernel_atomic[k]));
HIPCHECK(hipStreamDestroy(streams[k]));
free(host_buffer[k]);
}
}
if (flag == 1) {
failed("");
} else {
passed();
}
}
+173 -173
Просмотреть файл
@@ -1,173 +1,173 @@
/*
* Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
/*
* Test to compare
* 1.pciBusID from hipDeviceGetPCIBusId and hipDeviceGetAttribute **
* 2.{pciDomainID, pciBusID, pciDeviceID} values hipDeviceGetPCIBusId vs lspci **
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
* TEST_NAMED: %t hipDeviceGetPCIBusId-vs-hipDeviceGetAttribute --tests 0x1
* TEST_NAMED: %t hipDeviceGetPCIBusId-vs-lspci --tests 0x2 EXCLUDE_HIP_PLATFORM nvcc
* HIT_END
*/
#include "test_common.h"
#define MAX_DEVICE_LENGTH 20
// Fills hipDeviceList[n] with the PCI bus id string that
// hipDeviceGetPCIBusId reports for device n, for n in [0, deviceCount).
// Each row provides MAX_DEVICE_LENGTH bytes. Always returns true; HIP errors
// are handled by HIPCHECK.
static bool getPciBusId(int deviceCount, char hipDeviceList[][MAX_DEVICE_LENGTH]) {
  int dev = 0;
  while (dev < deviceCount) {
    HIPCHECK(hipDeviceGetPCIBusId(hipDeviceList[dev], MAX_DEVICE_LENGTH, dev));
    ++dev;
  }
  return true;
}
// Compares, for every device, the bus number parsed from the string returned
// by hipDeviceGetPCIBusId with the value of hipDeviceAttributePciBusId.
//
// Returns true when the two agree for all devices, false if any device
// disagrees. The "matched" message is printed only in the former case.
bool comparePciBusIDWithHipDeviceGetAttribute() {
  bool testResult = true;
  int deviceCount = 0;
  HIPCHECK(hipGetDeviceCount(&deviceCount));
  HIPASSERT(deviceCount != 0);
  printf("No.of gpus in the system: %d\n", deviceCount);
  char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
  getPciBusId(deviceCount, hipDeviceList);
  for (int i = 0; i < deviceCount; i++) {
    // -1 sentinels: a field that sscanf fails to parse keeps its sentinel and
    // is then reported as a mismatch below.
    int pciBusID = -1;
    int pciDeviceID = -1;
    int pciDomainID = -1;
    int tempPciBusId = -1;
    // The id string is parsed as hexadecimal "domain:bus:device".
    sscanf(hipDeviceList[i], "%04x:%02x:%02x", &pciDomainID, &pciBusID,
           &pciDeviceID);
    HIPCHECK(hipDeviceGetAttribute(&tempPciBusId, hipDeviceAttributePciBusId, i));
    if (pciBusID != tempPciBusId) {
      testResult = false;
      printf("pciBusID from hipDeviceGetPCIBusId mismatched to that from "
             "hipDeviceGetAttribute for gpu %d\n", i);
    }
  }
  // Announce success only when no device disagreed.
  if (testResult) {
    printf("pciBusID output of both hipDeviceGetPCIBusId and"
           " hipDeviceGetAttribute matched for all gpus\n");
  }
  return testResult;
}
bool compareHipDeviceGetPCIBusIdWithLspci() {
FILE *fpipe;
bool testResult = false;
{
// Check if lspci is installed, if not, don't proceed
char const *cmd = "lspci --version";
char *lspciCheck;
char temp[20];
fpipe = popen(cmd, "r");
if (fpipe == nullptr) {
printf("Unable to create command file\n");
return testResult;
}
lspciCheck = fgets(temp, 20, fpipe);
pclose(fpipe);
if (!lspciCheck) {
printf("lspci not found. Skipping the test\n");
return true;
}
}
int deviceCount = 0;
HIPCHECK(hipGetDeviceCount(&deviceCount));
HIPASSERT(deviceCount != 0);
printf("No.of gpus in the system: %d\n", deviceCount);
char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
char pciDeviceList[deviceCount][MAX_DEVICE_LENGTH];
getPciBusId(deviceCount, hipDeviceList);
// Get lspci device list and compare with hip device list
#if defined(__CUDA_ARCH__)
char const *command = "lspci -D | grep controller | grep NVIDIA | "
"cut -d ' ' -f 1";
#else
char const *command = "lspci -D | grep controller | grep AMD/ATI | "
"cut -d ' ' -f 1";
#endif
fpipe = popen(command, "r");
if (fpipe == nullptr) {
printf("Unable to create command file\n");
return testResult;
}
int index = 0;
int deviceMatchCount = 0;
while (fgets(pciDeviceList[index], sizeof(pciDeviceList[index]), fpipe)) {
bool bMatchFound = false;
for (int deviceNo = 0; deviceNo < deviceCount; deviceNo++) {
if (!strncmp(pciDeviceList[index], hipDeviceList[deviceNo], 10)) {
deviceMatchCount++;
bMatchFound = true;
}
}
if (bMatchFound == false) {
printf("PCI device: %s is not reported by HIP\n", pciDeviceList[index]);
}
index++;
}
pclose(fpipe);
if (deviceMatchCount == deviceCount) {
printf("hip and lspci output for {pciDomainID, pciBusID, pciDeviceID} "
"matched for all gpus\n");
testResult = true;
} else {
printf("Mismatch in number GPUs reported by HIP with lscpi\n");
}
return testResult;
}
int main(int argc, char* argv[]) {
bool testResult = true;
HipTest::parseStandardArguments(argc, argv, true);
if (p_tests & 0x1) {
testResult &= comparePciBusIDWithHipDeviceGetAttribute();
}
if (p_tests & 0x2) {
#ifdef __unix__
testResult &= compareHipDeviceGetPCIBusIdWithLspci();
#else
printf("Detected non-linux OS. Skipping the test\n");
#endif
}
if (testResult) {
passed();
} else {
failed("one or more tests failed\n");
}
}
/*
* Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
/*
* Test to compare
* 1.pciBusID from hipDeviceGetPCIBusId and hipDeviceGetAttribute **
* 2.{pciDomainID, pciBusID, pciDeviceID} values hipDeviceGetPCIBusId vs lspci **
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
* TEST_NAMED: %t hipDeviceGetPCIBusId-vs-hipDeviceGetAttribute --tests 0x1
* TEST_NAMED: %t hipDeviceGetPCIBusId-vs-lspci --tests 0x2 EXCLUDE_HIP_PLATFORM nvcc
* HIT_END
*/
#include "test_common.h"
#define MAX_DEVICE_LENGTH 20
// Fills hipDeviceList[n] with the PCI bus id string that
// hipDeviceGetPCIBusId reports for device n, for n in [0, deviceCount).
// Each row provides MAX_DEVICE_LENGTH bytes. Always returns true; HIP errors
// are handled by HIPCHECK.
static bool getPciBusId(int deviceCount, char hipDeviceList[][MAX_DEVICE_LENGTH]) {
  int dev = 0;
  while (dev < deviceCount) {
    HIPCHECK(hipDeviceGetPCIBusId(hipDeviceList[dev], MAX_DEVICE_LENGTH, dev));
    ++dev;
  }
  return true;
}
// Compares, for every device, the bus number parsed from the string returned
// by hipDeviceGetPCIBusId with the value of hipDeviceAttributePciBusId.
//
// Returns true when the two agree for all devices, false if any device
// disagrees. The "matched" message is printed only in the former case.
bool comparePciBusIDWithHipDeviceGetAttribute() {
  bool testResult = true;
  int deviceCount = 0;
  HIPCHECK(hipGetDeviceCount(&deviceCount));
  HIPASSERT(deviceCount != 0);
  printf("No.of gpus in the system: %d\n", deviceCount);
  char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
  getPciBusId(deviceCount, hipDeviceList);
  for (int i = 0; i < deviceCount; i++) {
    // -1 sentinels: a field that sscanf fails to parse keeps its sentinel and
    // is then reported as a mismatch below.
    int pciBusID = -1;
    int pciDeviceID = -1;
    int pciDomainID = -1;
    int tempPciBusId = -1;
    // The id string is parsed as hexadecimal "domain:bus:device".
    sscanf(hipDeviceList[i], "%04x:%02x:%02x", &pciDomainID, &pciBusID,
           &pciDeviceID);
    HIPCHECK(hipDeviceGetAttribute(&tempPciBusId, hipDeviceAttributePciBusId, i));
    if (pciBusID != tempPciBusId) {
      testResult = false;
      printf("pciBusID from hipDeviceGetPCIBusId mismatched to that from "
             "hipDeviceGetAttribute for gpu %d\n", i);
    }
  }
  // Announce success only when no device disagreed.
  if (testResult) {
    printf("pciBusID output of both hipDeviceGetPCIBusId and"
           " hipDeviceGetAttribute matched for all gpus\n");
  }
  return testResult;
}
bool compareHipDeviceGetPCIBusIdWithLspci() {
FILE *fpipe;
bool testResult = false;
{
// Check if lspci is installed, if not, don't proceed
char const *cmd = "lspci --version";
char *lspciCheck;
char temp[20];
fpipe = popen(cmd, "r");
if (fpipe == nullptr) {
printf("Unable to create command file\n");
return testResult;
}
lspciCheck = fgets(temp, 20, fpipe);
pclose(fpipe);
if (!lspciCheck) {
printf("lspci not found. Skipping the test\n");
return true;
}
}
int deviceCount = 0;
HIPCHECK(hipGetDeviceCount(&deviceCount));
HIPASSERT(deviceCount != 0);
printf("No.of gpus in the system: %d\n", deviceCount);
char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
char pciDeviceList[deviceCount][MAX_DEVICE_LENGTH];
getPciBusId(deviceCount, hipDeviceList);
// Get lspci device list and compare with hip device list
#if defined(__CUDA_ARCH__)
char const *command = "lspci -D | grep controller | grep NVIDIA | "
"cut -d ' ' -f 1";
#else
char const *command = "lspci -D | grep controller | grep AMD/ATI | "
"cut -d ' ' -f 1";
#endif
fpipe = popen(command, "r");
if (fpipe == nullptr) {
printf("Unable to create command file\n");
return testResult;
}
int index = 0;
int deviceMatchCount = 0;
while (fgets(pciDeviceList[index], sizeof(pciDeviceList[index]), fpipe)) {
bool bMatchFound = false;
for (int deviceNo = 0; deviceNo < deviceCount; deviceNo++) {
if (!strncmp(pciDeviceList[index], hipDeviceList[deviceNo], 10)) {
deviceMatchCount++;
bMatchFound = true;
}
}
if (bMatchFound == false) {
printf("PCI device: %s is not reported by HIP\n", pciDeviceList[index]);
}
index++;
}
pclose(fpipe);
if (deviceMatchCount == deviceCount) {
printf("hip and lspci output for {pciDomainID, pciBusID, pciDeviceID} "
"matched for all gpus\n");
testResult = true;
} else {
printf("Mismatch in number GPUs reported by HIP with lscpi\n");
}
return testResult;
}
int main(int argc, char* argv[]) {
bool testResult = true;
HipTest::parseStandardArguments(argc, argv, true);
if (p_tests & 0x1) {
testResult &= comparePciBusIDWithHipDeviceGetAttribute();
}
if (p_tests & 0x2) {
#ifdef __unix__
testResult &= compareHipDeviceGetPCIBusIdWithLspci();
#else
printf("Detected non-linux OS. Skipping the test\n");
#endif
}
if (testResult) {
passed();
} else {
failed("one or more tests failed\n");
}
}
+1 -1
Просмотреть файл
@@ -25,7 +25,7 @@
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
* TEST_NAMED: %t hipSetGetDevice-invalidDevice
* TEST_NAMED: %t hipSetGetDevice-allValidDevice
* TEST_NAMED: %t hipSetGetDevice-validDev1 --computeDevCnt 1
+227
Просмотреть файл
@@ -0,0 +1,227 @@
/*
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <semaphore.h>
#include <unistd.h>
#include "test_common.h"
#ifdef __linux__
// Named POSIX semaphores used for the parent/child hand-shake in
// IpcMemHandleTest::Test(): the parent posts sem_ob1 and waits on sem_ob2,
// the child waits on sem_ob1 and posts sem_ob2.
sem_t *sem_ob1 = NULL, *sem_ob2 = NULL;
// Record stored in the mmap()ed region referenced by
// IpcMemHandleTest::shrd_mem and accessed by both processes.
typedef struct mem_handle {
// Device index the parent selected before allocating the exported buffer.
int device;
// IPC handle filled in by the parent and opened by the child.
hipIpcMemHandle_t memHandle;
// Starts as true; set to false when a semaphore call or data check fails.
bool IfTestPassed;
} hip_ipc_t;
// Test fixture for the hipIpcGetMemHandle / hipIpcOpenMemHandle exchange
// between a parent process and a forked child. The constructor sets up the
// semaphores, the shared record and the host buffers; Test() runs the
// exchange; the destructor releases the resources.
class IpcMemHandleTest {
public:
// Cleared by the constructor when any setup step fails; checked by Test().
bool InitFlag = true;
// Shared record obtained from mmap() in the constructor.
hip_ipc_t *shrd_mem = NULL;
// Return value of fork() in Test().
pid_t pid;
// Number of int elements per buffer and the matching size in bytes.
size_t N = 1024;
size_t Nbytes = N * sizeof(int);
// A_d: device buffer the parent exports; out: last sem_*() return value.
int *A_d = NULL, out = 0;
// Host buffers: A_h holds the reference pattern, C_h receives copied data.
int *A_h, *C_h;
int Num_devices = 0, Data_mismatch, CanAccessPeer = 0;
// Child side: Ad1 is the pointer returned by hipIpcOpenMemHandle(),
// Ad2 is a device buffer the child allocates itself.
int *Ad1 = NULL, *Ad2 = NULL;
IpcMemHandleTest();
bool Test();
~IpcMemHandleTest();
};
/**
 * Runs the IPC memory-handle exchange.
 *
 * Parent: for each device, allocates a buffer, exports its IPC handle through
 * the shared record, fills the buffer from A_h, wakes the child (sem_ob1) and
 * waits for it to finish (sem_ob2) before freeing the buffer.
 *
 * Child: for each handle received, opens it from every device, copies the
 * data where hipDeviceCanAccessPeer() reports access, and verifies that every
 * element equals 123. Failures are recorded in shrd_mem->IfTestPassed.
 *
 * @return true when initialization succeeded, the child exited normally with
 *         status 0 and no failure was recorded in the shared record.
 */
bool IpcMemHandleTest::Test() {
  if (InitFlag == false) {
    // Abort the test if the initialization fails
    printf("Resource initialization failed. Hence test skipped!");
    return false;
  }
  pid = fork();
  if (pid < 0) {
    // Without a child nobody would ever post sem_ob2, so bail out instead of
    // entering the parent path and blocking forever.
    printf("fork() call failed. Hence test aborted!\n");
    sem_unlink("/my-sem-object1");
    sem_unlink("/my-sem-object2");
    return false;
  }
  if (pid != 0) {
    // Parent process
    HIPCHECK(hipGetDeviceCount(&Num_devices));
    for (int i = 0; i < Num_devices; ++i) {
      if (shrd_mem->IfTestPassed == true) {
        HIPCHECK(hipSetDevice(i));
        HIPCHECK(hipMalloc(&A_d, Nbytes));
        HIPCHECK(hipIpcGetMemHandle((hipIpcMemHandle_t *) &shrd_mem->memHandle,
                                    A_d));
        HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
        shrd_mem->device = i;
        if ((out = sem_post(sem_ob1)) == -1) {
          // Need to use inline function to release resources.
          shrd_mem->IfTestPassed = false;
          failed("sem_post() call failed in parent process.");
        }
        if ((out = sem_wait(sem_ob2)) == -1) {
          shrd_mem->IfTestPassed = false;
          failed("sem_wait() call failed in parent process.");
        }
        HIPCHECK(hipFree(A_d));
        // The destructor frees A_d again; do not leave a dangling pointer.
        A_d = NULL;
      }
    }
  } else {
    // Child process
    HIPCHECK(hipGetDeviceCount(&Num_devices));
    for (int j = 0; j < Num_devices; ++j) {
      if ((out = sem_wait(sem_ob1)) == -1) {
        shrd_mem->IfTestPassed = false;
        printf("sem_wait() call failed in child process.");
        if ((out = sem_post(sem_ob2)) == -1) {
          printf("sem_post() call on sem_ob2 failed");
        }
        // No handle was handed over, so there is nothing valid to open.
        exit(1);
      }
      for (int i = 0; i < Num_devices; ++i) {
        Data_mismatch = 0;
        HIPCHECK(hipSetDevice(i));
        HIPCHECK(hipMalloc(&Ad2, Nbytes));
        HIPCHECK(hipIpcOpenMemHandle((void **) &Ad1, shrd_mem->memHandle,
                                     hipIpcMemLazyEnablePeerAccess));
        HIPCHECK(hipDeviceCanAccessPeer(&CanAccessPeer, i, shrd_mem->device));
        if (CanAccessPeer == 1) {
          HIPCHECK(hipMemcpy(Ad2, Ad1, Nbytes, hipMemcpyDeviceToDevice));
          // C_h is host memory, so this copy is device-to-host.
          HIPCHECK(hipMemcpy(C_h, Ad2, Nbytes, hipMemcpyDeviceToHost));
          for (size_t k = 0; k < N; ++k) {
            if (C_h[k] != 123)
              Data_mismatch++;
          }
          if (Data_mismatch != 0) {
            printf("Data mismatch found when data copied from Ipc memhandle");
            printf(" to Device: %d\n", i);
            shrd_mem->IfTestPassed = false;
          }
          memset(reinterpret_cast<void*>(C_h), 0, Nbytes);
          // Checking if the data obtained from Ipc shared memory is consistent
          Data_mismatch = 0;
          HIPCHECK(hipMemcpy(C_h, Ad1, Nbytes, hipMemcpyDeviceToHost));
          for (size_t k = 0; k < N; ++k) {
            if (C_h[k] != 123)
              Data_mismatch++;
          }
          if (Data_mismatch != 0) {
            printf("Data mismatch found when data copied from Ipc memhandle");
            printf(" Host.\n");
            shrd_mem->IfTestPassed = false;
          }
        }
        HIPCHECK(hipIpcCloseMemHandle(reinterpret_cast<void*>(Ad1)));
        // Ad2 is allocated once per inner iteration, so release it here
        // instead of once per outer iteration.
        HIPCHECK(hipFree(Ad2));
        Ad2 = NULL;
      }
      if ((out = sem_post(sem_ob2)) == -1) {
        shrd_mem->IfTestPassed = false;
        printf("sem_post() call on sem_ob2 failed");
        exit(1);
      }
      if (shrd_mem->IfTestPassed == false) {
        // The parent stops posting sem_ob1 once a failure is recorded;
        // waiting for another handle would block forever.
        break;
      }
    }
    exit(0);
  }
  if ((out = sem_unlink("/my-sem-object1")) == -1) {
    printf("sem_unlink() call on /my-sem-object1 failed");
  }
  if ((out = sem_unlink("/my-sem-object2")) == -1) {
    printf("sem_unlink() call on /my-sem-object2 failed");
  }
  int status = 0;
  if (waitpid(pid, &status, 0) == -1 || !WIFEXITED(status) ||
      WEXITSTATUS(status) != 0) {
    // A child that crashed or exited with an error cannot count as a pass.
    printf("Child process did not exit cleanly\n");
    return false;
  }
  return shrd_mem->IfTestPassed;
}
/**
 * Sets up everything Test() needs: removes stale semaphore files, creates the
 * two named semaphores (initial value 0), maps the shared record and
 * allocates the host buffers, filling A_h with the value 123.
 * Any failure clears InitFlag, which makes Test() return false immediately.
 */
IpcMemHandleTest::IpcMemHandleTest() {
  std::string cmd_line = "rm -rf /dev/shm/sem.my-sem-object*";
  int res = system(cmd_line.c_str());
  if (res == -1) {
    InitFlag = false;
    printf("System call to remove existing shared objects failed!");
  }
  if ((sem_ob1 = sem_open ("/my-sem-object1", O_CREAT|O_EXCL, 0660, 0)) ==
       SEM_FAILED) {
    InitFlag = false;
    printf("Initialization of 1st semaphore object failed");
  }
  if ((sem_ob2 = sem_open ("/my-sem-object2", O_CREAT|O_EXCL, 0660, 0)) ==
       SEM_FAILED) {
    InitFlag = false;
    printf("Initialization of 2nd semaphore object failed");
  }
  // mmap() reports failure with MAP_FAILED, not NULL; anonymous mappings
  // take -1 as the file descriptor.
  void *mapping = mmap(NULL, sizeof(hip_ipc_t), PROT_READ | PROT_WRITE,
                       MAP_SHARED | MAP_ANONYMOUS, -1, 0);
  if (mapping == MAP_FAILED) {
    InitFlag = false;
    shrd_mem = NULL;
    printf("mmap() call failed!");
  } else {
    shrd_mem = reinterpret_cast<hip_ipc_t *>(mapping);
    shrd_mem->IfTestPassed = true;
  }
  A_h = reinterpret_cast<int*>(malloc(Nbytes));
  C_h = reinterpret_cast<int*>(malloc(Nbytes));
  if (A_h == NULL || C_h == NULL) {
    InitFlag = false;
    printf("Host buffer allocation failed!");
    return;
  }
  for (size_t i = 0; i < N; i++) {
    A_h[i] = 123;
  }
}
// Releases the fixture's resources: unmaps the shared record, frees the host
// buffers with free() and passes the three device pointers to hipFree().
// NOTE(review): A_d is also freed inside Test(); confirm the pointer is not
// stale by the time this destructor runs.
IpcMemHandleTest::~IpcMemHandleTest() {
munmap(shrd_mem, sizeof(hip_ipc_t));
HIPCHECK(hipFree((A_d)));
free(A_h);
free(C_h);
HIPCHECK(hipFree((Ad1)));
HIPCHECK(hipFree((Ad2)));
}
#endif
/**
 * Entry point of the IPC memory-handle test.
 *
 * On Linux the program forks: the parent walks over every device, allocates
 * memory, publishes its hipIpcMemHandle through mmap()ed memory, wakes the
 * child with sem_post() and waits for the child to wake it again. The child
 * opens the handle with hipIpcOpenMemHandle(), performs device-to-device
 * copies on all available gpus, checks the data, closes the handle with
 * hipIpcCloseMemHandle() and then wakes the parent.
 * On other platforms the test is reported as skipped and treated as passed.
 */
int main() {
#ifdef __linux__
  IpcMemHandleTest obj;
  const bool IfTestPassed = obj.Test();
#else
  printf("This is not a Linux platform. Hence Skipping the test!\n");
  const bool IfTestPassed = true;
#endif
  if (!IfTestPassed) {
    failed("");
  }
  passed();
}
+487
Просмотреть файл
@@ -0,0 +1,487 @@
/*
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
Testcase Scenarios :
(TestCase 1)::
1) Test hipMalloc() api passing zero size and confirming *ptr returning
nullptr. Also pass nullptr to hipFree() api.
2) Pass maximum value of size_t for hipMalloc() api and make sure appropriate
error is returned.
3) Check for hipMalloc() error code, passing invalid/null pointer.
(TestCase 2)::
4) Regress hipMalloc()/hipFree() in loop for bigger chunk of allocation
with adequate number of iterations and later test for kernel execution on
default gpu.
5) Regress hipMalloc()/hipFree() in loop while allocating smaller chunks
keeping maximum number of iterations and then run kernel code on default
gpu, perform data validation.
(TestCase 3)::
6) Check hipMalloc() api adaptability when app creates small chunks of memory
continuously, stores it for later use and then frees it at later point
of time.
(TestCase 4)::
7) Run hipMalloc() api/kernel code on same gpu in parallel from parent and child
processes, validate the results.
(TestCase 5)::
8) Execute hipMalloc() api simultaneously on all the gpus by spawning multiple
child processes. Validate buffers allocated after running kernel code.
(TestCase 6)::
9) Multithread Scenario : Exercise hipMalloc() api in parallel on all gpus from
multiple threads and regress the api.
(TestCases 2, 3, 4, 5, 6)::
10) Validate memory usage with hipMemGetInfo() while regressing hipMalloc()
api. Check for any possible memory leaks.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
* TEST_NAMED: %t hipMalloc_ArgValidation --tests 1
* TEST_NAMED: %t hipMalloc_LoopRegression_AllocFreeCycle --tests 2
* TEST_NAMED: %t hipMalloc_LoopRegression_AllocPool --tests 3
* TEST_NAMED: %t hipMallocChild_Concurrency_DefaultGpu --tests 4
* TEST_NAMED: %t hipMallocChild_Concurrency_MultiGpu --tests 5
* TEST_NAMED: %t hipMalloc_MultiThreaded_MultiGpu --tests 6
* HIT_END
*/
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <iostream>
#include <vector>
#include <limits>
#include <atomic>
#include "test_common.h"
/* Max alloc/free iterations for bigger chunks */
#define MAX_ALLOCFREE_BC (10000)
/* Buffer size for alloc/free cycles */
#define BUFF_SIZE_AF (5*1024*1024)
/* Max alloc/free iterations for smaller chunks */
#define MAX_ALLOCFREE_SC (5000000)
/* Max alloc and pool iterations (TBD) */
#define MAX_ALLOCPOOL_ITER (2000000)
/**
 * Validates data consistency on the supplied gpu.
 *
 * Selects the device, runs HipTest::vectorADD on buffers created by
 * HipTest::initArrays(), checks the result with HipTest::checkVectorADD() and
 * finally compares hipMemGetInfo() before and after to detect leaked memory.
 *
 * @param gpu device index to run on
 * @return true when the vector check passed and the memory figures match
 */
bool validateMemoryOnGPU(int gpu) {
  bool TestPassed = true;
  const size_t Nbytes = N * sizeof(int);
  int *A_d, *B_d, *C_d;
  int *A_h, *B_h, *C_h;
  size_t prevAvl, prevTot;
  HIPCHECK(hipSetDevice(gpu));
  HIPCHECK(hipMemGetInfo(&prevAvl, &prevTot));
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
  HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
                     0, 0, static_cast<const int*>(A_d),
                     static_cast<const int*>(B_d), C_d, N);
  HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
  // A non-zero result from checkVectorADD() is treated as a failure here.
  if (HipTest::checkVectorADD(A_h, B_h, C_h, N)) {
    printf("%s : Validation FAILED for gpu %d from pid %d\n",
           __func__, gpu, getpid());
    TestPassed = false;
  } else {
    printf("Validation PASSED for gpu %d from pid %d\n", gpu, getpid());
  }
  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  size_t curAvl, curTot;
  HIPCHECK(hipMemGetInfo(&curAvl, &curTot));
  const bool memoryUnchanged = (prevAvl == curAvl) && (prevTot == curTot);
  if (!memoryUnchanged) {
    printf("%s : Memory allocation mismatch observed."
           "Possible memory leak.", __func__);
    TestPassed = false;
  }
  return TestPassed;
}
/**
 * Fetches the gpu device count.
 *
 * On Linux the count is obtained in a forked child, which calls
 * hipGetDeviceCount() and sends the value back over a pipe, after
 * ROCR_VISIBLE_DEVICES and HIP_VISIBLE_DEVICES have been removed from the
 * environment. On other platforms hipGetDeviceCount() is called directly.
 *
 * @param pdevCnt receives the device count; on Linux it falls back to 1 when
 *                the pipe, the fork or the transfer of the value fails.
 */
void getDeviceCount(int *pdevCnt) {
#ifdef __linux__
  int fd[2], val = 0;
  pid_t childpid;
  // Fallback used on every failure path below.
  *pdevCnt = 1;
  // disable visible_devices env from shell
  unsetenv("ROCR_VISIBLE_DEVICES");
  unsetenv("HIP_VISIBLE_DEVICES");
  // create pipe descriptors
  if (pipe(fd) == -1) {
    return;
  }
  childpid = fork();
  if (childpid > 0) {  // Parent
    close(fd[1]);
    // parent will wait to read the device cnt
    const ssize_t got = read(fd[0], &val, sizeof(val));
    // close the read-descriptor
    close(fd[0]);
    // wait for child exit
    waitpid(childpid, NULL, 0);
    // Only trust the value when it arrived completely.
    if (got == static_cast<ssize_t>(sizeof(val))) {
      *pdevCnt = val;
    }
  } else if (!childpid) {  // Child
    int devCnt = 1;
    // writing only, no need for read-descriptor
    close(fd[0]);
    HIPCHECK(hipGetDeviceCount(&devCnt));
    // send the value on the write-descriptor:
    const ssize_t put = write(fd[1], &devCnt, sizeof(devCnt));
    // close the write descriptor:
    close(fd[1]);
    exit(put == static_cast<ssize_t>(sizeof(devCnt)) ? 0 : 1);
  } else {  // failure
    // fork() failed: release both pipe ends before giving up.
    close(fd[0]);
    close(fd[1]);
    return;
  }
#else
  HIPCHECK(hipGetDeviceCount(pdevCnt));
#endif
}
/**
 * Regresses hipMalloc()/hipFree() in loops on the given gpu.
 *
 * Phase 1 repeats MAX_ALLOCFREE_BC allocations of BUFF_SIZE_AF bytes and
 * checks after each one that the free memory reported by hipMemGetInfo()
 * dropped by exactly that size; the phase stops at the first mismatch.
 * Phase 2 repeats MAX_ALLOCFREE_SC allocations of 16 bytes and checks that
 * the memory figures before and after the whole loop are identical.
 *
 * @param gpu device index to run on
 * @return true when neither phase detected a mismatch
 */
bool regressAllocInLoop(int gpu) {
  bool TestPassed = true;
  size_t tot, avail, ptot, pavail;
  int *ptr;
  HIPCHECK(hipSetDevice(gpu));
  // Exercise allocation in loop with bigger chunks
  for (int iter = 0; iter < MAX_ALLOCFREE_BC; ++iter) {
    const size_t numBytes = BUFF_SIZE_AF;
    HIPCHECK(hipMemGetInfo(&pavail, &ptot));
    HIPCHECK(hipMalloc(&ptr, numBytes));
    HIPCHECK(hipMemGetInfo(&avail, &tot));
    const bool sizeMismatch = (pavail-avail != numBytes);
    if (sizeMismatch) {
      printf("LoopAllocation : Memory allocation of %6.2fMB"
             "not matching with hipMemGetInfo - FAIL\n",
             numBytes/(1024.0*1024.0));
      TestPassed = false;
    }
    // The buffer is released on both the passing and the failing path.
    HIPCHECK(hipFree(ptr));
    if (sizeMismatch) {
      break;
    }
  }
  // Exercise allocation in loop with smaller chunks and max iters
  HIPCHECK(hipMemGetInfo(&pavail, &ptot));
  int remaining = MAX_ALLOCFREE_SC;
  while (remaining > 0) {
    const size_t numBytes = 16;
    HIPCHECK(hipMalloc(&ptr, numBytes));
    HIPCHECK(hipFree(ptr));
    --remaining;
  }
  HIPCHECK(hipMemGetInfo(&avail, &tot));
  const bool memoryUnchanged = (pavail == avail) && (ptot == tot);
  if (!memoryUnchanged) {
    printf("LoopAllocation : Memory allocation mismatch observed."
           "Possible memory leak.");
    TestPassed = false;
  }
  return TestPassed;
}
/*
 * Thread func to regress alloc and check data consistency.
 *
 * g_thTestPassed is shared by all worker threads. It is only ever written
 * with `false`: a load-modify-store such as `flag = flag & result` could
 * overwrite a failure recorded by another thread with a stale `true`.
 */
std::atomic<bool> g_thTestPassed(true);
void threadFunc(int gpu) {
  // Both checks always run, as before; only the way the shared flag is
  // updated differs.
  const bool regressOk = regressAllocInLoop(gpu);
  const bool validateOk = validateMemoryOnGPU(gpu);
  if (!(regressOk && validateOk)) {
    g_thTestPassed = false;
  }
  printf("thread execution status on gpu(%d) : %d\n", gpu, g_thTestPassed.load());
}
/**
 * Entry point of the hipMalloc test. The scenario is selected with
 * `--tests <n>`:
 *   1 argument validation, 2 alloc/free loop regression, 3 alloc-and-pool
 *   regression, 4 parent/child concurrency on gpu 0, 5 one child per gpu,
 *   6 one thread per gpu.
 * Scenarios 4 and 5 are only built on Linux and are reported as passed
 * elsewhere.
 */
int main(int argc, char* argv[]) {
  HipTest::parseStandardArguments(argc, argv, true);
  if (p_tests == 1) {  // Arg validation
    // Test hipMalloc for zero size
    bool TestPassed = true;
    // Deliberately left uninitialized: the test checks that hipMalloc()
    // itself resets the pointer.
    int *ptr;
    HIPCHECK(hipMalloc(&ptr, 0));
    // ptr expected to be reset to null ptr
    if (ptr) {
      printf("ArgValidation : Failed in zero size test\n");
      TestPassed &= false;
    }
    // Free null ptr
    HIPCHECK(hipFree(ptr));
    // Test hipMalloc for invalid arguments
    hipError_t ret;
    if ((ret = hipMalloc(NULL, 100)) != hipErrorInvalidValue) {
      printf("ArgValidation : Inappropritate error value returned"
             " for invalid argument. Error: '%s'(%d)\n",
             hipGetErrorString(ret), ret);
      TestPassed &= false;
    }
    // Test hipMalloc for Maximum value of size_t
    if ((ret = hipMalloc(&ptr, std::numeric_limits<std::size_t>::max()))
        != hipErrorMemoryAllocation) {
      printf("ArgValidation : Invalid error returned for max size_t."
             " Error: '%s'(%d)\n", hipGetErrorString(ret), ret);
      TestPassed &= false;
    }
    if (TestPassed) {
      passed();
    } else {
      failed("hipMalloc ArgumentValidation Failure!");
    }
  } else if (p_tests == 2) {  // Loop Regression Alloc/Free Cycle
    bool TestPassed = true;
    TestPassed &= regressAllocInLoop(0);
    TestPassed &= validateMemoryOnGPU(0);
    if (TestPassed) {
      passed();
    } else {
      failed("hipMalloc_LoopRegression_AllocFreeCycle Failure!");
    }
  } else if (p_tests == 3) {  // Loop Regression Alloc and Pool
    size_t avail, tot, pavail, ptot;
    bool TestPassed = true;
    hipError_t err;
    int *ptr;
    std::vector<int *> ptrlist;
    HIPCHECK(hipMemGetInfo(&pavail, &ptot));
    // Allocate small chunks of memory million times
    for (int i = 0; i < MAX_ALLOCPOOL_ITER; i++) {  // Iterations TBD
      if ((err = hipMalloc(&ptr, 10)) != hipSuccess) {
        HIPCHECK(hipMemGetInfo(&avail, &tot));
        printf("Loop regression pool allocation failure. "
               "Total gpu memory : %6.2fMB, Free memory %6.2fMB iter %d error '%s'\n",
               tot/(1024.0*1024.0), avail/(1024.0*1024.0), i, hipGetErrorString(err));
        TestPassed &= false;
        break;
      }
      // Store pointers allocated to emulate memory pool of app
      ptrlist.push_back(ptr);
    }
    // Free ptrs at later point of time
    for ( auto &t : ptrlist ) {
      HIPCHECK(hipFree(t));
    }
    HIPCHECK(hipMemGetInfo(&avail, &tot));
    TestPassed &= validateMemoryOnGPU(0);
    if ((pavail != avail) || (ptot != tot)) {
      printf("%s : Memory allocation mismatch observed. Possible memory leak.",
             __func__);
      TestPassed &= false;
    }
    if (TestPassed) {
      passed();
    } else {
      failed("hipMalloc_LoopRegression_AllocPool failure!");
    }
  } else if (p_tests == 4) {
    bool TestPassed = true;
#ifdef __linux__
    // Parallel execution of parent and child on gpu0
    int pid;
    if ((pid = fork()) < 0) {
      printf("Child_Concurrency_Gpu0 : fork() returned error %d.", pid);
      TestPassed &= false;
    } else if (!pid) {  // Child process
      bool TestPassedChild = true;
      TestPassedChild = validateMemoryOnGPU(0);
      if (TestPassedChild) {
        exit(0);  // child exit with success status
      } else {
        printf("Child_Concurrency_Gpu0 : childpid %d failed\n", getpid());
        exit(1);  // child exit with failure status
      }
    } else {  // Parent process
      int exitStatus = 0;
      TestPassed = validateMemoryOnGPU(0);
      pid = wait(&exitStatus);
      // WEXITSTATUS is only meaningful after a normal exit; a child killed
      // by a signal must count as a failure too.
      if ((pid < 0) || !WIFEXITED(exitStatus) || WEXITSTATUS(exitStatus))
        TestPassed &= false;
    }
#else
    printf("Test hipMallocChild_Concurrency_DefaultGpu skipped on non-linux\n");
#endif
    // TC scenarios specific to linux
    // are treated as pass in windows.
    if (TestPassed) {
      passed();
    } else {
      failed("hipMallocChild_Concurrency_DefaultGpu Failed!");
    }
  } else if (p_tests == 5) {
    bool TestPassed = true;
#ifdef __linux__
    // Parallel execution on multiple gpus from different child processes
    int devCnt = 1, pid = 0, cumStatus = 0;
    // Get GPU count
    getDeviceCount(&devCnt);
    // Spawn child for each GPU
    for (int gpu = 0; gpu < devCnt; gpu++) {
      if ((pid = fork()) < 0) {
        printf("Child_Concurrency_MultiGpu : fork() returned error %d\n", pid);
        failed("Test Failed!");
      } else if (!pid) {  // Child process
        bool TestPassedChild = true;
        TestPassedChild = validateMemoryOnGPU(gpu);
        if (TestPassedChild) {
          exit(0);  // child exit with success status
        } else {
          printf("Child_Concurrency_MultiGpu : childpid %d failed\n",
                 getpid());
          exit(1);  // child exit with failure status
        }
      }
    }
    // Parent shall wait for child to complete
    for (int i = 0; i < devCnt; i++) {
      int pidwait = 0, exitStatus = 0;
      pidwait = wait(&exitStatus);
      if (pidwait < 0) {
        TestPassed &= false;
        break;
      }
      // An abnormal termination is recorded as a failure, same as a
      // non-zero exit code.
      if (!WIFEXITED(exitStatus) || WEXITSTATUS(exitStatus)) {
        cumStatus |= 1;
      }
    }
    // Cummulative status of all child
    if (cumStatus) {
      TestPassed &= false;
    }
#else
    printf("Test hipMallocChild_Concurrency_MultiGpu skipped on non-linux\n");
#endif
    // TC scenarios specific to linux
    // are treated as pass in windows.
    if (TestPassed) {
      passed();
    } else {
      failed("hipMallocChild_Concurrency_MultiGpu Failed!");
    }
  } else if (p_tests == 6) {  // Multithreaded multiple gpu execution
    std::vector<std::thread> threadlist;
    int devCnt = 1;
    // Get GPU count
    getDeviceCount(&devCnt);
    for (int i = 0; i < devCnt; i++) {
      threadlist.push_back(std::thread(threadFunc, i));
    }
    for (auto &t : threadlist) {
      t.join();
    }
    if (g_thTestPassed) {
      passed();
    } else {
      failed("hipMalloc_MultiThreaded_MultiGpu Failed!");
    }
  } else {
    failed("Didnt receive any valid option. Try options 1 to 6\n");
  }
}
+423
Просмотреть файл
@@ -0,0 +1,423 @@
/*
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* Test 6 is disabled */
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
* TEST_NAMED: %t hipMallocManaged1 --tests 1
* TEST_NAMED: %t hipMallocManaged2 --tests 2
* TEST_NAMED: %t hipMallocManagedNegativeTests --tests 3
* TEST_NAMED: %t hipMallocManagedMultiChunkSingleDevice --tests 4
* TEST_NAMED: %t hipMallocManagedMultiChunkMultiDevice --tests 5 EXCLUDE_HIP_PLATFORM nvcc
* TEST_NAMED: %t hipMallocManagedOversubscription --tests 6 EXCLUDE_HIP_PLATFORM rocclr nvcc
* HIT_END
*/
#include <atomic>
#include "test_common.h"
#define N 1048576 // equals to (1024*1024)
#define INIT_VAL 123
/*
 * Kernel that writes Ad1[i] + Ad1[i] into Ad2[i] for every i below NUM_ELMTS.
 * Each thread starts at its global index (blockIdx.x * blockDim.x +
 * threadIdx.x) and advances by the total number of launched threads
 * (blockDim.x * gridDim.x).
 */
template <typename T>
__global__ void
vector_sum(T *Ad1, T *Ad2, size_t NUM_ELMTS) {
  const size_t step = blockDim.x * gridDim.x;
  size_t idx = (blockIdx.x * blockDim.x + threadIdx.x);
  while (idx < NUM_ELMTS) {
    Ad2[idx] = Ad1[idx] + Ad1[idx];
    idx += step;
  }
}
// The following Test case tests the following scenario:
// A large chunk of hipMallocManaged() memory(Hmm) is created
// Equal parts of Hmm is accessed on available gpus and
// kernel is launched on acessed chunk of hmm memory
// and checks if there are any inconsistencies or access issues
bool MultiChunkMultiDevice(int NumDevices) {
std::atomic<int> DataMismatch{0};
bool IfTestPassed = true;
int Counter = 0;
unsigned int NUM_ELMS = (1024 * 1024);
float *Ad[NumDevices], *Hmm = NULL, *Ah = new float[NUM_ELMS];
hipStream_t stream[NumDevices];
for (int Oloop = 0; Oloop < NumDevices; ++Oloop) {
HIPCHECK(hipSetDevice(Oloop));
HIPCHECK(hipMalloc(&Ad[Oloop], NUM_ELMS * sizeof(float)));
HIPCHECK(hipMemset(Ad[Oloop], 0, NUM_ELMS * sizeof(float)));
HIPCHECK(hipStreamCreate(&stream[Oloop]));
}
HIPCHECK(hipMallocManaged(&Hmm, (NumDevices * NUM_ELMS * sizeof(float))));
for (int i = 0; i < NumDevices; ++i) {
for (; Counter < ((i + 1) * NUM_ELMS); ++Counter) {
Hmm[Counter] = INIT_VAL + i;
}
}
const unsigned threadsPerBlock = 256;
const unsigned blocks = (NUM_ELMS + 255)/256;
for (int Klaunch = 0; Klaunch < NumDevices; ++Klaunch) {
vector_sum<float> <<<blocks, threadsPerBlock, 0, stream[Klaunch]>>>
(&Hmm[Klaunch * NUM_ELMS], Ad[Klaunch], NUM_ELMS);
}
HIPCHECK(hipDeviceSynchronize());
for (int m = 0; m < NumDevices; ++m) {
HIPCHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float),
hipMemcpyDeviceToHost));
for (int n = 0; n < NUM_ELMS; ++n) {
if (Ah[n] != ((INIT_VAL + m) * 2)) {
DataMismatch++;
}
}
memset(reinterpret_cast<void*>(Ah), 0, NUM_ELMS * sizeof(float));
}
if (DataMismatch.load() != 0) {
printf("MultiChunkMultiDevice: Mismatch observed!\n");
IfTestPassed = false;
}
for (int i = 0; i < NumDevices; ++i) {
HIPCHECK(hipFree(Ad[i]));
HIPCHECK(hipStreamDestroy(stream[i]));
}
HIPCHECK(hipFree(Hmm));
free(Ah);
return IfTestPassed;
}
// The following test case covers this scenario:
// A large chunk of hipMallocManaged() memory (Hmm) is created and split into
// four equal parts. One kernel per part is launched, each on its own stream,
// and the results are checked for inconsistencies or access issues.
//
// NumDevices: not used by this test; kept so all tests share one signature.
// Returns true when every output element equals twice its input value.
bool MultiChunkSingleDevice(int NumDevices) {
  (void)NumDevices;
  std::atomic<int> DataMismatch{0};
  int Chunks = 4;
  bool IfTestPassed = true;
  unsigned int NUM_ELMS = (1024 * 1024);
  float *Ad[Chunks], *Hmm = NULL, *Ah = new float[NUM_ELMS];
  hipStream_t stream[Chunks];
  for (int i = 0; i < Chunks; ++i) {
    HIPCHECK(hipMalloc(&Ad[i], NUM_ELMS * sizeof(float)));
    HIPCHECK(hipMemset(Ad[i], 0, NUM_ELMS * sizeof(float)));
    HIPCHECK(hipStreamCreate(&stream[i]));
  }
  HIPCHECK(hipMallocManaged(&Hmm, (Chunks * NUM_ELMS * sizeof(float))));
  // Chunk i of the managed buffer is filled with INIT_VAL + i.
  for (int i = 0; i < Chunks; ++i) {
    const size_t base = static_cast<size_t>(i) * NUM_ELMS;
    for (size_t n = 0; n < NUM_ELMS; ++n) {
      Hmm[base + n] = (INIT_VAL + i);
    }
  }
  const unsigned threadsPerBlock = 256;
  const unsigned blocks = (NUM_ELMS + 255)/256;
  for (int k = 0; k < Chunks; ++k) {
    vector_sum<float> <<<blocks, threadsPerBlock, 0, stream[k]>>>
                      (&Hmm[k * NUM_ELMS], Ad[k], NUM_ELMS);
  }
  HIPCHECK(hipDeviceSynchronize());
  for (int m = 0; m < Chunks; ++m) {
    HIPCHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float),
                       hipMemcpyDeviceToHost));
    for (unsigned int n = 0; n < NUM_ELMS; ++n) {
      if (Ah[n] != ((INIT_VAL + m) * 2)) {
        DataMismatch++;
      }
    }
  }
  if (DataMismatch.load() != 0) {
    printf("MultiChunkSingleDevice: Mismatch observed!\n");
    IfTestPassed = false;
  }
  for (int i = 0; i < Chunks; ++i) {
    HIPCHECK(hipFree(Ad[i]));
    HIPCHECK(hipStreamDestroy(stream[i]));
  }
  HIPCHECK(hipFree(Hmm));
  // Ah comes from new[], so it must be released with delete[].
  delete[] Ah;
  return IfTestPassed;
}
// The following tests oversubscription hipMallocManaged() api
// Currently disabled.
bool TestOversubscriptionMallocManaged(int NumDevices) {
bool IfTestPassed = true;
hipError_t err;
void *A = NULL;
size_t total = 0, free = 0;
HIPCHECK(hipMemGetInfo(&free, &total));
// ToDo: In case of HMM, memory over-subscription is allowed. Hence, relook
// into how out of memory can be tested.
// Demanding more mem size than available
err = hipMallocManaged(&A, (free +1), hipMemAttachGlobal);
if (hipErrorOutOfMemory != err) {
printf("hipMallocManaged: Returned %s for size value > device memory\n",
hipGetErrorString(err));
IfTestPassed = false;
}
return IfTestPassed;
}
// Negative testing of the hipMallocManaged() api: invalid pointer, size and
// flag combinations are passed in and the returned error code is compared
// with the expected one.
//
// NumDevices: not used by this test.
// Returns true when every call returned the expected error code.
bool NegativeTestsMallocManaged(int NumDevices) {
  bool allAsExpected = true;
  void *A = NULL;
  size_t total = 0, free = 0;
  HIPCHECK(hipMemGetInfo(&free, &total));

  // Null output pointer.
  hipError_t err = hipMallocManaged(NULL, 1024, hipMemAttachGlobal);
  if (err != hipErrorInvalidValue) {
    printf("hipMallocManaged: Returned %s when devPtr is null\n",
           hipGetErrorString(err));
    allAsExpected = false;
  }

  // Zero size.
  err = hipMallocManaged(&A, 0, hipMemAttachGlobal);
  if (err != hipErrorInvalidValue) {
    printf("hipMallocManaged: Returned %s when size is 0\n",
           hipGetErrorString(err));
    allAsExpected = false;
  }

  // Null output pointer together with zero size.
  err = hipMallocManaged(NULL, 0, hipMemAttachGlobal);
  if (err != hipErrorInvalidValue) {
    printf("hipMallocManaged: Returned %s when devPtr & size is null & 0\n",
           hipGetErrorString(err));
    allAsExpected = false;
  }

#ifdef __HIP_PLATFORM_HCC__
  // The flag hipMemAttachHost is currently not supported therefore
  // api should return "hipErrorInvalidValue" for now
  err = hipMallocManaged(&A, 1024, hipMemAttachHost);
  if (err != hipErrorInvalidValue) {
    printf("hipMallocManaged: Returned %s for 'hipMemAttachHost' flag\n",
           hipGetErrorString(err));
    allAsExpected = false;
  }
#endif  // __HIP_PLATFORM_HCC__

  // Null pointer, zero size and zero flags.
  err = hipMallocManaged(NULL, 0, 0);
  if (err != hipErrorInvalidValue) {
    printf("hipMallocManaged: Returned %s when params are null, 0, 0\n",
           hipGetErrorString(err));
    allAsExpected = false;
  }

  // Arbitrary numeric flag value.
  err = hipMallocManaged(&A, 1024, 145);
  if (err != hipErrorInvalidValue) {
    printf("hipMallocManaged: Returned %s when flag param is numerical 145\n",
           hipGetErrorString(err));
    allAsExpected = false;
  }

  // Negative size literal; here hipErrorOutOfMemory is the expected result.
  err = hipMallocManaged(&A, -10, hipMemAttachGlobal);
  if (err != hipErrorOutOfMemory) {
    printf("hipMallocManaged: Returned %s for negative size value.\n",
           hipGetErrorString(err));
    allAsExpected = false;
  }
  return allAsExpected;
}
// For every device: allocates two buffers with hipMallocManaged(), fills them
// on the host, launches vector_sum on the managed pointers directly and then
// validates the output on the host without any Memcpy.
//
// NumDevices: number of devices to iterate over.
// Returns true when no element of the output differs from twice its index.
template <typename T>
bool TestMallocManaged2(int NumDevices) {
  bool IfTestPassed = true;
  T *Hmm1 = NULL, *Hmm2 = NULL;
  const unsigned threadsPerBlock = 256;
  const unsigned blocks = (N + 255)/256;
  int i = 0;
  while (i < NumDevices) {
    HIPCHECK(hipSetDevice(i));
    std::atomic<int> DataMismatch{0};
    HIPCHECK(hipMallocManaged(&Hmm1, N * sizeof(T)));
    HIPCHECK(hipMallocManaged(&Hmm2, N * sizeof(T)));
    // Input holds the element index, output starts out zeroed.
    for (int idx = 0; idx < N; ++idx) {
      Hmm1[idx] = idx;
      Hmm2[idx] = 0;
    }
    // Kernel launch
    vector_sum <<<blocks, threadsPerBlock>>> (Hmm1, Hmm2, N);
    HIPCHECK(hipDeviceSynchronize());
    for (int idx = 0; idx < N; ++idx) {
      if (Hmm2[idx] != (idx + idx)) {
        DataMismatch.fetch_add(1);
      }
    }
    if (DataMismatch.load() != 0) {
      IfTestPassed = false;
    }
    HIPCHECK(hipFree(Hmm1));
    HIPCHECK(hipFree(Hmm2));
    ++i;
  }
  return IfTestPassed;
}
// In the following test, memory is created with hipMallocManaged() while one
// device is selected, and it is then verified to be accessible while every
// other device is selected. For each device pair the test performs a
// host-to-managed copy, a device-to-device transfer and a kernel launch to
// discover whether there are any access issues.
//
// NumDevices: number of devices to iterate over (outer and inner loop).
// Returns true when none of the three checks found a mismatch.
template <typename T>
bool TestMallocManaged1(int NumDevices) {
  std::atomic<unsigned int> DataMismatch{0};
  bool TestPassed = true;
  T *Ah1 = new T[N], *Ah2 = new T[N], *Ad = NULL, *Hmm = NULL;
  for (int i = 0; i < N; ++i) {
    Ah1[i] = INIT_VAL;
    Ah2[i] = 0;
  }
  for (int Oloop = 0; Oloop < NumDevices; ++Oloop) {
    DataMismatch = 0;
    HIPCHECK(hipSetDevice(Oloop));
    HIPCHECK(hipMallocManaged(&Hmm, N * sizeof(T)));
    for (int Iloop = 0; Iloop < NumDevices; ++Iloop) {
      HIPCHECK(hipSetDevice(Iloop));
      HIPCHECK(hipMalloc(&Ad, N * sizeof(T)));
      // Copy data from host to hipMallocMananged memory and verify
      HIPCHECK(hipMemcpy(Hmm, Ah1, N * sizeof(T), hipMemcpyHostToDevice));
      for (int v = 0; v < N; ++v) {
        if (Hmm[v] != INIT_VAL) {
          DataMismatch++;
        }
      }
      if (DataMismatch.load() != 0) {
        printf("Mismatch is observed with host data at device %d", Iloop);
        printf(" while hipMallocManaged memory set to the device %d\n", Oloop);
        TestPassed = false;
        DataMismatch = 0;
      }
      // Executing D2D transfer with hipMallocManaged memory and verify
      HIPCHECK(hipMemcpy(Ad, Hmm, N * sizeof(T), hipMemcpyDeviceToDevice));
      HIPCHECK(hipMemcpy(Ah2, Ad, N * sizeof(T), hipMemcpyDeviceToHost));
      for (int k = 0; k < N; ++k) {
        if (Ah2[k] != INIT_VAL) {
          DataMismatch++;
        }
      }
      if (DataMismatch.load() != 0) {
        printf("Mismatch is observed with D2D transfer at device %d\n", Iloop);
        printf(" while hipMallocManaged memory set to the device %d\n", Oloop);
        TestPassed = false;
        DataMismatch = 0;
      }
      HIPCHECK(hipMemset(Ad, 0, N * sizeof(T)));
      const unsigned threadsPerBlock = 256;
      const unsigned blocks = (N + 255)/256;
      // Launching the kernel to check if there is any access issue with
      // hipMallocManaged memory and local device's memory
      vector_sum <<<blocks, threadsPerBlock>>> (Hmm, Ad, N);
      // A failed synchronization must not be silently ignored.
      HIPCHECK(hipDeviceSynchronize());
      HIPCHECK(hipMemcpy(Ah2, Ad, N * sizeof(T), hipMemcpyDeviceToHost));
      for (int m = 0; m < N; ++m) {
        // The kernel doubles its input, which was INIT_VAL everywhere.
        if (Ah2[m] != (INIT_VAL + INIT_VAL)) {
          DataMismatch++;
        }
      }
      if (DataMismatch.load() != 0) {
        printf("Data Mismatch observed after kernel lch device %d\n", Iloop);
        TestPassed = false;
        DataMismatch = 0;
      }
      HIPCHECK(hipFree(Ad));
    }
    HIPCHECK(hipFree(Hmm));
  }
  // Both host buffers come from new[], so release them with delete[].
  delete[] Ah1;
  delete[] Ah2;
  return TestPassed;
}
// Test driver: runs the scenario selected by p_tests (presumably populated by
// HipTest::parseStandardArguments - confirm in test_common).
//   1 - TestMallocManaged1 for float, int, unsigned char and double
//   2 - TestMallocManaged2<float>
//   3 - NegativeTestsMallocManaged
//   4 - MultiChunkSingleDevice
//   5 - MultiChunkMultiDevice
//   6 - TestOversubscriptionMallocManaged
// Any other value is rejected through failed().
int main(int argc, char* argv[]) {
  HipTest::parseStandardArguments(argc, argv, true);
  // The upper bound must include 6; with the previous bound of 5 the
  // oversubscription scenario below could never be reached.
  if ((p_tests <= 0) || (p_tests > 6)) {
    failed("Valid arguments are from 1 to 6");
  }
  int NumDevices = 0;
  HIPCHECK(hipGetDeviceCount(&NumDevices));
  bool TestStatus = true, OverAllStatus = true;
  if (p_tests == 1) {
    // Run every datatype before failing so that all failures are reported.
    TestStatus = TestMallocManaged1<float>(NumDevices);
    if (!TestStatus) {
      printf("Test Failed with float datatype.\n");
      OverAllStatus = false;
    }
    TestStatus = TestMallocManaged1<int>(NumDevices);
    if (!TestStatus) {
      printf("Test Failed with int datatype.\n");
      OverAllStatus = false;
    }
    TestStatus = TestMallocManaged1<unsigned char>(NumDevices);
    if (!TestStatus) {
      printf("Test Failed with unsigned char datatype.\n");
      OverAllStatus = false;
    }
    TestStatus = TestMallocManaged1<double>(NumDevices);
    if (!TestStatus) {
      printf("Test Failed with double datatype.\n");
      OverAllStatus = false;
    }
    if (!OverAllStatus) {
      failed("");
    }
  }
  if (p_tests == 2) {
    TestStatus = TestMallocManaged2<float>(NumDevices);
    if (!TestStatus) {
      failed("Test Failed with float datatype.");
    }
  }
  if (p_tests == 3) {
    TestStatus = NegativeTestsMallocManaged(NumDevices);
    if (!TestStatus) {
      failed("Negative Tests with hipMallocManaged() failed!.");
    }
  }
  if (p_tests == 4) {
    TestStatus = MultiChunkSingleDevice(NumDevices);
    if (!TestStatus) {
      failed("hipMallocManaged: MultiChunkSingleDevice test failed!");
    }
  }
  if (p_tests == 5) {
    TestStatus = MultiChunkMultiDevice(NumDevices);
    if (!TestStatus) {
      failed("hipMallocManaged: MultiChunkMultiDevice test failed!");
    }
  }
  if (p_tests == 6) {
    TestStatus = TestOversubscriptionMallocManaged(NumDevices);
    if (!TestStatus) {
      failed("hipMallocManaged: TestOversubscriptionMallocManaged failed!");
    }
  }
  passed();
}
+3
Просмотреть файл
@@ -75,6 +75,9 @@ int main() {
HIPCHECK(hipFree(Z_d));
} else {
std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
if (hip_skip_tests_enabled()) {
return hip_skip_retcode();
}
}
}
+3
Просмотреть файл
@@ -81,6 +81,9 @@ int main() {
HIPCHECK(hipFree(Z_d));
} else {
std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
if (hip_skip_tests_enabled()) {
return hip_skip_retcode();
}
}
}
+3
Просмотреть файл
@@ -77,6 +77,9 @@ int main() {
HIPCHECK(hipFree(Z_d));
} else {
std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
if (hip_skip_tests_enabled()) {
return hip_skip_retcode();
}
}
}
passed();
+3
Просмотреть файл
@@ -83,6 +83,9 @@ int main() {
HIPCHECK(hipFree(Z_d));
} else {
std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
if (hip_skip_tests_enabled()) {
return hip_skip_retcode();
}
}
}
+1 -1
Просмотреть файл
@@ -24,7 +24,7 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
* TEST: %t
* HIT_END
*/
+1 -1
Просмотреть файл
@@ -24,7 +24,7 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
* TEST: %t
* HIT_END
*/
+1 -1
Просмотреть файл
@@ -20,7 +20,7 @@
// Test for hipMemset2D functionality for different width and height values
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
* TEST_NAMED: %t hipMemset2D-basic
* TEST_NAMED: %t hipMemset2D-dim1 --width2D 10 --height2D 10 --memsetWidth 4 --memsetHeight 4
* TEST_NAMED: %t hipMemset2D-dim2 --width2D 100 --height2D 100 --memsetWidth 20 --memsetHeight 40
+1 -1
Просмотреть файл
@@ -21,7 +21,7 @@
// and also launch hipMemcpyAsync() api on the same stream. This test case simulates the scenario
// reported in SWDEV-181598.
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc
* TEST: %t
* HIT_END
*/
+1 -1
Просмотреть файл
@@ -21,7 +21,7 @@
// and also launch hipMemcpyAsync() api. This test case simulates the scenario
// reported in SWDEV-181598.
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc
* TEST: %t
* HIT_END
*/
+27
Просмотреть файл
@@ -0,0 +1,27 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip/hip_runtime_api.h>
#include "test_common.h"
int main() {
hipSharedMemConfig_t config;
HIP_PRINT_STATUS(hipFuncSetSharedMemConfig(NULL));
HIP_PRINT_STATUS(hipFuncSetSharedMemConfig(&config));
}

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше