Merge branch 'master' into amd-master-next

Change-Id: Ib7e7824073f4dfc391fb3833fc90e11b327d3c22
This commit is contained in:
Laurent Morichetti
2020-02-05 14:56:08 -08:00
34 changed files with 667 additions and 919 deletions
+1
View File
@@ -25,6 +25,7 @@ endmacro()
# Setup version information
#############################
# Determine HIP_BASE_VERSION
set(ENV{HIP_PATH} "")
execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bin/hipconfig --version
OUTPUT_VARIABLE HIP_BASE_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE)
Vendored
+1 -1
View File
@@ -394,7 +394,7 @@ cuda_10_x:
// Block of string constants customizing behavior for cuda
String nvcc_ver = 'cuda-10.x'
String from_image = 'ci_test_nodes/cuda-10.x/ubuntu-16.04:latest'
String inside_args = '--runtime=nvidia';
String inside_args = '--gpus all';
// Checkout source code, dependencies and version files
String source_hip_rel = checkout_and_version( nvcc_ver )
+16 -15
View File
@@ -6,7 +6,7 @@ Key features include:
* HIP is very thin and has little or no performance impact over coding directly in CUDA or hcc "HC" mode.
* HIP allows coding in a single-source C++ programming language including features such as templates, C++11 lambdas, classes, namespaces, and more.
* HIP allows developers to use the "best" development environment and tools on each target platform.
* The "hipify" tool automatically converts source from CUDA to HIP.
* The [HIPIFY](hipify-clang/README.md) tools automatically convert source from CUDA to HIP.
* Developers can specialize for the platform (CUDA or hcc) to tune for performance or handle tricky cases
New projects can be developed directly in the portable HIP C++ language and can run on either NVIDIA or AMD platforms. Additionally, HIP provides porting tools which make it easy to port existing CUDA codes to the HIP layer, with no loss of performance as compared to the original CUDA application. HIP is not intended to be a drop-in replacement for CUDA, and developers should expect to do some manual coding and performance tuning work to complete the port.
@@ -36,7 +36,7 @@ HIP releases are typically of two types. The tag naming convention is different
- [HIP Profiling ](docs/markdown/hip_profiling.md)
- [HIP Debugging](docs/markdown/hip_debugging.md)
- [HIP Terminology](docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenCL)
- [hipify-clang](hipify-clang/README.md)
- [HIPIFY](hipify-clang/README.md)
- Supported CUDA APIs:
* [Runtime API](docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md)
* [Driver API](docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md)
@@ -71,7 +71,7 @@ hipLaunchKernel(vector_square, /* compute kernel*/
dim3(blocks), dim3(threadsPerBlock), 0/*dynamic shared*/, 0/*stream*/, /* launch config*/
C_d, A_d, N); /* arguments to the compute kernel */
hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost);
hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost);
```
@@ -88,7 +88,7 @@ __global__ void
vector_square(T *C_d, const T *A_d, size_t N)
{
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
size_t stride = hipBlockDim_x * hipGridDim_x;
for (size_t i=offset; i<N; i+=stride) {
C_d[i] = A_d[i] * A_d[i];
@@ -100,12 +100,12 @@ The HIP Runtime API code and compute kernel definition can exist in the same sou
## HIP Portability and Compiler Technology
HIP C++ code can be compiled with either :
- On the NVIDIA CUDA platform, HIP provides header file which translate from the HIP runtime APIs to CUDA runtime APIs. The header file contains mostly inlined
- On the NVIDIA CUDA platform, HIP provides header file which translate from the HIP runtime APIs to CUDA runtime APIs. The header file contains mostly inlined
functions and thus has very low overhead - developers coding in HIP should expect the same performance as coding in native CUDA. The code is then
compiled with nvcc, the standard C++ compiler provided with the CUDA SDK. Developers can use any tools supported by the CUDA SDK including the CUDA
profiler and debugger.
- On the AMD ROCm platform, HIP provides a header and runtime library built on top of hcc compiler. The HIP runtime implements HIP streams, events, and memory APIs,
and is a object library that is linked with the application. The source code for all headers and the library implementation is available on GitHub.
- On the AMD ROCm platform, HIP provides a header and runtime library built on top of hcc compiler. The HIP runtime implements HIP streams, events, and memory APIs,
and is a object library that is linked with the application. The source code for all headers and the library implementation is available on GitHub.
HIP developers on ROCm can use AMD's CodeXL for debugging and profiling.
Thus HIP source code can be compiled to run on either platform. Platform-specific features can be isolated to a specific platform using conditional compilation. Thus HIP
@@ -114,7 +114,7 @@ provides source portability to either platform. HIP provides the _hipcc_ compi
## Examples and Getting Started:
* A sample and [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that uses hipify to convert a simple app from CUDA to HIP:
* A sample and [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that uses any of [HIPIFY](hipify-clang/README.md) tools to convert a simple app from CUDA to HIP:
```shell
@@ -136,19 +136,20 @@ The GitHub repository [HIP-Examples](https://github.com/ROCm-Developer-Tools/HIP
The README with the procedures and tips the team used during this porting effort is here: [Rodinia Porting Guide](https://github.com/ROCm-Developer-Tools/HIP-Examples/blob/master/rodinia_3.0/hip/README.hip_porting)
## Tour of the HIP Directories
* **include**:
* **include**:
* **hip_runtime_api.h** : Defines HIP runtime APIs and can be compiled with many standard Linux compilers (hcc, GCC, ICC, CLANG, etc), in either C or C++ mode.
* **hip_runtime.h** : Includes everything in hip_runtime_api.h PLUS hipLaunchKernel and syntax for writing device kernels and device functions. hip_runtime.h can only be compiled with hcc.
* **hcc_detail/**** , **nvcc_detail/**** : Implementation details for specific platforms. HIP applications should not include these files directly.
* **hcc_detail/**** , **nvcc_detail/**** : Implementation details for specific platforms. HIP applications should not include these files directly.
* **hcc.h** : Includes interop APIs for HIP and HCC
* **bin**: Tools and scripts to help with hip porting
* **hipify** : Tool to convert CUDA code to portable CPP. Converts CUDA APIs and kernel builtins.
* **hipcc** : Compiler driver that can be used to replace nvcc in existing CUDA code. hipcc will call nvcc or hcc depending on platform, and include appropriate platform-specific headers and libraries.
* **hipconfig** : Print HIP configuration (HIP_PATH, HIP_PLATFORM, CXX config flags, etc)
* **hipexamine.sh** : Script to scan directory, find all code, and report statistics on how much can be ported with HIP (and identify likely features not yet supported)
* **hipify-perl** : Script based tool to convert CUDA code to portable CPP. Converts CUDA APIs and kernel builtins.
* **hipcc** : Compiler driver that can be used to replace nvcc in existing CUDA code. hipcc will call nvcc or hcc depending on platform and include appropriate platform-specific headers and libraries.
* **hipconfig** : Print HIP configuration (HIP_PATH, HIP_PLATFORM, CXX config flags, etc.)
* **hipexamine-perl.sh** : Script to scan the directory, find all code, and report statistics on how much can be ported with HIP (and identify likely features not yet supported).
* **hipconvertinplace-perl.sh** : Script to scan the directory, find all code, and convert the found CUDA code to HIP reporting all unconverted things.
* **doc**: Documentation - markdown and doxygen info
* **doc**: Documentation - markdown and doxygen info.
## Reporting an issue
Use the [GitHub issue tracker](https://github.com/ROCm-Developer-Tools/HIP/issues).
+16 -19
View File
@@ -115,10 +115,8 @@ install(
PATTERN "complex"
PATTERN "new"
PATTERN "ppc_wrappers" EXCLUDE
PATTERN "openmp_wrappers" EXCLUDE
)
PATTERN "openmp_wrappers" EXCLUDE)
# Package: hipify-clang
if (UNIX)
set(BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/packages/hipify-clang)
configure_file(packaging/hipify-clang.txt ${BUILD_DIR}/CMakeLists.txt @ONLY)
@@ -132,8 +130,8 @@ if (UNIX)
file(GENERATE OUTPUT ${PROJECT_BINARY_DIR}/fixnames
CONTENT "pwd; for i in *.deb; do mv \"\$i\" \"\${i/.deb/-amd64.deb}\" ; done
for i in *.rpm ; do mv \$i \${i/.rpm/.x86_64.rpm} ; done
")
for i in *.rpm ; do mv \$i \${i/.rpm/.x86_64.rpm} ; done")
add_custom_target(package_hipify-clang
COMMAND bash ${PROJECT_BINARY_DIR}/fixnames
WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
@@ -155,45 +153,44 @@ if (HIPIFY_CLANG_TESTS)
require_program(lit)
require_program(FileCheck)
# Populates CUDA_TOOLKIT_ROOT_DIR, which is then applied to the lit config to give the
# value of --cuda-path for the test runs.
find_package(CUDA REQUIRED)
if ((CUDA_VERSION VERSION_LESS "7.0") OR (LLVM_PACKAGE_VERSION VERSION_LESS "3.8") OR
(CUDA_VERSION VERSION_GREATER "7.5" AND LLVM_PACKAGE_VERSION VERSION_LESS "4.0") OR
(CUDA_VERSION VERSION_GREATER "8.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "6.0") OR
(CUDA_VERSION VERSION_GREATER "9.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "7.0") OR
(CUDA_VERSION VERSION_GREATER "9.2" AND LLVM_PACKAGE_VERSION VERSION_LESS "8.0") OR
(CUDA_VERSION VERSION_GREATER "10.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "9.0"))
message(SEND_ERROR "CUDA ${CUDA_VERSION} is not supported by clang ${LLVM_PACKAGE_VERSION}.")
(CUDA_VERSION VERSION_GREATER "10.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "9.0") OR
(CUDA_VERSION VERSION_GREATER "10.1" AND LLVM_PACKAGE_VERSION VERSION_LESS "10.0"))
message(SEND_ERROR "CUDA ${CUDA_VERSION} is not supported by LLVM ${LLVM_PACKAGE_VERSION}.")
if (CUDA_VERSION_MAJOR VERSION_LESS "7")
message(STATUS "Please install CUDA 7.0 or higher.")
elseif (CUDA_VERSION_MAJOR VERSION_LESS "8")
message(STATUS "Please install clang 3.8 or higher.")
message(STATUS "Please install LLVM + clang 3.8 or higher.")
elseif (CUDA_VERSION_MAJOR VERSION_LESS "9")
message(STATUS "Please install clang 4.0 or higher.")
message(STATUS "Please install LLVM + clang 4.0 or higher.")
elseif (CUDA_VERSION VERSION_EQUAL "9.0")
message(STATUS "Please install clang 6.0 or higher.")
message(STATUS "Please install LLVM + clang 6.0 or higher.")
elseif (CUDA_VERSION_MAJOR VERSION_LESS "10")
message(STATUS "Please install clang 7.0 or higher.")
message(STATUS "Please install LLVM + clang 7.0 or higher.")
elseif (CUDA_VERSION VERSION_EQUAL "10.0")
message(STATUS "Please install clang 8.0 or higher.")
message(STATUS "Please install LLVM + clang 8.0 or higher.")
elseif (CUDA_VERSION VERSION_EQUAL "10.1")
message(STATUS "Please install clang 9.0 or higher.")
message(STATUS "Please install LLVM + clang 9.0 or higher.")
elseif (CUDA_VERSION VERSION_EQUAL "10.2")
message(STATUS "Please install LLVM + clang 10.0 or higher.")
endif()
endif()
configure_file(
${CMAKE_CURRENT_LIST_DIR}/../tests/hipify-clang/lit.site.cfg.in
${CMAKE_CURRENT_BINARY_DIR}/tests/hipify-clang/lit.site.cfg
@ONLY
)
@ONLY)
add_lit_testsuite(test-hipify "Running HIPify regression tests"
${CMAKE_CURRENT_LIST_DIR}/../tests/hipify-clang
PARAMS site_config=${CMAKE_CURRENT_BINARY_DIR}/tests/hipify-clang/lit.site.cfg
ARGS -v
DEPENDS hipify-clang
)
DEPENDS hipify-clang)
add_custom_target(test-hipify-clang)
add_dependencies(test-hipify-clang test-hipify)
+153 -91
View File
@@ -1,42 +1,51 @@
# hipify-clang
`hipify-clang` is a clang-based tool to translate CUDA source code into portable HIP C++ automatically.
# HIPIFY
### Tools to translate CUDA source code into portable HIP C++ automatically
## Table of Contents
<!-- toc -->
- [Supported CUDA APIs](#cuda-apis)
- [Dependencies](#dependencies)
- [Build and install](#build-and-install)
- [hipify-clang](#clang)
* [Dependencies](#dependencies)
* [Usage](#hipify-clang-usage)
* [Building](#building)
* [Testing](#testing)
* [Linux](#linux)
* [Windows](#windows)
- [Running and using hipify-clang](#running-and-using-hipify-clang)
* [hipify-perl](#perl)
- [hipify-perl](#perl)
* [Usage](#hipify-perl-usage)
* [Building](#hipify-perl-building)
- [Supported CUDA APIs](#cuda-apis)
- [Disclaimer](#disclaimer)
<!-- tocstop -->
## <a name="cuda-apis"></a> Supported CUDA APIs
## <a name="clang"></a> hipify-clang
- [Runtime API](../docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md)
- [Driver API](../docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md)
- [cuComplex API](../docs/markdown/cuComplex_API_supported_by_HIP.md)
- [cuBLAS](../docs/markdown/CUBLAS_API_supported_by_HIP.md)
- [cuRAND](../docs/markdown/CURAND_API_supported_by_HIP.md)
- [cuDNN](../docs/markdown/CUDNN_API_supported_by_HIP.md)
- [cuFFT](../docs/markdown/CUFFT_API_supported_by_HIP.md)
- [cuSPARSE](../docs/markdown/CUSPARSE_API_supported_by_HIP.md)
`hipify-clang` is a clang-based tool for translation CUDA sources into HIP sources.
It translates CUDA source into an abstract syntax tree, which is being traversed by transformation matchers.
After applying all the matchers, the output HIP source is produced.
## <a name="dependencies"></a> Dependencies
**Advantages:**
1. It is a translator; thus, any even very complicated constructs will be parsed successfully, or an error will be reported.
2. It supports clang options like [`-I`](https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-i-dir), [`-D`](https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-d-macro), [`--cuda-path`](https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-cuda-path), etc.
3. Seamless support of new CUDA versions as it is clang's responsibility.
4. Ease in support.
**Disadvantages:**
1. The main advantage is also the main disadvantage: the input CUDA code should be correct; incorrect code wouldn't be translated to HIP.
2. CUDA should be installed and provided in case of multiple installations by `--cuda-path` option.
3. All the includes and defines should be provided to transform code successfully.
### <a name="dependencies"></a> hipify-clang: dependencies
`hipify-clang` requires:
1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1).
1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1), the latest release candidate: [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1).
2. [**CUDA**](https://developer.nvidia.com/cuda-downloads) of at least version [7.0](https://developer.nvidia.com/cuda-toolkit-70), the latest supported version is [**10.1 Update 2**](https://developer.nvidia.com/cuda-10.1-download-archive-base).
To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-downloads) please use the latest `LLVM` release candidate: [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1).
| **LLVM release version** | **CUDA latest supported version** | **Windows** | **Linux** |
|:----------------------------------------------------------:|:------------------------------------------------------------------------:|:-----------:|:---------:|
@@ -58,21 +67,41 @@
| [8.0.1](http://releases.llvm.org/download.html#8.0.1) | [10.0](https://developer.nvidia.com/cuda-10.0-download-archive) | - <br/> not working due to <br/> the clang's bug [38811](https://bugs.llvm.org/show_bug.cgi?id=38811) <br/>+<br/>[patch](patches/patch_for_clang_8.0.1_bug_38811.zip)*</br> | + |
| [9.0.0](http://releases.llvm.org/download.html#9.0.0) | [10.1](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + | + |
| [**9.0.1**](http://releases.llvm.org/download.html#9.0.1) | [**10.1**](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + <br/> **LATEST STABLE RELEASE** | + <br/> **LATEST STABLE RELEASE** |
| [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1) | [10.2](https://developer.nvidia.com/cuda-downloads) | + | + |
`*` Download the patch and unpack it into your LLVM distributive directory; a few header files will be overwritten; rebuilding of LLVM is not needed.
`*` Download the patch and unpack it into your `LLVM` distributive directory; a few header files will be overwritten; rebuilding of `LLVM` is not needed.
In most cases, you can get a suitable version of LLVM+CLANG with your package manager.
In most cases, you can get a suitable version of `LLVM+CLANG` with your package manager.
Failing that or having multiple versions of LLVM, you can [download a release archive](http://releases.llvm.org/), build or install it, and set
Failing that or having multiple versions of `LLVM`, you can [download a release archive](http://releases.llvm.org/), build or install it, and set
[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.1\dist`
## <a name="build-and-install"></a> Build and install
### <a name="hipify-clang-usage"></a> hipify-clang: usage
### <a name="building"></a> Build
To process a file, `hipify-clang` needs access to the same headers that would be required to compile it with clang.
For example:
```shell
./hipify-clang square.cu --cuda-path=/usr/local/cuda-10.1 -I /usr/local/cuda-10.1/samples/common/inc
```
`hipify-clang` arguments are given first, followed by a separator `'--'`, and then the arguments you'd pass to `clang` if you
were compiling the input file. For example:
```bash
./hipify-clang cpp17.cu --cuda-path=/usr/local/cuda-10.1 -- -std=c++17
```
The [Clang manual for compiling CUDA](https://llvm.org/docs/CompileCudaWithLLVM.html#compiling-cuda-code) may be useful.
For a list of `hipify-clang` options, run `hipify-clang --help`.
### <a name="building"></a> hipify-clang: building
Assuming this repository is at `./HIP`:
```shell
```bash
cd hipify-clang
mkdir build dist
cd build
@@ -84,30 +113,27 @@ cmake \
make -j install
```
On Windows, the following option should be specified for `cmake` at first place: `-G "Visual Studio 16 2019 Win64"`; the generated `hipify-clang.sln` should be built by `Visual Studio 15 2017` instead of `make.`
On Windows, the following option should be specified for `cmake` at first place: `-G "Visual Studio 16 2019 Win64"`; the generated `hipify-clang.sln` should be built by `Visual Studio 16 2019` instead of `make.`
Please, see [hipify-clang: Windows](#windows) for the supported tools for building.
Debug build type `-DCMAKE_BUILD_TYPE=Debug` is also supported and tested; `LLVM+CLANG` should be built in `Debug` mode as well.
64-bit build mode (`-Thost=x64` on Windows) is also supported; `LLVM+CLANG` should be built in 64-bit mode as well.
The binary can then be found at `./dist/bin/hipify-clang`.
### <a name="testing"></a> Testing
### <a name="testing"></a> hipify-clang: testing
`hipify-clang` has unit tests using LLVM [`lit`](https://llvm.org/docs/CommandGuide/lit.html)/[`FileCheck`](https://llvm.org/docs/CommandGuide/FileCheck.html).
`hipify-clang` has unit tests using `LLVM` [`lit`](https://llvm.org/docs/CommandGuide/lit.html)/[`FileCheck`](https://llvm.org/docs/CommandGuide/FileCheck.html).
**LLVM+CLANG should be built from sources, pre-built binaries are not exhaustive for testing.**
`LLVM+CLANG` should be built from sources, pre-built binaries are not exhaustive for testing.
**LLVM 9.0.1 or older:**
To run it:
1. download [`LLVM`](http://releases.llvm.org/9.0.1/llvm-9.0.1.src.tar.xz)+[`CLANG`](http://releases.llvm.org/9.0.1/cfe-9.0.1.src.tar.xz) sources;
2. build [`LLVM+CLANG`](http://llvm.org/docs/CMake.html):
```shell
cd llvm
mkdir build dist
cd build
```
2. build [`LLVM+CLANG`](http://releases.llvm.org/9.0.0/docs/CMake.html):
- **Linux**:
```shell
**Linux**:
```bash
cmake \
-DCMAKE_INSTALL_PREFIX=../dist \
-DLLVM_SOURCE_DIR=../llvm \
@@ -116,8 +142,7 @@ To run it:
../llvm
make -j install
```
- **Windows**:
**Windows**:
```shell
cmake \
-G "Visual Studio 16 2019" \
@@ -129,9 +154,38 @@ To run it:
-Thost=x64 \
../llvm
```
Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build project `INSTALL`.
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build project `INSTALL`.
**LLVM 10.0.0 or newer:**
1. download [`LLVM project`](https://github.com/llvm/llvm-project/archive/llvmorg-10.0.0-rc1.tar.gz) sources;
2. build [`LLVM project`](http://llvm.org/docs/CMake.html):
**Linux**:
```bash
cmake \
-DCMAKE_INSTALL_PREFIX=../dist \
-DLLVM_SOURCE_DIR=../llvm-project \
-DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \
-DLLVM_ENABLE_PROJECTS="clang" \
-DCMAKE_BUILD_TYPE=Release \
../llvm-project/llvm
make -j install
```
**Windows**:
```shell
cmake \
-G "Visual Studio 16 2019" \
-A x64 \
-DCMAKE_INSTALL_PREFIX=../dist \
-DLLVM_SOURCE_DIR=../llvm-project \
-DLLVM_TARGETS_TO_BUILD="NVPTX" \
-DLLVM_ENABLE_PROJECTS="clang" \
-DCMAKE_BUILD_TYPE=Release \
-Thost=x64 \
../llvm-project/llvm
```
Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build project `INSTALL`.
3. Ensure [`CUDA`](https://developer.nvidia.com/cuda-toolkit-archive) of minimum version 7.0 is installed.
@@ -161,7 +215,7 @@ To run it:
5. Ensure [`python`](https://www.python.org/downloads) of minimum required version 2.7 is installed.
6. Ensure `lit` and `FileCheck` are installed - these are distributed with LLVM.
6. Ensure `lit` and `FileCheck` are installed - these are distributed with `LLVM`.
* Install `lit` into `python`:
@@ -185,23 +239,15 @@ To run it:
7. Set `HIPIFY_CLANG_TESTS` option turned on: `-DHIPIFY_CLANG_TESTS=1`.
8. Run `cmake`:
* [***Linux***](#linux)
* [***Windows***](#windows)
8. Build and run tests:
9. Run tests:
- ***Linux***: `make test-hipify`.
- ***Windows***: run `Visual Studio 16 2019`, open the generated `hipify-clang.sln`, build project `test-hipify`.
### <a name="linux"></a >Linux
### <a name="Linux"></a > hipify-clang: Linux
On Linux the following configurations are tested:
Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.5.32
Ubuntu 16-18: LLVM 8.0.0 - 9.0.1, CUDA 8.0 - 10.1 Update 2, cudnn-5.1.10 - cudnn-7.6.5.32
Ubuntu 16-18: LLVM 8.0.0 - 10.0.0-rc1, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32
Minimum build system requirements for the above configurations:
@@ -209,7 +255,7 @@ Python 2.7, cmake 3.5.1, GNU C/C++ 5.4.0.
Here is an example of building `hipify-clang` with testing support on `Ubuntu 16.04`:
```shell
```bash
cmake
-DHIPIFY_CLANG_TESTS=1 \
-DCMAKE_BUILD_TYPE=Release \
@@ -345,7 +391,7 @@ Testing Time: 3.07s
Expected Passes : 67
[100%] Built target test-hipify
```
### <a name="windows"></a >Windows
### <a name="windows"></a > hipify-clang: Windows
On Windows 10 the following configurations are tested:
@@ -353,11 +399,11 @@ LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn 5.1.10 - 7.1.4.18
LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn 7.0.5.15 - 7.6.5.32
LLVM 7.0.0 - 9.0.1, CUDA 7.5 - 10.1 Update 2, cudnn 7.0.5.15 - 7.6.5.32
LLVM 7.0.0 - 10.0.0-rc1, CUDA 7.5 - 10.2, cudnn 7.0.5.15 - 7.6.5.32
Build system requirements for the latest configuration LLVM 9.0.1/CUDA 10.1 Update 2:
Build system requirements for the latest stable configuration LLVM 9.0.1/CUDA 10.1 Update 2:
Python 3.6.0 - 3.8.1, cmake 3.5.1 - 3.16.2, Visual Studio 2017 (15.5.2) - 2019 (16.4.2).
Python 3.6.0 - 3.8.1, cmake 3.5.1 - 3.16.3, Visual Studio 2017 (15.5.2) - 2019 (16.4.4).
Here is an example of building `hipify-clang` with testing support on `Windows 10` by `Visual Studio 16 2019`:
@@ -392,45 +438,62 @@ cmake
-- Build files have been written to: f:/HIP/hipify-clang/build
```
## <a name="running-and-using-hipify-clang"></a> Running and using hipify-clang
Run `Visual Studio 16 2019`, open the generated `hipify-clang.sln`, build project `test-hipify`.
To process a file, `hipify-clang` needs access to the same headers that would be needed to compile it with clang.
## <a name="perl"></a> hipify-perl
For example:
`hipify-perl` is autogenerated perl-based script which heavily uses regular expressions.
**Advantages:**
1. Ease in use.
2. It doesn't check the input source CUDA code for correctness.
3. It doesn't have dependencies on 3rd party tools, including CUDA.
**Disadvantages:**
1. Current disability (and difficulty in implementing) of transforming the following constructs:
* macros expansion;
* namespaces:
- redefines of CUDA entities in user namespaces;
- using directive;
* templates (some cases);
* device/host function calls distinguishing;
* header files correct injection;
* complicated argument lists parsing.
2. Difficulties in supporting.
### <a name="hipify-perl-usage"></a> hipify-perl: usage
```shell
./hipify-clang square.cu --cuda-path=/usr/local/cuda-10.1 -I /usr/local/cuda-10.1/samples/common/inc
perl hipify-perl square.cu > square.cu.hip
```
`hipify-clang` arguments are given first, followed by a separator, and then the arguments you'd pass to `clang` if you
were compiling the input file. The [Clang manual for compiling CUDA](https://llvm.org/docs/CompileCudaWithLLVM.html#compiling-cuda-code)
may be useful.
### <a name="hipify-perl-building"></a> hipify-perl: building
For a list of `hipify-clang` options, run `hipify-clang --help`.
To generate `hipify-perl`, run `hipify-clang --perl`. Output directory for the generated `hipify-perl` file might be specified by `--o-hipify-perl-dir` option.
### <a name="perl"></a> hipify-perl
## <a name="cuda-apis"></a> Supported CUDA APIs
To produce a Perl-based script `hipify-perl`, run `hipify-clang --perl`.
The `hipify-perl` script, unlike the `hipify-clang`, being based on regular expressions, and not on an abstract syntax tree, has several gaps:
1. macros expansion;
2. namespaces:
- redefines of CUDA entities in user namespaces;
- using directive;
3. templates (some cases);
4. device/host function calls distinguishing;
5. header files correct injection;
6. complicated argument lists parsing.
Nonetheless, `hipify-perl` is easy in use and doesn't check the input source CUDA code for correctness.
- [Runtime API](../docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md)
- [Driver API](../docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md)
- [cuComplex API](../docs/markdown/cuComplex_API_supported_by_HIP.md)
- [cuBLAS](../docs/markdown/CUBLAS_API_supported_by_HIP.md)
- [cuRAND](../docs/markdown/CURAND_API_supported_by_HIP.md)
- [cuDNN](../docs/markdown/CUDNN_API_supported_by_HIP.md)
- [cuFFT](../docs/markdown/CUFFT_API_supported_by_HIP.md)
- [cuSPARSE](../docs/markdown/CUSPARSE_API_supported_by_HIP.md)
## <a name="disclaimer"></a> Disclaimer
@@ -438,5 +501,4 @@ The information contained herein is for informational purposes only, and is subj
AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies.
Copyright (c) 2014-2019 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2014-2020 Advanced Micro Devices, Inc. All rights reserved.
+1 -1
View File
@@ -48,7 +48,7 @@ endif()
set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
set(CPACK_GENERATOR "TGZ;DEB;RPM")
set(CPACK_BINARY_DEB "ON")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "cuda >= 7.0")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "cuda (>= 7.0)")
set(CPACK_BINARY_RPM "ON")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
set(CPACK_RPM_PACKAGE_AUTOREQPROV "NO")
+2 -2
View File
@@ -213,7 +213,7 @@ StringRef readSourceText(clang::SourceManager &SM, const clang::SourceRange &exp
*/
std::string stringifyZeroDefaultedArg(clang::SourceManager &SM, const clang::Expr *arg) {
if (clang::isa<clang::CXXDefaultArgExpr>(arg)) return "0";
else return readSourceText(SM, arg->getSourceRange());
else return std::string(readSourceText(SM, arg->getSourceRange()));
}
} // anonymous namespace
@@ -427,7 +427,7 @@ bool HipifyAction::cudaSharedIncompleteArrayVar(const mat::MatchFinder::MatchRes
clang::LangOptions LO;
LO.CUDA = true;
clang::PrintingPolicy policy(LO);
typeName = BT->getName(policy);
typeName = std::string(BT->getName(policy));
}
} else {
typeName = QT.getAsString();
+1 -1
View File
@@ -44,7 +44,7 @@ ct::Replacements &getReplacements(ct::RefactoringTool &Tool, StringRef file) {
#if LLVM_VERSION_MAJOR > 3
// getReplacements() now returns a map from filename to Replacements - so create an entry
// for this source file and return a reference to it.
return Tool.getReplacements()[file];
return Tool.getReplacements()[std::string(file)];
#else
return Tool.getReplacements();
#endif
+9 -4
View File
@@ -108,7 +108,7 @@ void sortInputFiles(int argc, const char **argv, std::vector<std::string> &files
files.assign(sortedFiles.begin(), sortedFiles.end());
}
void appendArgumentsAdjusters(ct::RefactoringTool &Tool, const std::string &sSourceAbsPath) {
void appendArgumentsAdjusters(ct::RefactoringTool &Tool, const std::string &sSourceAbsPath, const char *hipify_exe) {
if (!IncludeDirs.empty()) {
for (std::string s : IncludeDirs) {
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(s.c_str(), ct::ArgumentInsertPosition::BEGIN));
@@ -122,9 +122,14 @@ void appendArgumentsAdjusters(ct::RefactoringTool &Tool, const std::string &sSou
}
}
// Includes for clang's CUDA wrappers for using by packaged hipify-clang
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("./include", ct::ArgumentInsertPosition::BEGIN));
static int Dummy;
std::string hipify = llvm::sys::fs::getMainExecutable(hipify_exe, (void *)&Dummy);
std::string clang_inc_path = std::string(llvm::sys::path::parent_path(hipify));
clang_inc_path.append("/include");
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(clang_inc_path.c_str(), ct::ArgumentInsertPosition::BEGIN));
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-isystem", ct::ArgumentInsertPosition::BEGIN));
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("./include/cuda_wrappers", ct::ArgumentInsertPosition::BEGIN));
clang_inc_path.append("/cuda_wrappers");
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(clang_inc_path.c_str(), ct::ArgumentInsertPosition::BEGIN));
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-isystem", ct::ArgumentInsertPosition::BEGIN));
// Ensure at least c++11 is used.
std::string stdCpp = "-std=c++11";
@@ -315,7 +320,7 @@ int main(int argc, const char **argv) {
ct::RefactoringTool Tool(OptionsParser.getCompilations(), std::string(tmpFile.c_str()));
ct::Replacements &replacementsToUse = llcompat::getReplacements(Tool, tmpFile.c_str());
ReplacementsFrontendActionFactory<HipifyAction> actionFactory(&replacementsToUse);
appendArgumentsAdjusters(Tool, sSourceAbsPath);
appendArgumentsAdjusters(Tool, sSourceAbsPath, argv[0]);
Statistics &currentStat = Statistics::current();
// Hipify _all_ the things!
if (Tool.runAndSave(&actionFactory)) {
+3 -2
View File
@@ -43,9 +43,10 @@ typedef struct hipChannelFormatDesc {
enum hipChannelFormatKind f;
}hipChannelFormatDesc;
#define HIP_TRSF_NORMALIZED_COORDINATES 0x01
#define HIP_TRSF_READ_AS_INTEGER 0x00
#define HIP_TRSA_OVERRIDE_FORMAT 0x01
#define HIP_TRSF_READ_AS_INTEGER 0x01
#define HIP_TRSF_NORMALIZED_COORDINATES 0x02
#define HIP_TRSF_SRGB 0x10
typedef enum hipArray_Format {
HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01,
@@ -154,20 +154,6 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
dynSharedMemPerBlk, blockSizeLimit);
}
template <typename F>
inline
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(uint32_t* numBlocks, F kernel,
uint32_t blockSize, size_t dynSharedMemPerBlk) {
using namespace hip_impl;
hip_impl::hip_init();
auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
target_agent(0));
return hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynSharedMemPerBlk);
}
template <typename... Args, typename F = void (*)(Args...)>
inline
void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
@@ -106,7 +106,7 @@ __CG_STATIC_QUALIFIER__ uint32_t size() {
}
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
// Compute global id of the workgroup to which the current threads belongs to
// Compute global id of the workgroup to which the current thread belongs to
uint32_t blkIdx =
(uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) +
(hipBlockIdx_y * hipGridDim_x) +
@@ -115,7 +115,7 @@ __CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
// Compute total number of threads being passed to reach current workgroup
// within grid
uint32_t num_threads_till_current_workgroup =
(uint32_t)(blkIdx * (hipBlockIdx_x * hipBlockIdx_y * hipBlockIdx_z));
(uint32_t)(blkIdx * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
// Compute thread local rank within current workgroup
uint32_t local_thread_rank =
+16 -17
View File
@@ -2948,7 +2948,7 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
* @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
*/
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk);
int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk);
/**
* @brief Returns occupancy for a device function.
@@ -2960,7 +2960,7 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
* @param [in] flags Extra flags for occupancy calculation (currently ignored)
*/
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
#if __HIP_VDI__ && !defined(__HCC__)
/**
@@ -3230,21 +3230,6 @@ hipError_t hipLaunchKernel(const void* function_address,
} /* extern "c" */
#endif
#if defined(__cplusplus) && !defined(__HCC__) && defined(__clang__) && defined(__HIP__)
template <typename F>
static hipError_t __host__ inline hipOccupancyMaxActiveBlocksPerMultiprocessor(
uint32_t* numBlocks, F func, uint32_t blockSize, size_t dynSharedMemPerBlk) {
return ::hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, (hipFunction_t)func, blockSize,
dynSharedMemPerBlk);
}
template <typename F>
static hipError_t __host__ inline hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
uint32_t* numBlocks, F func, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags) {
return ::hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
numBlocks, (hipFunction_t)func, blockSize, dynSharedMemPerBlk, flags);
}
#endif // defined(__cplusplus) && !defined(__HCC__) && defined(__clang__) && defined(__HIP__)
#if USE_PROF_API
#include <hip/hcc_detail/hip_prof_str.h>
#endif
@@ -3385,6 +3370,20 @@ hipError_t hipBindTextureToMipmappedArray(const texture<T, dim, readMode>& tex,
return hipSuccess;
}
template <class T>
inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk) {
return hipOccupancyMaxActiveBlocksPerMultiprocessor(
numBlocks, reinterpret_cast<const void*>(f), blockSize, dynSharedMemPerBlk);
}
template <class T>
inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags) {
return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
numBlocks, reinterpret_cast<const void*>(f), blockSize, dynSharedMemPerBlk, flags);
}
#if __HIP_VDI__ && !defined(__HCC__)
template <class T>
inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
+2
View File
@@ -124,6 +124,7 @@ typedef struct hipDeviceProp_t {
unsigned int* hdpRegFlushCntl; ///< Addres of HDP_REG_COHERENCY_FLUSH_CNTL register
size_t memPitch; ///<Maximum pitch in bytes allowed by memory copies
size_t textureAlignment; ///<Alignment requirement for textures
size_t texturePitchAlignment; ///<Pitch alignment requirement for texture references bound to pitched memory
int kernelExecTimeoutEnabled; ///<Run time limit for kernels executed on the device
int ECCEnabled; ///<Device has ECC support enabled
int tccDriver; ///< 1:If device is Tesla device using TCC driver, else 0
@@ -321,6 +322,7 @@ typedef enum hipDeviceAttribute_t {
hipDeviceAttributeMaxPitch, ///< Maximum pitch in bytes allowed by memory copies
hipDeviceAttributeTextureAlignment, ///<Alignment requirement for textures
hipDeviceAttributeTexturePitchAlignment, ///<Pitch alignment requirement for 2D texture references bound to pitched memory;
hipDeviceAttributeKernelExecTimeout, ///<Run time limit for kernels executed on the device
hipDeviceAttributeCanMapHostMemory, ///<Device can map host memory into device address space
hipDeviceAttributeEccEnabled ///<Device has ECC support enabled
@@ -1130,6 +1130,7 @@ inline static hipError_t hipGetDeviceProperties(hipDeviceProp_t* p_prop, int dev
p_prop->memPitch = cdprop.memPitch;
p_prop->textureAlignment = cdprop.textureAlignment;
p_prop->texturePitchAlignment = cdprop.texturePitchAlignment;
p_prop->kernelExecTimeoutEnabled = cdprop.kernelExecTimeoutEnabled;
p_prop->ECCEnabled = cdprop.ECCEnabled;
p_prop->tccDriver = cdprop.tccDriver;
@@ -1244,6 +1245,9 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
case hipDeviceAttributeTextureAlignment:
cdattr = cudaDevAttrTextureAlignment;
break;
case hipDeviceAttributeTexturePitchAlignment:
cdattr = cudaDevAttrTexturePitchAlignment;
break;
case hipDeviceAttributeKernelExecTimeout:
cdattr = cudaDevAttrKernelExecTimeout;
break;
+1 -1
View File
@@ -3,6 +3,6 @@
ROCMDIR=@ROCM_PATH@
HIPDIR=$ROCMDIR/hip
if [ -d $ROCMDIR]
if [ -d $ROCMDIR] ; then
ln -s -f $ROCMDIR /opt/rocm
fi
+1 -1
View File
@@ -1,5 +1,5 @@
#!/bin/bash
if [ -L "/opt/rocm" ]
if [ -L "/opt/rocm" ] ; then
unlink /opt/rocm
fi
+6 -7
View File
@@ -2,19 +2,18 @@ HIP_PATH?= $(wildcard /opt/rocm/hip)
ifeq (,$(HIP_PATH))
HIP_PATH=../../..
endif
HIPCC=$(HIP_PATH)/bin/hipcc
HIPCC=$(HIP_PATH)/bin/hipcc -std=c++11
EXE=hipDispatchLatency
CXXFLAGS = -O3
all: ${EXE}
all: test_kernel.code ${EXE}
$(EXE): hipDispatchLatency.cpp ResultDatabase.cpp
$(HIPCC) $(CXXFLAGS) hipDispatchLatency.cpp ResultDatabase.cpp -o $@
install: $(EXE)
cp $(EXE) $(HIP_PATH)/bin
$(EXE): hipDispatchLatency.cpp
$(HIPCC) $(CXXFLAGS) hipDispatchLatency.cpp -o $@
test_kernel.code: test_kernel.cpp
$(HIP_PATH)/bin/hipcc --genco $(GENCO_FLAGS) $^ -o $@
clean:
rm -f *.o $(EXE)
@@ -1,473 +0,0 @@
#include "ResultDatabase.h"
#include <cfloat>
#include <algorithm>
#include <cmath>
#include <iomanip>
using namespace std;
#define SORT_BY_NAME 0
#define SORT_RETAIN_ATTS_ORDER 1
bool ResultDatabase::Result::operator<(const Result& rhs) const {
if (test < rhs.test) return true;
if (test > rhs.test) return false;
#if (SORT_RETAIN_ATTS_ORDER == 0)
// For ties, sort by the value of the attribute:
if (atts < rhs.atts) return true;
if (atts > rhs.atts) return false;
#endif
return false; // less-operator returns false on equal
}
double ResultDatabase::Result::GetMin() const {
double r = FLT_MAX;
for (int i = 0; i < value.size(); i++) {
r = min(r, value[i]);
}
return r;
}
double ResultDatabase::Result::GetMax() const {
double r = -FLT_MAX;
for (int i = 0; i < value.size(); i++) {
r = max(r, value[i]);
}
return r;
}
double ResultDatabase::Result::GetMedian() const { return GetPercentile(50); }
double ResultDatabase::Result::GetPercentile(double q) const {
int n = value.size();
if (n == 0) return FLT_MAX;
if (n == 1) return value[0];
if (q <= 0) return value[0];
if (q >= 100) return value[n - 1];
double index = ((n + 1.) * q / 100.) - 1;
vector<double> sorted = value;
sort(sorted.begin(), sorted.end());
if (n == 2) return (sorted[0] * (1 - q / 100.) + sorted[1] * (q / 100.));
int index_lo = int(index);
double frac = index - index_lo;
if (frac == 0) return sorted[index_lo];
double lo = sorted[index_lo];
double hi = sorted[index_lo + 1];
return lo + (hi - lo) * frac;
}
double ResultDatabase::Result::GetMean() const {
double r = 0;
for (int i = 0; i < value.size(); i++) {
r += value[i];
}
return r / double(value.size());
}
double ResultDatabase::Result::GetStdDev() const {
double r = 0;
double u = GetMean();
if (u == FLT_MAX) return FLT_MAX;
for (int i = 0; i < value.size(); i++) {
r += (value[i] - u) * (value[i] - u);
}
r = sqrt(r / value.size());
return r;
}
void ResultDatabase::AddResults(const string& test, const string& atts, const string& unit,
const vector<double>& values) {
for (int i = 0; i < values.size(); i++) {
AddResult(test, atts, unit, values[i]);
}
}
static string RemoveAllButLeadingSpaces(const string& a) {
string b;
int n = a.length();
int i = 0;
while (i < n && a[i] == ' ') {
b += a[i];
++i;
}
for (; i < n; i++) {
if (a[i] != ' ' && a[i] != '\t') b += a[i];
}
return b;
}
void ResultDatabase::AddResult(const string& test_orig, const string& atts_orig,
const string& unit_orig, double value) {
string test = RemoveAllButLeadingSpaces(test_orig);
string atts = RemoveAllButLeadingSpaces(atts_orig);
string unit = RemoveAllButLeadingSpaces(unit_orig);
int index;
for (index = 0; index < results.size(); index++) {
if (results[index].test == test && results[index].atts == atts) {
if (results[index].unit != unit) throw "Internal error: mixed units";
break;
}
}
if (index >= results.size()) {
Result r;
r.test = test;
r.atts = atts;
r.unit = unit;
results.push_back(r);
}
results[index].value.push_back(value);
}
// ****************************************************************************
// Method: ResultDatabase::DumpDetailed
//
// Purpose:
// Writes the full results, including all trials.
//
// Arguments:
// out where to print
//
// Programmer: Jeremy Meredith
// Creation: August 14, 2009
//
// Modifications:
// Jeremy Meredith, Wed Nov 10 14:25:17 EST 2010
// Renamed to DumpDetailed to make room for a DumpSummary.
//
// Jeremy Meredith, Thu Nov 11 11:39:57 EST 2010
// Added note about (*) missing value tag.
//
// Jeremy Meredith, Tue Nov 23 13:57:02 EST 2010
// Changed note about missing values to be worded a little better.
//
// ****************************************************************************
void ResultDatabase::DumpDetailed(ostream& out) {
vector<Result> sorted(results);
#if SORT_BY_NAME
stable_sort(sorted.begin(), sorted.end());
#endif
const int testNameW = 24;
const int attW = 12;
const int fieldW = 11;
out << std::fixed << right << std::setprecision(4);
int maxtrials = 1;
for (int i = 0; i < sorted.size(); i++) {
if (sorted[i].value.size() > maxtrials) maxtrials = sorted[i].value.size();
}
// TODO: in big parallel runs, the "trials" are the procs
// and we really don't want to print them all out....
out << setw(testNameW) << "test\t" << setw(attW) << "atts\t" << setw(fieldW) << "median\t"
<< "mean\t"
<< "stddev\t"
<< "min\t"
<< "max\t";
for (int i = 0; i < maxtrials; i++) out << "trial" << i << "\t";
out << endl;
for (int i = 0; i < sorted.size(); i++) {
Result& r = sorted[i];
out << setw(testNameW) << r.test + "\t";
out << setw(attW) << r.atts + "\t";
out << setw(fieldW) << r.unit + "\t";
if (r.GetMedian() == FLT_MAX)
out << "N/A\t";
else
out << r.GetMedian() << "\t";
if (r.GetMean() == FLT_MAX)
out << "N/A\t";
else
out << r.GetMean() << "\t";
if (r.GetStdDev() == FLT_MAX)
out << "N/A\t";
else
out << r.GetStdDev() << "\t";
if (r.GetMin() == FLT_MAX)
out << "N/A\t";
else
out << r.GetMin() << "\t";
if (r.GetMax() == FLT_MAX)
out << "N/A\t";
else
out << r.GetMax() << "\t";
for (int j = 0; j < r.value.size(); j++) {
if (r.value[j] == FLT_MAX)
out << "N/A\t";
else
out << r.value[j] << "\t";
}
out << endl;
}
if (0) {
out << endl
<< "Note: Any results marked with (*) had missing values." << endl
<< " This can occur on systems with a mixture of" << endl
<< " device types or architectural capabilities." << endl;
}
}
// ****************************************************************************
// Method: ResultDatabase::DumpDetailed
//
// Purpose:
// Writes the summary results (min/max/stddev/med/mean), but not
// every individual trial.
//
// Arguments:
// out where to print
//
// Programmer: Jeremy Meredith
// Creation: November 10, 2010
//
// Modifications:
// Jeremy Meredith, Thu Nov 11 11:39:57 EST 2010
// Added note about (*) missing value tag.
//
// ****************************************************************************
void ResultDatabase::DumpSummary(ostream& out) {
vector<Result> sorted(results);
#if SORT_BY_NAME
stable_sort(sorted.begin(), sorted.end());
#endif
const int testNameW = 32;
const int attW = 12;
const int fieldW = 9;
out << std::fixed << right << std::setprecision(2);
// TODO: in big parallel runs, the "trials" are the procs
// and we really don't want to print them all out....
out << setw(testNameW) << "test\t" << setw(attW) << "atts\t" << setw(fieldW) << "units\t"
<< "median\t"
<< "mean\t"
<< "stddev\t"
<< "min\t"
<< "max\t";
out << endl;
for (int i = 0; i < sorted.size(); i++) {
Result& r = sorted[i];
out << setw(testNameW) << r.test + "\t";
out << setw(attW) << r.atts + "\t";
out << setw(fieldW) << r.unit + "\t";
if (r.GetMedian() == FLT_MAX)
out << "N/A\t";
else
out << r.GetMedian() << "\t";
if (r.GetMean() == FLT_MAX)
out << "N/A\t";
else
out << r.GetMean() << "\t";
if (r.GetStdDev() == FLT_MAX)
out << "N/A\t";
else
out << r.GetStdDev() << "\t";
if (r.GetMin() == FLT_MAX)
out << "N/A\t";
else
out << r.GetMin() << "\t";
if (r.GetMax() == FLT_MAX)
out << "N/A\t";
else
out << r.GetMax() << "\t";
out << endl;
}
if (0) {
out << endl
<< "Note: results marked with (*) had missing values such as" << endl
<< "might occur with a mixture of architectural capabilities." << endl;
}
}
// ****************************************************************************
// Method: ResultDatabase::ClearAllResults
//
// Purpose:
// Clears all existing results from the ResultDatabase; used for multiple passes
// of the same test or multiple tests.
//
// Arguments:
//
// Programmer: Jeffrey Young
// Creation: September 10th, 2014
//
// Modifications:
//
//
// ****************************************************************************
void ResultDatabase::ClearAllResults() { results.clear(); }
// ****************************************************************************
// Method: ResultDatabase::DumpCsv
//
// Purpose:
// Writes either detailed or summary results (min/max/stddev/med/mean), but not
// every individual trial.
//
// Arguments:
// out file to print CSV results
//
// Programmer: Jeffrey Young
// Creation: August 28th, 2014
//
// Modifications:
//
// ****************************************************************************
void ResultDatabase::DumpCsv(string fileName) {
bool emptyFile;
vector<Result> sorted(results);
#if SORT_BY_NAME
stable_sort(sorted.begin(), sorted.end());
#endif
// Check to see if the file is empty - if so, add the headers
emptyFile = this->IsFileEmpty(fileName);
// Open file and append by default
ofstream out;
out.open(fileName.c_str(), std::ofstream::out | std::ofstream::app);
// Add headers only for empty files
if (emptyFile) {
// TODO: in big parallel runs, the "trials" are the procs
// and we really don't want to print them all out....
out << "test, "
<< "atts, "
<< "units, "
<< "median, "
<< "mean, "
<< "stddev, "
<< "min, "
<< "max, ";
out << endl;
}
for (int i = 0; i < sorted.size(); i++) {
Result& r = sorted[i];
out << r.test << ", ";
out << r.atts << ", ";
out << r.unit << ", ";
if (r.GetMedian() == FLT_MAX)
out << "N/A, ";
else
out << r.GetMedian() << ", ";
if (r.GetMean() == FLT_MAX)
out << "N/A, ";
else
out << r.GetMean() << ", ";
if (r.GetStdDev() == FLT_MAX)
out << "N/A, ";
else
out << r.GetStdDev() << ", ";
if (r.GetMin() == FLT_MAX)
out << "N/A, ";
else
out << r.GetMin() << ", ";
if (r.GetMax() == FLT_MAX)
out << "N/A, ";
else
out << r.GetMax() << ", ";
out << endl;
}
out << endl;
out.close();
}
// ****************************************************************************
// Method: ResultDatabase::IsFileEmpty
//
// Purpose:
// Returns whether a file is empty - used as a helper for CSV printing
//
// Arguments:
// file The input file to check for emptiness
//
// Programmer: Jeffrey Young
// Creation: August 28th, 2014
//
// Modifications:
//
// ****************************************************************************
bool ResultDatabase::IsFileEmpty(string fileName) {
bool fileEmpty;
ifstream file(fileName.c_str());
// If the file doesn't exist it is by definition empty
if (!file.good()) {
return true;
} else {
fileEmpty = (bool)(file.peek() == ifstream::traits_type::eof());
file.close();
return fileEmpty;
}
// Otherwise, return false
return false;
}
// ****************************************************************************
// Method: ResultDatabase::GetResultsForTest
//
// Purpose:
// Returns a vector of results for just one test name.
//
// Arguments:
// test the name of the test results to search for
//
// Programmer: Jeremy Meredith
// Creation: December 3, 2010
//
// Modifications:
//
// ****************************************************************************
vector<ResultDatabase::Result> ResultDatabase::GetResultsForTest(const string& test) {
// get only the given test results
vector<Result> retval;
for (int i = 0; i < results.size(); i++) {
Result& r = results[i];
if (r.test == test) retval.push_back(r);
}
return retval;
}
// ****************************************************************************
// Method: ResultDatabase::GetResults
//
// Purpose:
// Returns all the results.
//
// Arguments:
//
// Programmer: Jeremy Meredith
// Creation: December 3, 2010
//
// Modifications:
//
// ****************************************************************************
const vector<ResultDatabase::Result>& ResultDatabase::GetResults() const { return results; }
@@ -1,89 +0,0 @@
#ifndef RESULT_DATABASE_H
#define RESULT_DATABASE_H
#include <string>
#include <vector>
#include <iostream>
#include <fstream>
#include <cfloat>
using std::ifstream;
using std::ofstream;
using std::ostream;
using std::string;
using std::vector;
// ****************************************************************************
// Class: ResultDatabase
//
// Purpose:
// Track numerical results as they are generated.
// Print statistics of raw results.
//
// Programmer: Jeremy Meredith
// Creation: June 12, 2009
//
// Modifications:
// Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010
// Split timing reports into detailed and summary. E.g. for serial code,
// we might report all trial values, but skip them in parallel.
//
// Jeremy Meredith, Thu Nov 11 11:40:18 EST 2010
// Added check for missing value tag.
//
// Jeremy Meredith, Mon Nov 22 13:37:10 EST 2010
// Added percentile statistic.
//
// Jeremy Meredith, Fri Dec 3 16:30:31 EST 2010
// Added a method to extract a subset of results based on test name. Also,
// the Result class is now public, so that clients can use them directly.
// Added a GetResults method as well, and made several functions const.
//
// ****************************************************************************
class ResultDatabase {
public:
//
// A performance result for a single SHOC benchmark run.
//
struct Result {
string test; // e.g. "readback"
string atts; // e.g. "pagelocked 4k^2"
string unit; // e.g. "MB/sec"
vector<double> value; // e.g. "837.14"
double GetMin() const;
double GetMax() const;
double GetMedian() const;
double GetPercentile(double q) const;
double GetMean() const;
double GetStdDev() const;
bool operator<(const Result& rhs) const;
bool HadAnyFLTMAXValues() const {
for (int i = 0; i < value.size(); ++i) {
if (value[i] >= FLT_MAX) return true;
}
return false;
}
};
protected:
vector<Result> results;
public:
void AddResult(const string& test, const string& atts, const string& unit, double value);
void AddResults(const string& test, const string& atts, const string& unit,
const vector<double>& values);
vector<Result> GetResultsForTest(const string& test);
const vector<Result>& GetResults() const;
void ClearAllResults();
void DumpDetailed(ostream&);
void DumpSummary(ostream&);
void DumpCsv(string fileName);
private:
bool IsFileEmpty(string fileName);
};
#endif
@@ -1,16 +1,13 @@
/*
Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -21,142 +18,134 @@ THE SOFTWARE.
*/
#include "hip/hip_runtime.h"
#ifdef __HIP_PLATFORM_HCC__
#include "hip/hip_ext.h"
#endif
#include <iostream>
#include <time.h>
#include "ResultDatabase.h"
#define PRINT_PROGRESS 0
#define check(cmd) \
{ \
hipError_t status = cmd; \
if (status != hipSuccess) { \
printf("error: '%s'(%d) from %s at %s:%d\n", hipGetErrorString(status), status, #cmd, \
__FILE__, __LINE__); \
abort(); \
} \
}
#define LEN 1024 * 1024
#include <chrono>
#include <algorithm>
#define NUM_GROUPS 1
#define GROUP_SIZE 64
#define TEST_ITERS 20
#define DISPATCHES_PER_TEST 100
#define GROUP_SIZE 1
#define WARMUP_RUN_COUNT 10
#define TIMING_RUN_COUNT 100
#define TOTAL_RUN_COUNT WARMUP_RUN_COUNT + TIMING_RUN_COUNT
#define BATCH_SIZE 1000
const unsigned p_tests = 0xfffffff;
#define FILE_NAME "test_kernel.code"
#define KERNEL_NAME "test"
__global__ void EmptyKernel() { }
// HCC optimizes away fully NULL kernel calls, so run one that is nearly null:
__global__ void NearlyNull(float* Ad) {
if (Ad) {
Ad[0] = 42;
}
void print_timing(std::string test, const std::array<float, TOTAL_RUN_COUNT> &results, int batch = 1) {
float total_us = 0.0f, mean_us = 0.0f, stddev_us = 0.0f;
// skip warm-up runs
auto start_iter = std::next(results.begin(), WARMUP_RUN_COUNT);
auto end_iter = results.end();
// mean
std::for_each(start_iter, end_iter, [&](const float &run_ms) {
total_us += (run_ms * 1000) / batch;
});
mean_us = total_us / TIMING_RUN_COUNT;
// stddev
total_us = 0;
std::for_each(start_iter, end_iter, [&](const float &run_ms) {
float dev_us = ((run_ms * 1000) / batch) - mean_us;
total_us += dev_us * dev_us;
});
stddev_us = sqrt(total_us / TIMING_RUN_COUNT);
// display
printf("\n %s: %.1f us, std: %.1f us\n", test.c_str(), mean_us, stddev_us);
}
ResultDatabase resultDB;
void stopTest(hipEvent_t start, hipEvent_t stop, const char* msg, int iters) {
float mS = 0;
check(hipEventRecord(stop));
check(hipDeviceSynchronize());
check(hipEventElapsedTime(&mS, start, stop));
resultDB.AddResult(std::string(msg), "", "uS", mS * 1000 / iters);
if (PRINT_PROGRESS & 0x1) {
std::cout << msg << "\t\t" << mS * 1000 / iters << " uS" << std::endl;
}
if (PRINT_PROGRESS & 0x2) {
resultDB.DumpSummary(std::cout);
}
}
int main() {
hipError_t err;
float* Ad;
check(hipMalloc(&Ad, 4));
hipStream_t stream;
check(hipStreamCreate(&stream));
hipEvent_t start, sync, stop;
check(hipEventCreate(&start));
check(hipEventCreateWithFlags(&sync, hipEventBlockingSync));
check(hipEventCreate(&stop));
int main() {
hipStream_t stream0 = 0;
hipDevice_t device;
hipDeviceGet(&device, 0);
hipCtx_t context;
hipCtxCreate(&context, 0, device);
hipModule_t module;
hipFunction_t function;
hipModuleLoad(&module, FILE_NAME);
hipModuleGetFunction(&function, module, KERNEL_NAME);
void* params = nullptr;
std::array<float, TOTAL_RUN_COUNT> results;
hipEvent_t start, stop;
hipEventCreate(&start);
hipEventCreate(&stop);
/************************************************************************************/
/* HIP kernel launch enqueue rate: */
/* Measure time taken to enqueue a kernel on the GPU */
/************************************************************************************/
if (p_tests & 0x1) {
hipEventRecord(start);
hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
stopTest(start, stop, "FirstKernelLaunch", 1);
// Timing hipModuleLaunchKernel
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
auto start = std::chrono::high_resolution_clock::now();
hipModuleLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, &params, nullptr);
auto stop = std::chrono::high_resolution_clock::now();
results[i] = std::chrono::duration<float, std::milli>(stop - start).count();
}
print_timing("hipModuleLaunchKernel enqueue rate", results);
if (p_tests & 0x2) {
hipEventRecord(start);
hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
stopTest(start, stop, "SecondKernelLaunch", 1);
// Timing hipLaunchKernelGGL
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
auto start = std::chrono::high_resolution_clock::now();
hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
auto stop = std::chrono::high_resolution_clock::now();
results[i] = std::chrono::duration<float, std::milli>(stop - start).count();
}
print_timing("hipLaunchKernelGGL enqueue rate", results);
/***********************************************************************************/
/* Single dispatch execution latency using HIP events: */
/* Measures latency to start & finish executing a kernel with GPU-scope visibility */
/***********************************************************************************/
if (p_tests & 0x4) {
for (int t = 0; t < TEST_ITERS; t++) {
hipEventRecord(start);
for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
hipEventRecord(sync);
hipEventSynchronize(sync);
}
stopTest(start, stop, "NullStreamASyncDispatchWait", DISPATCHES_PER_TEST);
}
}
if (p_tests & 0x10) {
for (int t = 0; t < TEST_ITERS; t++) {
hipEventRecord(start);
for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
hipEventRecord(sync);
hipEventSynchronize(sync);
}
stopTest(start, stop, "StreamASyncDispatchWait", DISPATCHES_PER_TEST);
}
}
#if 1
if (p_tests & 0x40) {
for (int t = 0; t < TEST_ITERS; t++) {
hipEventRecord(start);
for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
}
stopTest(start, stop, "NullStreamASyncDispatchNoWait", DISPATCHES_PER_TEST);
}
}
if (p_tests & 0x80) {
for (int t = 0; t < TEST_ITERS; t++) {
hipEventRecord(start);
for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
}
stopTest(start, stop, "StreamASyncDispatchNoWait", DISPATCHES_PER_TEST);
}
//Timing directly the dispatch
#ifdef __HIP_PLATFORM_HCC__
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
hipExtLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, start, stop, 0);
hipEventSynchronize(stop);
hipEventElapsedTime(&results[i], start, stop);
}
print_timing("Timing directly single dispatch latency", results);
#endif
resultDB.DumpSummary(std::cout);
//Timing around the dispatch
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
hipEventRecord(start, 0);
hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
hipEventElapsedTime(&results[i], start, stop);
}
print_timing("Timing around single dispatch latency", results);
check(hipEventDestroy(start));
check(hipEventDestroy(sync));
check(hipEventDestroy(stop));
/*********************************************************************************/
/* Batch dispatch execution latency using HIP events: */
/* Measures latency to start & finish executing each dispatch in a batch */
/*********************************************************************************/
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
hipEventRecord(start, 0);
for (int j = 0; j < BATCH_SIZE; j++) {
hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
}
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
hipEventElapsedTime(&results[i], start, stop);
}
print_timing("Batch dispatch latency", results, BATCH_SIZE);
hipEventDestroy(start);
hipEventDestroy(stop);
hipCtxDestroy(context);
}
@@ -0,0 +1,24 @@
/*
Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "hip/hip_runtime.h"
extern "C" __global__ void test() {
}
@@ -86,8 +86,8 @@ void launchKernel(float* C, float* A, float* B, bool manual){
printf("kernel Execution time = %6.3fms\n", eventMs);
//Calculate Occupancy
uint32_t numBlock = 0;
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply, blockSize, 0));
int numBlock = 0;
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply,(int)blockSize, 0));
if(devProp.maxThreadsPerMultiProcessor){
std::cout << "Theoretical Occupancy is " << (double)numBlock* blockSize/devProp.maxThreadsPerMultiProcessor * 100 << "%" << std::endl;
+4 -1
View File
@@ -312,9 +312,12 @@ hipError_t ihipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device
case hipDeviceAttributeMaxPitch:
*pi = prop->memPitch;
break;
case hipDeviceAttributeTextureAlignment:
case hipDeviceAttributeTextureAlignment:
*pi = prop->textureAlignment;
break;
case hipDeviceAttributeTexturePitchAlignment:
*pi = prop->texturePitchAlignment;
break;
case hipDeviceAttributeKernelExecTimeout:
*pi = prop->kernelExecTimeoutEnabled;
break;
+1
View File
@@ -925,6 +925,7 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) {
prop->memPitch = INT_MAX; //Maximum pitch in bytes allowed by memory copies (hardcoded 128 bytes in hipMallocPitch)
prop->textureAlignment = 0; //Alignment requirement for textures
prop->texturePitchAlignment = IMAGE_PITCH_ALIGNMENT; //Alignment requirment for texture pitch
prop->kernelExecTimeoutEnabled = 0; //no run time limit for running kernels on device
hsa_isa_t isa;
+13 -7
View File
@@ -1368,7 +1368,7 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
}
hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
TlsData *tls, int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk)
{
using namespace hip_impl;
@@ -1408,35 +1408,41 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
: std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
// Calculate blocks occupancy per CU based on SGPR usage
*numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));
*numBlocks = std::min(*numBlocks, (int) (sgprs_alu_occupancy / numWavefronts));
size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
if (total_used_lds != 0) {
// Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
*numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
*numBlocks = std::min(*numBlocks, (int) lds_occupancy);
}
return hipSuccess;
}
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk)
{
HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk);
auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
hip_impl::target_agent(0));
return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
tls, numBlocks, F, blockSize, dynSharedMemPerBlk));
}
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk,
int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk,
unsigned int flags)
{
HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags);
auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
hip_impl::target_agent(0));
return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
tls, numBlocks, F, blockSize, dynSharedMemPerBlk));
}
hipError_t hipLaunchKernel(
+1 -1
View File
@@ -61,7 +61,7 @@ namespace hip_impl {
if (it == impl->get_globals().end())
return nullptr;
else
return it->second;
return it->second.first;
}
hsa_executable_t program_state::load_executable(const char* data,
+53 -23
View File
@@ -18,6 +18,7 @@
#include <hsa/hsa_ext_amd.h>
#include <hsa/hsa_ven_amd_loader.h>
#include <amd_comgr.h>
#include "hc.hpp"
#include <link.h>
@@ -193,7 +194,8 @@ public:
std::tuple<
std::once_flag,
std::mutex,
std::unordered_map<std::string, void*>> globals;
// map from string to pair<global_addr, pinned_addr>
std::unordered_map<std::string, std::pair<void*, void*>>> globals;
using RAII_code_reader =
std::unique_ptr<hsa_code_object_reader_t,
@@ -308,7 +310,7 @@ public:
return symbol_addresses.second;
}
std::unordered_map<std::string, void*>& get_globals() {
std::unordered_map<std::string, std::pair<void*, void*>>& get_globals() {
std::call_once(std::get<0>(globals), [this]() {
std::get<2>(globals).reserve(get_symbol_addresses().size());
});
@@ -349,30 +351,52 @@ public:
auto& g_mutex = get_globals_mutex();
for (auto&& x : undefined_symbols) {
if (g.find(x) != g.cend()) return;
const auto it1 = get_symbol_addresses().find(x);
if (it1 == get_symbol_addresses().cend()) {
hip_throw(std::runtime_error{
"Global symbol: " + x + " is undefined."});
}
std::lock_guard<std::mutex> lck{g_mutex};
hsa_status_t status;
auto check_hsa_global_var_define_error = [&x](hsa_status_t s) {
if (s != HSA_STATUS_SUCCESS) {
const char* es;
hsa_status_string(s, &es);
hip_throw(std::runtime_error{ "Error when defining symbol " + x + " : " + es});
}
};
if (g.find(x) != g.cend()) return;
auto retrieve_pinned_address_from_cache = [](decltype(g) g, decltype(x) x) {
const auto& global_addr = g.find(x);
if (global_addr != g.cend()) {
return global_addr->second.second;
}
return (void*)nullptr;
};
g.emplace(x, (void*)(it1->second.first));
void* p = nullptr;
hsa_amd_memory_lock(
reinterpret_cast<void*>(it1->second.first),
it1->second.second,
nullptr, // All agents.
0,
&p);
hsa_executable_agent_global_variable_define(
executable, agent, x.c_str(), p);
void* p = retrieve_pinned_address_from_cache(g, x);
if (p == nullptr) {
std::lock_guard<std::mutex> lck{g_mutex};
p = retrieve_pinned_address_from_cache(g, x);
if (p == nullptr) {
if (x == "_ZN2hc13printf_bufferE") {
// This is the printf buffer, get the pinned address from HCC
p = Kalmar::getContext()->getPrintfBufferPointerVA();
}
else {
status = hsa_amd_memory_lock(reinterpret_cast<void*>(it1->second.first),
it1->second.second,
nullptr, // All agents.
0, &p);
check_hsa_global_var_define_error(status);
}
// cache the global address and its pinned address
g.emplace(x, std::make_pair(reinterpret_cast<void*>(it1->second.first), p));
}
}
status = hsa_executable_agent_global_variable_define(
executable, agent, x.c_str(), p);
check_hsa_global_var_define_error(status);
}
}
@@ -398,13 +422,19 @@ public:
move(file), move(tmp));
}
hsa_code_object_reader_create_from_memory(
it->first.data(), it->first.size(), it->second.get());
auto check_hsa_error = [](hsa_status_t s) {
if (s != HSA_STATUS_SUCCESS) {
hip_throw(std::runtime_error{"error when loading code object"});
}
};
hsa_executable_load_agent_code_object(
executable, agent, *it->second, nullptr, nullptr);
check_hsa_error(hsa_code_object_reader_create_from_memory(
it->first.data(), it->first.size(), it->second.get()));
hsa_executable_freeze(executable, nullptr);
check_hsa_error(hsa_executable_load_agent_code_object(
executable, agent, *it->second, nullptr, nullptr));
check_hsa_error(hsa_executable_freeze(executable, nullptr));
}
+7 -2
View File
@@ -30,7 +30,12 @@ THE SOFTWARE.
__global__ void run_printf() { printf("Hello World\n"); }
int main() {
hipLaunchKernelGGL(HIP_KERNEL_NAME(run_printf), dim3(1), dim3(1), 0, 0);
hipDeviceSynchronize();
int device_count = 0;
hipGetDeviceCount(&device_count);
for (int i = 0; i < device_count; ++i) {
hipSetDevice(i);
hipLaunchKernelGGL(HIP_KERNEL_NAME(run_printf), dim3(1), dim3(1), 0, 0);
hipDeviceSynchronize();
}
passed();
}
@@ -0,0 +1,212 @@
/*
Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Simple test for hipLaunchCooperativeKernelMultiDevice API.
/* HIT_START
* BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM all
* TEST: %t
* HIT_END
*/
#include "hip/hip_runtime.h"
#include "hip/hip_runtime_api.h"
#include <iostream>
#include <fstream>
#include <vector>
#include <thread>
#include <chrono>
#include "hip/hip_cooperative_groups.h"
#include "test_common.h"
using namespace std::chrono;
const static uint NumOfLoopIterrations = 16 * 1024;
const static uint BufferSizeInDwords = 28672 * NumOfLoopIterrations;
const static uint numQueues = 4;
const static uint numIter = 100;
constexpr uint NumKernelArgs = 4;
constexpr uint MaxGPUs = 8;
#include <stdio.h>
/*
namespace cg = cooperative_groups;
using namespace cooperative_groups;
*/
__global__ void test_gws(uint* buf, uint bufSize, long* tmpBuf, long* result)
{
extern __shared__ long tmp[];
uint groups = gridDim.x;
uint group_id = blockIdx.x;
uint local_id = threadIdx.x;
uint chunk = gridDim.x * blockDim.x;
uint i = group_id * blockDim.x + local_id;
long sum = 0;
while (i < bufSize) {
sum += buf[i];
i += chunk;
}
tmp[local_id] = sum;
__syncthreads();
i = 0;
if (local_id == 0) {
sum = 0;
while (i < blockDim.x) {
sum += tmp[i];
i++;
}
tmpBuf[group_id] = sum;
}
// wait
cooperative_groups::this_grid().sync();
if (((blockIdx.x * blockDim.x) + threadIdx.x) == 0) {
for (uint i = 1; i < groups; ++i) {
sum += tmpBuf[i];
}
//*result = sum;
result[1 + cooperative_groups::this_multi_grid().grid_rank()] = sum;
}
cooperative_groups::this_multi_grid().sync();
if (cooperative_groups::this_multi_grid().grid_rank() == 0) {
sum = 0;
for (uint i = 1; i <= cooperative_groups::this_multi_grid().num_grids(); ++i) {
sum += result[i];
}
*result = sum;
}
}
int main() {
float *A, *B;
uint* dA[MaxGPUs];
long* dB[MaxGPUs];
long* dC;
hipModule_t Module;
hipStream_t stream[MaxGPUs];
uint32_t* init = new uint32_t[BufferSizeInDwords];
for (uint32_t i = 0; i < BufferSizeInDwords; ++i) {
init[i] = i;
}
int nGpu = 0;
HIPCHECK(hipGetDeviceCount(&nGpu));
size_t copySizeInDwords = BufferSizeInDwords / nGpu;
hipDeviceProp_t deviceProp[MaxGPUs];
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
// Calculate the device occupancy to know how many blocks can be run concurrently
hipGetDeviceProperties(&deviceProp[i], 0);
if (!deviceProp[i].cooperativeMultiDeviceLaunch) {
printf("Device doesn't support cooperative launch!");
passed();
return 0;
}
size_t SIZE = copySizeInDwords * sizeof(uint);
HIPCHECK(hipMalloc((void**)&dA[i], SIZE));
if (i == 0) {
HIPCHECK(hipHostMalloc((void**)&dC, (nGpu + 1) * sizeof(long), hipHostMallocCoherent));
}
HIPCHECK(hipMemcpy(dA[i], &init[i * copySizeInDwords] , SIZE, hipMemcpyHostToDevice));
HIPCHECK(hipStreamCreate(&stream[i]));
}
dim3 dimBlock;
dim3 dimGrid;
dimGrid.x = 1;
dimGrid.y = 1;
dimGrid.z = 1;
dimBlock.x = 64;
dimBlock.y = 1;
dimBlock.z = 1;
int numBlocks = 0;
uint workgroups[3] = {64, 128, 256};
hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
system_clock::time_point start = system_clock::now();
for (uint set = 0; set < 3; ++set) {
void* args[MaxGPUs * NumKernelArgs];
std::cout << "---------- Test#" << set << "---------------\n";
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
dimBlock.x = workgroups[set];
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
(hipFunction_t)test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long)));
std::cout << "GPU(" << i << ") Block size: " << dimBlock.x << " Num blocks per CU: " << numBlocks << "\n";
dimGrid.x = deviceProp[i].multiProcessorCount * std::min(numBlocks, 32);
HIPCHECK(hipMalloc((void**)&dB[i], dimGrid.x * sizeof(long)));
args[i * NumKernelArgs] = (void*)&dA[i];
args[i * NumKernelArgs + 1] = (void*)&copySizeInDwords;
args[i * NumKernelArgs + 2] = (void*)&dB[i];
args[i * NumKernelArgs + 3] = (void*)&dC;
launchParamsList[i].func = reinterpret_cast<void*>(test_gws);
launchParamsList[i].gridDim = dimGrid;
launchParamsList[i].blockDim = dimBlock;
launchParamsList[i].sharedMem = dimBlock.x * sizeof(long);
launchParamsList[i].stream = stream[i];
launchParamsList[i].args = &args[i * NumKernelArgs];
}
hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0);
HIPCHECK(hipMemcpy(init, dC, sizeof(long), hipMemcpyDeviceToHost));
if (*dC != (((long)(BufferSizeInDwords) * (BufferSizeInDwords - 1)) / 2)) {
std::cout << "Data validation failed for grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n";
std::cout << "Test failed! \n";
}
for (int i = 0; i < nGpu; i++) {
hipFree(dB[i]);
}
}
system_clock::time_point end = system_clock::now();
delete [] launchParamsList;
std::chrono::duration<double> elapsed_seconds = end - start;
std::time_t end_time = std::chrono::system_clock::to_time_t(end);
std::cout << "finished computation at " << std::ctime(&end_time) <<
"elapsed time: " << elapsed_seconds.count() << "s\n";
hipSetDevice(0);
hipFree(dC);
for (int i = 0; i < nGpu; i++) {
hipFree(dA[i]);
HIPCHECK(hipStreamDestroy(stream[i]));
}
delete [] init;
passed();
return 0;
}
@@ -116,7 +116,7 @@ int main() {
dimBlock.x = workgroups[i];
// Calculate the device occupancy to know how many blocks can be run concurrently
hipOccupancyMaxActiveBlocksPerMultiprocessor(reinterpret_cast<uint32_t*>(&numBlocks),
hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long));
dimGrid.x = deviceProp.multiProcessorCount * std::min(numBlocks, 32);
@@ -30,10 +30,6 @@ THE SOFTWARE.
#include "hip/hip_runtime.h"
#include "test_common.h"
#define fileName "vcpy_kernel.code"
#define kernel_name "hello_world"
__global__ void f1(float *a) { *a = 1.0; }
template <typename T>
@@ -49,11 +45,10 @@ int main(int argc, char* argv[]) {
hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0);
assert(gridSize != 0 && blockSize != 0);
uint32_t numBlock = 0;
hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, blockSize, 0);
int numBlock = 0;
hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, (int)blockSize, 0);
assert(numBlock != 0);
// test case for using kernel function pointer with template
gridSize = 0;
blockSize = 0;
@@ -61,17 +56,7 @@ int main(int argc, char* argv[]) {
assert(gridSize != 0 && blockSize != 0);
numBlock = 0;
hipOccupancyMaxActiveBlocksPerMultiprocessor<void(*)(int *)>(&numBlock, f2, blockSize, 0);
assert(numBlock != 0);
// test case for using kernel with hipFunction_t type
numBlock = 0;
hipModule_t Module;
hipFunction_t Function;
HIPCHECK(hipModuleLoad(&Module, fileName));
HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, Function, blockSize, 0));
hipOccupancyMaxActiveBlocksPerMultiprocessor<void(*)(int *)>(&numBlock, f2, (int)blockSize, 0);
assert(numBlock != 0);
passed();
@@ -33,7 +33,6 @@ THE SOFTWARE.
#define fileName "vcpy_kernel.code"
#define kernel_name "hello_world"
__global__ void f1(float *a) { *a = 1.0; }
template <typename T>
+1 -2
View File
@@ -76,8 +76,7 @@ void texture2Dtest()
texDescr.readMode = hipReadModeElementType;
hipTextureObject_t texObj;
hipResourceViewDesc resDesc;
HIPCHECK( hipCreateTextureObject(&texObj, &texRes, &texDescr, &resDesc));
HIPCHECK( hipCreateTextureObject(&texObj, &texRes, &texDescr, NULL));
HIPCHECK(hipMalloc((void**)&devPtrB, SIZE_W*sizeof(TYPE_t)*SIZE_H)) ;