diff --git a/CMakeLists.txt b/CMakeLists.txt index 63a28813d2..75a05afc56 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,7 @@ endmacro() # Setup version information ############################# # Determine HIP_BASE_VERSION +set(ENV{HIP_PATH} "") execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bin/hipconfig --version OUTPUT_VARIABLE HIP_BASE_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) diff --git a/Jenkinsfile b/Jenkinsfile index 97811bf5d9..b8bd24cd74 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -394,7 +394,7 @@ cuda_10_x: // Block of string constants customizing behavior for cuda String nvcc_ver = 'cuda-10.x' String from_image = 'ci_test_nodes/cuda-10.x/ubuntu-16.04:latest' - String inside_args = '--runtime=nvidia'; + String inside_args = '--gpus all'; // Checkout source code, dependencies and version files String source_hip_rel = checkout_and_version( nvcc_ver ) diff --git a/README.md b/README.md index 145f950fef..2bffd12162 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Key features include: * HIP is very thin and has little or no performance impact over coding directly in CUDA or hcc "HC" mode. * HIP allows coding in a single-source C++ programming language including features such as templates, C++11 lambdas, classes, namespaces, and more. * HIP allows developers to use the "best" development environment and tools on each target platform. -* The "hipify" tool automatically converts source from CUDA to HIP. +* The [HIPIFY](hipify-clang/README.md) tools automatically convert source from CUDA to HIP. * Developers can specialize for the platform (CUDA or hcc) to tune for performance or handle tricky cases New projects can be developed directly in the portable HIP C++ language and can run on either NVIDIA or AMD platforms. Additionally, HIP provides porting tools which make it easy to port existing CUDA codes to the HIP layer, with no loss of performance as compared to the original CUDA application. HIP is not intended to be a drop-in replacement for CUDA, and developers should expect to do some manual coding and performance tuning work to complete the port. @@ -36,7 +36,7 @@ HIP releases are typically of two types. The tag naming convention is different - [HIP Profiling ](docs/markdown/hip_profiling.md) - [HIP Debugging](docs/markdown/hip_debugging.md) - [HIP Terminology](docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenCL) -- [hipify-clang](hipify-clang/README.md) +- [HIPIFY](hipify-clang/README.md) - Supported CUDA APIs: * [Runtime API](docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md) * [Driver API](docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md) @@ -71,7 +71,7 @@ hipLaunchKernel(vector_square, /* compute kernel*/ dim3(blocks), dim3(threadsPerBlock), 0/*dynamic shared*/, 0/*stream*/, /* launch config*/ C_d, A_d, N); /* arguments to the compute kernel */ -hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost); +hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost); ``` @@ -88,7 +88,7 @@ __global__ void vector_square(T *C_d, const T *A_d, size_t N) { size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * hipGridDim_x ; + size_t stride = hipBlockDim_x * hipGridDim_x; for (size_t i=offset; i -- [Supported CUDA APIs](#cuda-apis) -- [Dependencies](#dependencies) -- [Build and install](#build-and-install) +- [hipify-clang](#clang) + * [Dependencies](#dependencies) + * [Usage](#hipify-clang-usage) * [Building](#building) * [Testing](#testing) * [Linux](#linux) * [Windows](#windows) -- [Running and using hipify-clang](#running-and-using-hipify-clang) - * [hipify-perl](#perl) +- [hipify-perl](#perl) + * [Usage](#hipify-perl-usage) + * [Building](#hipify-perl-building) +- [Supported CUDA APIs](#cuda-apis) - [Disclaimer](#disclaimer) -## Supported CUDA APIs +## hipify-clang -- [Runtime API](../docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md) -- [Driver API](../docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md) -- [cuComplex API](../docs/markdown/cuComplex_API_supported_by_HIP.md) -- [cuBLAS](../docs/markdown/CUBLAS_API_supported_by_HIP.md) -- [cuRAND](../docs/markdown/CURAND_API_supported_by_HIP.md) -- [cuDNN](../docs/markdown/CUDNN_API_supported_by_HIP.md) -- [cuFFT](../docs/markdown/CUFFT_API_supported_by_HIP.md) -- [cuSPARSE](../docs/markdown/CUSPARSE_API_supported_by_HIP.md) +`hipify-clang` is a clang-based tool for translation CUDA sources into HIP sources. +It translates CUDA source into an abstract syntax tree, which is being traversed by transformation matchers. +After applying all the matchers, the output HIP source is produced. -## Dependencies +**Advantages:** + +1. It is a translator; thus, any even very complicated constructs will be parsed successfully, or an error will be reported. +2. It supports clang options like [`-I`](https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-i-dir), [`-D`](https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-d-macro), [`--cuda-path`](https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-cuda-path), etc. +3. Seamless support of new CUDA versions as it is clang's responsibility. +4. Ease in support. + +**Disadvantages:** + +1. The main advantage is also the main disadvantage: the input CUDA code should be correct; incorrect code wouldn't be translated to HIP. +2. CUDA should be installed and provided in case of multiple installations by `--cuda-path` option. +3. All the includes and defines should be provided to transform code successfully. + +### hipify-clang: dependencies `hipify-clang` requires: -1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1). +1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1), the latest release candidate: [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1). 2. [**CUDA**](https://developer.nvidia.com/cuda-downloads) of at least version [7.0](https://developer.nvidia.com/cuda-toolkit-70), the latest supported version is [**10.1 Update 2**](https://developer.nvidia.com/cuda-10.1-download-archive-base). +To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-downloads) please use the latest `LLVM` release candidate: [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1). | **LLVM release version** | **CUDA latest supported version** | **Windows** | **Linux** | |:----------------------------------------------------------:|:------------------------------------------------------------------------:|:-----------:|:---------:| @@ -58,21 +67,41 @@ | [8.0.1](http://releases.llvm.org/download.html#8.0.1) | [10.0](https://developer.nvidia.com/cuda-10.0-download-archive) | -
not working due to
the clang's bug [38811](https://bugs.llvm.org/show_bug.cgi?id=38811)
+
[patch](patches/patch_for_clang_8.0.1_bug_38811.zip)*
| + | | [9.0.0](http://releases.llvm.org/download.html#9.0.0) | [10.1](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + | + | | [**9.0.1**](http://releases.llvm.org/download.html#9.0.1) | [**10.1**](https://developer.nvidia.com/cuda-10.1-download-archive-base) | +
**LATEST STABLE RELEASE** | +
**LATEST STABLE RELEASE** | +| [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1) | [10.2](https://developer.nvidia.com/cuda-downloads) | + | + | -`*` Download the patch and unpack it into your LLVM distributive directory; a few header files will be overwritten; rebuilding of LLVM is not needed. +`*` Download the patch and unpack it into your `LLVM` distributive directory; a few header files will be overwritten; rebuilding of `LLVM` is not needed. -In most cases, you can get a suitable version of LLVM+CLANG with your package manager. +In most cases, you can get a suitable version of `LLVM+CLANG` with your package manager. -Failing that or having multiple versions of LLVM, you can [download a release archive](http://releases.llvm.org/), build or install it, and set +Failing that or having multiple versions of `LLVM`, you can [download a release archive](http://releases.llvm.org/), build or install it, and set [CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.1\dist` -## Build and install +### hipify-clang: usage -### Build +To process a file, `hipify-clang` needs access to the same headers that would be required to compile it with clang. + +For example: + +```shell +./hipify-clang square.cu --cuda-path=/usr/local/cuda-10.1 -I /usr/local/cuda-10.1/samples/common/inc +``` + +`hipify-clang` arguments are given first, followed by a separator `'--'`, and then the arguments you'd pass to `clang` if you +were compiling the input file. For example: + +```bash +./hipify-clang cpp17.cu --cuda-path=/usr/local/cuda-10.1 -- -std=c++17 +``` + +The [Clang manual for compiling CUDA](https://llvm.org/docs/CompileCudaWithLLVM.html#compiling-cuda-code) may be useful. + +For a list of `hipify-clang` options, run `hipify-clang --help`. + +### hipify-clang: building Assuming this repository is at `./HIP`: -```shell +```bash cd hipify-clang mkdir build dist cd build @@ -84,30 +113,27 @@ cmake \ make -j install ``` -On Windows, the following option should be specified for `cmake` at first place: `-G "Visual Studio 16 2019 Win64"`; the generated `hipify-clang.sln` should be built by `Visual Studio 15 2017` instead of `make.` +On Windows, the following option should be specified for `cmake` at first place: `-G "Visual Studio 16 2019 Win64"`; the generated `hipify-clang.sln` should be built by `Visual Studio 16 2019` instead of `make.` +Please, see [hipify-clang: Windows](#windows) for the supported tools for building. Debug build type `-DCMAKE_BUILD_TYPE=Debug` is also supported and tested; `LLVM+CLANG` should be built in `Debug` mode as well. 64-bit build mode (`-Thost=x64` on Windows) is also supported; `LLVM+CLANG` should be built in 64-bit mode as well. The binary can then be found at `./dist/bin/hipify-clang`. -### Testing +### hipify-clang: testing -`hipify-clang` has unit tests using LLVM [`lit`](https://llvm.org/docs/CommandGuide/lit.html)/[`FileCheck`](https://llvm.org/docs/CommandGuide/FileCheck.html). +`hipify-clang` has unit tests using `LLVM` [`lit`](https://llvm.org/docs/CommandGuide/lit.html)/[`FileCheck`](https://llvm.org/docs/CommandGuide/FileCheck.html). -**LLVM+CLANG should be built from sources, pre-built binaries are not exhaustive for testing.** +`LLVM+CLANG` should be built from sources, pre-built binaries are not exhaustive for testing. + +**LLVM 9.0.1 or older:** -To run it: 1. download [`LLVM`](http://releases.llvm.org/9.0.1/llvm-9.0.1.src.tar.xz)+[`CLANG`](http://releases.llvm.org/9.0.1/cfe-9.0.1.src.tar.xz) sources; -2. build [`LLVM+CLANG`](http://llvm.org/docs/CMake.html): - ```shell - cd llvm - mkdir build dist - cd build - ``` +2. build [`LLVM+CLANG`](http://releases.llvm.org/9.0.0/docs/CMake.html): - - **Linux**: - ```shell + **Linux**: + ```bash cmake \ -DCMAKE_INSTALL_PREFIX=../dist \ -DLLVM_SOURCE_DIR=../llvm \ @@ -116,8 +142,7 @@ To run it: ../llvm make -j install ``` - - - **Windows**: + **Windows**: ```shell cmake \ -G "Visual Studio 16 2019" \ @@ -129,9 +154,38 @@ To run it: -Thost=x64 \ ../llvm ``` +Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build project `INSTALL`. -                Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build project `INSTALL`. +**LLVM 10.0.0 or newer:** +1. download [`LLVM project`](https://github.com/llvm/llvm-project/archive/llvmorg-10.0.0-rc1.tar.gz) sources; +2. build [`LLVM project`](http://llvm.org/docs/CMake.html): + + **Linux**: + ```bash + cmake \ + -DCMAKE_INSTALL_PREFIX=../dist \ + -DLLVM_SOURCE_DIR=../llvm-project \ + -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \ + -DLLVM_ENABLE_PROJECTS="clang" \ + -DCMAKE_BUILD_TYPE=Release \ + ../llvm-project/llvm + make -j install + ``` + **Windows**: + ```shell + cmake \ + -G "Visual Studio 16 2019" \ + -A x64 \ + -DCMAKE_INSTALL_PREFIX=../dist \ + -DLLVM_SOURCE_DIR=../llvm-project \ + -DLLVM_TARGETS_TO_BUILD="NVPTX" \ + -DLLVM_ENABLE_PROJECTS="clang" \ + -DCMAKE_BUILD_TYPE=Release \ + -Thost=x64 \ + ../llvm-project/llvm + ``` +Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build project `INSTALL`. 3. Ensure [`CUDA`](https://developer.nvidia.com/cuda-toolkit-archive) of minimum version 7.0 is installed. @@ -161,7 +215,7 @@ To run it: 5. Ensure [`python`](https://www.python.org/downloads) of minimum required version 2.7 is installed. -6. Ensure `lit` and `FileCheck` are installed - these are distributed with LLVM. +6. Ensure `lit` and `FileCheck` are installed - these are distributed with `LLVM`. * Install `lit` into `python`: @@ -185,23 +239,15 @@ To run it: 7. Set `HIPIFY_CLANG_TESTS` option turned on: `-DHIPIFY_CLANG_TESTS=1`. -8. Run `cmake`: - * [***Linux***](#linux) - * [***Windows***](#windows) +8. Build and run tests: -9. Run tests: - - - ***Linux***: `make test-hipify`. - - - ***Windows***: run `Visual Studio 16 2019`, open the generated `hipify-clang.sln`, build project `test-hipify`. - -### Linux +### hipify-clang: Linux On Linux the following configurations are tested: Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.5.32 -Ubuntu 16-18: LLVM 8.0.0 - 9.0.1, CUDA 8.0 - 10.1 Update 2, cudnn-5.1.10 - cudnn-7.6.5.32 +Ubuntu 16-18: LLVM 8.0.0 - 10.0.0-rc1, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32 Minimum build system requirements for the above configurations: @@ -209,7 +255,7 @@ Python 2.7, cmake 3.5.1, GNU C/C++ 5.4.0. Here is an example of building `hipify-clang` with testing support on `Ubuntu 16.04`: -```shell +```bash cmake -DHIPIFY_CLANG_TESTS=1 \ -DCMAKE_BUILD_TYPE=Release \ @@ -345,7 +391,7 @@ Testing Time: 3.07s Expected Passes : 67 [100%] Built target test-hipify ``` -### Windows +### hipify-clang: Windows On Windows 10 the following configurations are tested: @@ -353,11 +399,11 @@ LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn 5.1.10 - 7.1.4.18 LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn 7.0.5.15 - 7.6.5.32 -LLVM 7.0.0 - 9.0.1, CUDA 7.5 - 10.1 Update 2, cudnn 7.0.5.15 - 7.6.5.32 +LLVM 7.0.0 - 10.0.0-rc1, CUDA 7.5 - 10.2, cudnn 7.0.5.15 - 7.6.5.32 -Build system requirements for the latest configuration LLVM 9.0.1/CUDA 10.1 Update 2: +Build system requirements for the latest stable configuration LLVM 9.0.1/CUDA 10.1 Update 2: -Python 3.6.0 - 3.8.1, cmake 3.5.1 - 3.16.2, Visual Studio 2017 (15.5.2) - 2019 (16.4.2). +Python 3.6.0 - 3.8.1, cmake 3.5.1 - 3.16.3, Visual Studio 2017 (15.5.2) - 2019 (16.4.4). Here is an example of building `hipify-clang` with testing support on `Windows 10` by `Visual Studio 16 2019`: @@ -392,45 +438,62 @@ cmake -- Build files have been written to: f:/HIP/hipify-clang/build ``` -## Running and using hipify-clang +Run `Visual Studio 16 2019`, open the generated `hipify-clang.sln`, build project `test-hipify`. -To process a file, `hipify-clang` needs access to the same headers that would be needed to compile it with clang. +## hipify-perl -For example: +`hipify-perl` is autogenerated perl-based script which heavily uses regular expressions. + +**Advantages:** + +1. Ease in use. + +2. It doesn't check the input source CUDA code for correctness. + +3. It doesn't have dependencies on 3rd party tools, including CUDA. + +**Disadvantages:** + +1. Current disability (and difficulty in implementing) of transforming the following constructs: + + * macros expansion; + + * namespaces: + + - redefines of CUDA entities in user namespaces; + + - using directive; + + * templates (some cases); + + * device/host function calls distinguishing; + + * header files correct injection; + + * complicated argument lists parsing. + +2. Difficulties in supporting. + +### hipify-perl: usage ```shell -./hipify-clang square.cu --cuda-path=/usr/local/cuda-10.1 -I /usr/local/cuda-10.1/samples/common/inc +perl hipify-perl square.cu > square.cu.hip ``` -`hipify-clang` arguments are given first, followed by a separator, and then the arguments you'd pass to `clang` if you -were compiling the input file. The [Clang manual for compiling CUDA](https://llvm.org/docs/CompileCudaWithLLVM.html#compiling-cuda-code) -may be useful. +### hipify-perl: building -For a list of `hipify-clang` options, run `hipify-clang --help`. +To generate `hipify-perl`, run `hipify-clang --perl`. Output directory for the generated `hipify-perl` file might be specified by `--o-hipify-perl-dir` option. -### hipify-perl +## Supported CUDA APIs -To produce a Perl-based script `hipify-perl`, run `hipify-clang --perl`. - -The `hipify-perl` script, unlike the `hipify-clang`, being based on regular expressions, and not on an abstract syntax tree, has several gaps: - -1. macros expansion; - -2. namespaces: - - - redefines of CUDA entities in user namespaces; - - - using directive; - -3. templates (some cases); - -4. device/host function calls distinguishing; - -5. header files correct injection; - -6. complicated argument lists parsing. - -Nonetheless, `hipify-perl` is easy in use and doesn't check the input source CUDA code for correctness. +- [Runtime API](../docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md) +- [Driver API](../docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md) +- [cuComplex API](../docs/markdown/cuComplex_API_supported_by_HIP.md) +- [cuBLAS](../docs/markdown/CUBLAS_API_supported_by_HIP.md) +- [cuRAND](../docs/markdown/CURAND_API_supported_by_HIP.md) +- [cuDNN](../docs/markdown/CUDNN_API_supported_by_HIP.md) +- [cuFFT](../docs/markdown/CUFFT_API_supported_by_HIP.md) +- [cuSPARSE](../docs/markdown/CUSPARSE_API_supported_by_HIP.md) ## Disclaimer @@ -438,5 +501,4 @@ The information contained herein is for informational purposes only, and is subj AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies. -Copyright (c) 2014-2019 Advanced Micro Devices, Inc. All rights reserved. - +Copyright (c) 2014-2020 Advanced Micro Devices, Inc. All rights reserved. diff --git a/hipify-clang/packaging/hipify-clang.txt b/hipify-clang/packaging/hipify-clang.txt index 5f78e7e67e..b189eff1e6 100644 --- a/hipify-clang/packaging/hipify-clang.txt +++ b/hipify-clang/packaging/hipify-clang.txt @@ -48,7 +48,7 @@ endif() set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}) set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") -set(CPACK_DEBIAN_PACKAGE_DEPENDS "cuda >= 7.0") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "cuda (>= 7.0)") set(CPACK_BINARY_RPM "ON") set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_PACKAGE_AUTOREQPROV "NO") diff --git a/hipify-clang/src/HipifyAction.cpp b/hipify-clang/src/HipifyAction.cpp index dee9a25d49..75138c47ab 100644 --- a/hipify-clang/src/HipifyAction.cpp +++ b/hipify-clang/src/HipifyAction.cpp @@ -213,7 +213,7 @@ StringRef readSourceText(clang::SourceManager &SM, const clang::SourceRange &exp */ std::string stringifyZeroDefaultedArg(clang::SourceManager &SM, const clang::Expr *arg) { if (clang::isa(arg)) return "0"; - else return readSourceText(SM, arg->getSourceRange()); + else return std::string(readSourceText(SM, arg->getSourceRange())); } } // anonymous namespace @@ -427,7 +427,7 @@ bool HipifyAction::cudaSharedIncompleteArrayVar(const mat::MatchFinder::MatchRes clang::LangOptions LO; LO.CUDA = true; clang::PrintingPolicy policy(LO); - typeName = BT->getName(policy); + typeName = std::string(BT->getName(policy)); } } else { typeName = QT.getAsString(); diff --git a/hipify-clang/src/LLVMCompat.cpp b/hipify-clang/src/LLVMCompat.cpp index 604841bcd9..f6d74121e4 100644 --- a/hipify-clang/src/LLVMCompat.cpp +++ b/hipify-clang/src/LLVMCompat.cpp @@ -44,7 +44,7 @@ ct::Replacements &getReplacements(ct::RefactoringTool &Tool, StringRef file) { #if LLVM_VERSION_MAJOR > 3 // getReplacements() now returns a map from filename to Replacements - so create an entry // for this source file and return a reference to it. - return Tool.getReplacements()[file]; + return Tool.getReplacements()[std::string(file)]; #else return Tool.getReplacements(); #endif diff --git a/hipify-clang/src/main.cpp b/hipify-clang/src/main.cpp index c75c24b066..cb411eba2f 100644 --- a/hipify-clang/src/main.cpp +++ b/hipify-clang/src/main.cpp @@ -108,7 +108,7 @@ void sortInputFiles(int argc, const char **argv, std::vector &files files.assign(sortedFiles.begin(), sortedFiles.end()); } -void appendArgumentsAdjusters(ct::RefactoringTool &Tool, const std::string &sSourceAbsPath) { +void appendArgumentsAdjusters(ct::RefactoringTool &Tool, const std::string &sSourceAbsPath, const char *hipify_exe) { if (!IncludeDirs.empty()) { for (std::string s : IncludeDirs) { Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(s.c_str(), ct::ArgumentInsertPosition::BEGIN)); @@ -122,9 +122,14 @@ void appendArgumentsAdjusters(ct::RefactoringTool &Tool, const std::string &sSou } } // Includes for clang's CUDA wrappers for using by packaged hipify-clang - Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("./include", ct::ArgumentInsertPosition::BEGIN)); + static int Dummy; + std::string hipify = llvm::sys::fs::getMainExecutable(hipify_exe, (void *)&Dummy); + std::string clang_inc_path = std::string(llvm::sys::path::parent_path(hipify)); + clang_inc_path.append("/include"); + Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(clang_inc_path.c_str(), ct::ArgumentInsertPosition::BEGIN)); Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-isystem", ct::ArgumentInsertPosition::BEGIN)); - Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("./include/cuda_wrappers", ct::ArgumentInsertPosition::BEGIN)); + clang_inc_path.append("/cuda_wrappers"); + Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(clang_inc_path.c_str(), ct::ArgumentInsertPosition::BEGIN)); Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-isystem", ct::ArgumentInsertPosition::BEGIN)); // Ensure at least c++11 is used. std::string stdCpp = "-std=c++11"; @@ -315,7 +320,7 @@ int main(int argc, const char **argv) { ct::RefactoringTool Tool(OptionsParser.getCompilations(), std::string(tmpFile.c_str())); ct::Replacements &replacementsToUse = llcompat::getReplacements(Tool, tmpFile.c_str()); ReplacementsFrontendActionFactory actionFactory(&replacementsToUse); - appendArgumentsAdjusters(Tool, sSourceAbsPath); + appendArgumentsAdjusters(Tool, sSourceAbsPath, argv[0]); Statistics ¤tStat = Statistics::current(); // Hipify _all_ the things! if (Tool.runAndSave(&actionFactory)) { diff --git a/include/hip/hcc_detail/driver_types.h b/include/hip/hcc_detail/driver_types.h index 510d3d058e..0c29542c7e 100644 --- a/include/hip/hcc_detail/driver_types.h +++ b/include/hip/hcc_detail/driver_types.h @@ -43,9 +43,10 @@ typedef struct hipChannelFormatDesc { enum hipChannelFormatKind f; }hipChannelFormatDesc; -#define HIP_TRSF_NORMALIZED_COORDINATES 0x01 -#define HIP_TRSF_READ_AS_INTEGER 0x00 #define HIP_TRSA_OVERRIDE_FORMAT 0x01 +#define HIP_TRSF_READ_AS_INTEGER 0x01 +#define HIP_TRSF_NORMALIZED_COORDINATES 0x02 +#define HIP_TRSF_SRGB 0x10 typedef enum hipArray_Format { HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01, diff --git a/include/hip/hcc_detail/functional_grid_launch.hpp b/include/hip/hcc_detail/functional_grid_launch.hpp index 76a04fa355..9eb738cf04 100644 --- a/include/hip/hcc_detail/functional_grid_launch.hpp +++ b/include/hip/hcc_detail/functional_grid_launch.hpp @@ -154,20 +154,6 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block dynSharedMemPerBlk, blockSizeLimit); } -template -inline -hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(uint32_t* numBlocks, F kernel, - uint32_t blockSize, size_t dynSharedMemPerBlk) { - - using namespace hip_impl; - - hip_impl::hip_init(); - auto f = get_program_state().kernel_descriptor(reinterpret_cast(kernel), - target_agent(0)); - - return hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynSharedMemPerBlk); -} - template inline void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks, diff --git a/include/hip/hcc_detail/hip_cooperative_groups_helper.h b/include/hip/hcc_detail/hip_cooperative_groups_helper.h index b74d16d23b..9738448d94 100644 --- a/include/hip/hcc_detail/hip_cooperative_groups_helper.h +++ b/include/hip/hcc_detail/hip_cooperative_groups_helper.h @@ -106,7 +106,7 @@ __CG_STATIC_QUALIFIER__ uint32_t size() { } __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { - // Compute global id of the workgroup to which the current threads belongs to + // Compute global id of the workgroup to which the current thread belongs to uint32_t blkIdx = (uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x) + @@ -115,7 +115,7 @@ __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { // Compute total number of threads being passed to reach current workgroup // within grid uint32_t num_threads_till_current_workgroup = - (uint32_t)(blkIdx * (hipBlockIdx_x * hipBlockIdx_y * hipBlockIdx_z)); + (uint32_t)(blkIdx * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z)); // Compute thread local rank within current workgroup uint32_t local_thread_rank = diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index 0712db17f9..659a6c3c3a 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -2948,7 +2948,7 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block */ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk); + int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk); /** * @brief Returns occupancy for a device function. @@ -2960,7 +2960,7 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( * @param [in] flags Extra flags for occupancy calculation (currently ignored) */ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags); + int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags); #if __HIP_VDI__ && !defined(__HCC__) /** @@ -3230,21 +3230,6 @@ hipError_t hipLaunchKernel(const void* function_address, } /* extern "c" */ #endif -#if defined(__cplusplus) && !defined(__HCC__) && defined(__clang__) && defined(__HIP__) -template -static hipError_t __host__ inline hipOccupancyMaxActiveBlocksPerMultiprocessor( - uint32_t* numBlocks, F func, uint32_t blockSize, size_t dynSharedMemPerBlk) { - return ::hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, (hipFunction_t)func, blockSize, - dynSharedMemPerBlk); -} -template -static hipError_t __host__ inline hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - uint32_t* numBlocks, F func, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags) { - return ::hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - numBlocks, (hipFunction_t)func, blockSize, dynSharedMemPerBlk, flags); -} -#endif // defined(__cplusplus) && !defined(__HCC__) && defined(__clang__) && defined(__HIP__) - #if USE_PROF_API #include #endif @@ -3385,6 +3370,20 @@ hipError_t hipBindTextureToMipmappedArray(const texture& tex, return hipSuccess; } +template +inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( + int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk) { + return hipOccupancyMaxActiveBlocksPerMultiprocessor( + numBlocks, reinterpret_cast(f), blockSize, dynSharedMemPerBlk); +} + +template +inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags) { + return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + numBlocks, reinterpret_cast(f), blockSize, dynSharedMemPerBlk, flags); +} + #if __HIP_VDI__ && !defined(__HCC__) template inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim, diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h index e725846cbd..64b2a85d8a 100644 --- a/include/hip/hip_runtime_api.h +++ b/include/hip/hip_runtime_api.h @@ -124,6 +124,7 @@ typedef struct hipDeviceProp_t { unsigned int* hdpRegFlushCntl; ///< Addres of HDP_REG_COHERENCY_FLUSH_CNTL register size_t memPitch; ///memPitch = cdprop.memPitch; p_prop->textureAlignment = cdprop.textureAlignment; + p_prop->texturePitchAlignment = cdprop.texturePitchAlignment; p_prop->kernelExecTimeoutEnabled = cdprop.kernelExecTimeoutEnabled; p_prop->ECCEnabled = cdprop.ECCEnabled; p_prop->tccDriver = cdprop.tccDriver; @@ -1244,6 +1245,9 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att case hipDeviceAttributeTextureAlignment: cdattr = cudaDevAttrTextureAlignment; break; + case hipDeviceAttributeTexturePitchAlignment: + cdattr = cudaDevAttrTexturePitchAlignment; + break; case hipDeviceAttributeKernelExecTimeout: cdattr = cudaDevAttrKernelExecTimeout; break; diff --git a/packaging/hip-nvcc.postinst b/packaging/hip-nvcc.postinst index 2f901324cb..b70cf2848f 100755 --- a/packaging/hip-nvcc.postinst +++ b/packaging/hip-nvcc.postinst @@ -3,6 +3,6 @@ ROCMDIR=@ROCM_PATH@ HIPDIR=$ROCMDIR/hip -if [ -d $ROCMDIR] +if [ -d $ROCMDIR] ; then ln -s -f $ROCMDIR /opt/rocm fi diff --git a/packaging/hip-nvcc.prerm b/packaging/hip-nvcc.prerm index 96875e4a9c..baa0e6f5c7 100755 --- a/packaging/hip-nvcc.prerm +++ b/packaging/hip-nvcc.prerm @@ -1,5 +1,5 @@ #!/bin/bash -if [ -L "/opt/rocm" ] +if [ -L "/opt/rocm" ] ; then unlink /opt/rocm fi diff --git a/samples/1_Utils/hipDispatchLatency/Makefile b/samples/1_Utils/hipDispatchLatency/Makefile index 3b69c4a335..0616f01f0d 100644 --- a/samples/1_Utils/hipDispatchLatency/Makefile +++ b/samples/1_Utils/hipDispatchLatency/Makefile @@ -2,19 +2,18 @@ HIP_PATH?= $(wildcard /opt/rocm/hip) ifeq (,$(HIP_PATH)) HIP_PATH=../../.. endif -HIPCC=$(HIP_PATH)/bin/hipcc +HIPCC=$(HIP_PATH)/bin/hipcc -std=c++11 EXE=hipDispatchLatency CXXFLAGS = -O3 -all: ${EXE} +all: test_kernel.code ${EXE} -$(EXE): hipDispatchLatency.cpp ResultDatabase.cpp - $(HIPCC) $(CXXFLAGS) hipDispatchLatency.cpp ResultDatabase.cpp -o $@ - -install: $(EXE) - cp $(EXE) $(HIP_PATH)/bin +$(EXE): hipDispatchLatency.cpp + $(HIPCC) $(CXXFLAGS) hipDispatchLatency.cpp -o $@ +test_kernel.code: test_kernel.cpp + $(HIP_PATH)/bin/hipcc --genco $(GENCO_FLAGS) $^ -o $@ clean: rm -f *.o $(EXE) diff --git a/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp b/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp deleted file mode 100644 index b769ca4b32..0000000000 --- a/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp +++ /dev/null @@ -1,473 +0,0 @@ -#include "ResultDatabase.h" - -#include -#include -#include -#include - -using namespace std; - -#define SORT_BY_NAME 0 -#define SORT_RETAIN_ATTS_ORDER 1 - - -bool ResultDatabase::Result::operator<(const Result& rhs) const { - if (test < rhs.test) return true; - if (test > rhs.test) return false; -#if (SORT_RETAIN_ATTS_ORDER == 0) - // For ties, sort by the value of the attribute: - if (atts < rhs.atts) return true; - if (atts > rhs.atts) return false; -#endif - return false; // less-operator returns false on equal -} - -double ResultDatabase::Result::GetMin() const { - double r = FLT_MAX; - for (int i = 0; i < value.size(); i++) { - r = min(r, value[i]); - } - return r; -} - -double ResultDatabase::Result::GetMax() const { - double r = -FLT_MAX; - for (int i = 0; i < value.size(); i++) { - r = max(r, value[i]); - } - return r; -} - -double ResultDatabase::Result::GetMedian() const { return GetPercentile(50); } - -double ResultDatabase::Result::GetPercentile(double q) const { - int n = value.size(); - if (n == 0) return FLT_MAX; - if (n == 1) return value[0]; - - if (q <= 0) return value[0]; - if (q >= 100) return value[n - 1]; - - double index = ((n + 1.) * q / 100.) - 1; - - vector sorted = value; - sort(sorted.begin(), sorted.end()); - - if (n == 2) return (sorted[0] * (1 - q / 100.) + sorted[1] * (q / 100.)); - - int index_lo = int(index); - double frac = index - index_lo; - if (frac == 0) return sorted[index_lo]; - - double lo = sorted[index_lo]; - double hi = sorted[index_lo + 1]; - return lo + (hi - lo) * frac; -} - -double ResultDatabase::Result::GetMean() const { - double r = 0; - for (int i = 0; i < value.size(); i++) { - r += value[i]; - } - return r / double(value.size()); -} - -double ResultDatabase::Result::GetStdDev() const { - double r = 0; - double u = GetMean(); - if (u == FLT_MAX) return FLT_MAX; - for (int i = 0; i < value.size(); i++) { - r += (value[i] - u) * (value[i] - u); - } - r = sqrt(r / value.size()); - return r; -} - - -void ResultDatabase::AddResults(const string& test, const string& atts, const string& unit, - const vector& values) { - for (int i = 0; i < values.size(); i++) { - AddResult(test, atts, unit, values[i]); - } -} - -static string RemoveAllButLeadingSpaces(const string& a) { - string b; - int n = a.length(); - int i = 0; - while (i < n && a[i] == ' ') { - b += a[i]; - ++i; - } - for (; i < n; i++) { - if (a[i] != ' ' && a[i] != '\t') b += a[i]; - } - return b; -} - -void ResultDatabase::AddResult(const string& test_orig, const string& atts_orig, - const string& unit_orig, double value) { - string test = RemoveAllButLeadingSpaces(test_orig); - string atts = RemoveAllButLeadingSpaces(atts_orig); - string unit = RemoveAllButLeadingSpaces(unit_orig); - int index; - for (index = 0; index < results.size(); index++) { - if (results[index].test == test && results[index].atts == atts) { - if (results[index].unit != unit) throw "Internal error: mixed units"; - - break; - } - } - - if (index >= results.size()) { - Result r; - r.test = test; - r.atts = atts; - r.unit = unit; - results.push_back(r); - } - - results[index].value.push_back(value); -} - -// **************************************************************************** -// Method: ResultDatabase::DumpDetailed -// -// Purpose: -// Writes the full results, including all trials. -// -// Arguments: -// out where to print -// -// Programmer: Jeremy Meredith -// Creation: August 14, 2009 -// -// Modifications: -// Jeremy Meredith, Wed Nov 10 14:25:17 EST 2010 -// Renamed to DumpDetailed to make room for a DumpSummary. -// -// Jeremy Meredith, Thu Nov 11 11:39:57 EST 2010 -// Added note about (*) missing value tag. -// -// Jeremy Meredith, Tue Nov 23 13:57:02 EST 2010 -// Changed note about missing values to be worded a little better. -// -// **************************************************************************** -void ResultDatabase::DumpDetailed(ostream& out) { - vector sorted(results); - -#if SORT_BY_NAME - stable_sort(sorted.begin(), sorted.end()); -#endif - - const int testNameW = 24; - const int attW = 12; - const int fieldW = 11; - out << std::fixed << right << std::setprecision(4); - - int maxtrials = 1; - for (int i = 0; i < sorted.size(); i++) { - if (sorted[i].value.size() > maxtrials) maxtrials = sorted[i].value.size(); - } - - // TODO: in big parallel runs, the "trials" are the procs - // and we really don't want to print them all out.... - out << setw(testNameW) << "test\t" << setw(attW) << "atts\t" << setw(fieldW) << "median\t" - << "mean\t" - << "stddev\t" - << "min\t" - << "max\t"; - for (int i = 0; i < maxtrials; i++) out << "trial" << i << "\t"; - out << endl; - - for (int i = 0; i < sorted.size(); i++) { - Result& r = sorted[i]; - out << setw(testNameW) << r.test + "\t"; - out << setw(attW) << r.atts + "\t"; - out << setw(fieldW) << r.unit + "\t"; - if (r.GetMedian() == FLT_MAX) - out << "N/A\t"; - else - out << r.GetMedian() << "\t"; - if (r.GetMean() == FLT_MAX) - out << "N/A\t"; - else - out << r.GetMean() << "\t"; - if (r.GetStdDev() == FLT_MAX) - out << "N/A\t"; - else - out << r.GetStdDev() << "\t"; - if (r.GetMin() == FLT_MAX) - out << "N/A\t"; - else - out << r.GetMin() << "\t"; - if (r.GetMax() == FLT_MAX) - out << "N/A\t"; - else - out << r.GetMax() << "\t"; - for (int j = 0; j < r.value.size(); j++) { - if (r.value[j] == FLT_MAX) - out << "N/A\t"; - else - out << r.value[j] << "\t"; - } - - out << endl; - } - if (0) { - out << endl - << "Note: Any results marked with (*) had missing values." << endl - << " This can occur on systems with a mixture of" << endl - << " device types or architectural capabilities." << endl; - } -} - - -// **************************************************************************** -// Method: ResultDatabase::DumpDetailed -// -// Purpose: -// Writes the summary results (min/max/stddev/med/mean), but not -// every individual trial. -// -// Arguments: -// out where to print -// -// Programmer: Jeremy Meredith -// Creation: November 10, 2010 -// -// Modifications: -// Jeremy Meredith, Thu Nov 11 11:39:57 EST 2010 -// Added note about (*) missing value tag. -// -// **************************************************************************** -void ResultDatabase::DumpSummary(ostream& out) { - vector sorted(results); - -#if SORT_BY_NAME - stable_sort(sorted.begin(), sorted.end()); -#endif - - const int testNameW = 32; - const int attW = 12; - const int fieldW = 9; - out << std::fixed << right << std::setprecision(2); - - // TODO: in big parallel runs, the "trials" are the procs - // and we really don't want to print them all out.... - out << setw(testNameW) << "test\t" << setw(attW) << "atts\t" << setw(fieldW) << "units\t" - << "median\t" - << "mean\t" - << "stddev\t" - << "min\t" - << "max\t"; - out << endl; - - for (int i = 0; i < sorted.size(); i++) { - Result& r = sorted[i]; - out << setw(testNameW) << r.test + "\t"; - out << setw(attW) << r.atts + "\t"; - out << setw(fieldW) << r.unit + "\t"; - if (r.GetMedian() == FLT_MAX) - out << "N/A\t"; - else - out << r.GetMedian() << "\t"; - if (r.GetMean() == FLT_MAX) - out << "N/A\t"; - else - out << r.GetMean() << "\t"; - if (r.GetStdDev() == FLT_MAX) - out << "N/A\t"; - else - out << r.GetStdDev() << "\t"; - if (r.GetMin() == FLT_MAX) - out << "N/A\t"; - else - out << r.GetMin() << "\t"; - if (r.GetMax() == FLT_MAX) - out << "N/A\t"; - else - out << r.GetMax() << "\t"; - - out << endl; - } - if (0) { - out << endl - << "Note: results marked with (*) had missing values such as" << endl - << "might occur with a mixture of architectural capabilities." << endl; - } -} - -// **************************************************************************** -// Method: ResultDatabase::ClearAllResults -// -// Purpose: -// Clears all existing results from the ResultDatabase; used for multiple passes -// of the same test or multiple tests. -// -// Arguments: -// -// Programmer: Jeffrey Young -// Creation: September 10th, 2014 -// -// Modifications: -// -// -// **************************************************************************** -void ResultDatabase::ClearAllResults() { results.clear(); } - -// **************************************************************************** -// Method: ResultDatabase::DumpCsv -// -// Purpose: -// Writes either detailed or summary results (min/max/stddev/med/mean), but not -// every individual trial. -// -// Arguments: -// out file to print CSV results -// -// Programmer: Jeffrey Young -// Creation: August 28th, 2014 -// -// Modifications: -// -// **************************************************************************** -void ResultDatabase::DumpCsv(string fileName) { - bool emptyFile; - vector sorted(results); - -#if SORT_BY_NAME - stable_sort(sorted.begin(), sorted.end()); -#endif - - // Check to see if the file is empty - if so, add the headers - emptyFile = this->IsFileEmpty(fileName); - - // Open file and append by default - ofstream out; - out.open(fileName.c_str(), std::ofstream::out | std::ofstream::app); - - // Add headers only for empty files - if (emptyFile) { - // TODO: in big parallel runs, the "trials" are the procs - // and we really don't want to print them all out.... - out << "test, " - << "atts, " - << "units, " - << "median, " - << "mean, " - << "stddev, " - << "min, " - << "max, "; - out << endl; - } - - for (int i = 0; i < sorted.size(); i++) { - Result& r = sorted[i]; - out << r.test << ", "; - out << r.atts << ", "; - out << r.unit << ", "; - if (r.GetMedian() == FLT_MAX) - out << "N/A, "; - else - out << r.GetMedian() << ", "; - if (r.GetMean() == FLT_MAX) - out << "N/A, "; - else - out << r.GetMean() << ", "; - if (r.GetStdDev() == FLT_MAX) - out << "N/A, "; - else - out << r.GetStdDev() << ", "; - if (r.GetMin() == FLT_MAX) - out << "N/A, "; - else - out << r.GetMin() << ", "; - if (r.GetMax() == FLT_MAX) - out << "N/A, "; - else - out << r.GetMax() << ", "; - - out << endl; - } - out << endl; - - out.close(); -} - -// **************************************************************************** -// Method: ResultDatabase::IsFileEmpty -// -// Purpose: -// Returns whether a file is empty - used as a helper for CSV printing -// -// Arguments: -// file The input file to check for emptiness -// -// Programmer: Jeffrey Young -// Creation: August 28th, 2014 -// -// Modifications: -// -// **************************************************************************** - -bool ResultDatabase::IsFileEmpty(string fileName) { - bool fileEmpty; - - ifstream file(fileName.c_str()); - - // If the file doesn't exist it is by definition empty - if (!file.good()) { - return true; - } else { - fileEmpty = (bool)(file.peek() == ifstream::traits_type::eof()); - file.close(); - - return fileEmpty; - } - - // Otherwise, return false - return false; -} - - -// **************************************************************************** -// Method: ResultDatabase::GetResultsForTest -// -// Purpose: -// Returns a vector of results for just one test name. -// -// Arguments: -// test the name of the test results to search for -// -// Programmer: Jeremy Meredith -// Creation: December 3, 2010 -// -// Modifications: -// -// **************************************************************************** -vector ResultDatabase::GetResultsForTest(const string& test) { - // get only the given test results - vector retval; - for (int i = 0; i < results.size(); i++) { - Result& r = results[i]; - if (r.test == test) retval.push_back(r); - } - return retval; -} - -// **************************************************************************** -// Method: ResultDatabase::GetResults -// -// Purpose: -// Returns all the results. -// -// Arguments: -// -// Programmer: Jeremy Meredith -// Creation: December 3, 2010 -// -// Modifications: -// -// **************************************************************************** -const vector& ResultDatabase::GetResults() const { return results; } diff --git a/samples/1_Utils/hipDispatchLatency/ResultDatabase.h b/samples/1_Utils/hipDispatchLatency/ResultDatabase.h deleted file mode 100644 index ca6a00fc91..0000000000 --- a/samples/1_Utils/hipDispatchLatency/ResultDatabase.h +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef RESULT_DATABASE_H -#define RESULT_DATABASE_H - -#include -#include -#include -#include -#include -using std::ifstream; -using std::ofstream; -using std::ostream; -using std::string; -using std::vector; - - -// **************************************************************************** -// Class: ResultDatabase -// -// Purpose: -// Track numerical results as they are generated. -// Print statistics of raw results. -// -// Programmer: Jeremy Meredith -// Creation: June 12, 2009 -// -// Modifications: -// Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010 -// Split timing reports into detailed and summary. E.g. for serial code, -// we might report all trial values, but skip them in parallel. -// -// Jeremy Meredith, Thu Nov 11 11:40:18 EST 2010 -// Added check for missing value tag. -// -// Jeremy Meredith, Mon Nov 22 13:37:10 EST 2010 -// Added percentile statistic. -// -// Jeremy Meredith, Fri Dec 3 16:30:31 EST 2010 -// Added a method to extract a subset of results based on test name. Also, -// the Result class is now public, so that clients can use them directly. -// Added a GetResults method as well, and made several functions const. -// -// **************************************************************************** -class ResultDatabase { - public: - // - // A performance result for a single SHOC benchmark run. - // - struct Result { - string test; // e.g. "readback" - string atts; // e.g. "pagelocked 4k^2" - string unit; // e.g. "MB/sec" - vector value; // e.g. "837.14" - double GetMin() const; - double GetMax() const; - double GetMedian() const; - double GetPercentile(double q) const; - double GetMean() const; - double GetStdDev() const; - - bool operator<(const Result& rhs) const; - - bool HadAnyFLTMAXValues() const { - for (int i = 0; i < value.size(); ++i) { - if (value[i] >= FLT_MAX) return true; - } - return false; - } - }; - - protected: - vector results; - - public: - void AddResult(const string& test, const string& atts, const string& unit, double value); - void AddResults(const string& test, const string& atts, const string& unit, - const vector& values); - vector GetResultsForTest(const string& test); - const vector& GetResults() const; - void ClearAllResults(); - void DumpDetailed(ostream&); - void DumpSummary(ostream&); - void DumpCsv(string fileName); - - private: - bool IsFileEmpty(string fileName); -}; - - -#endif diff --git a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp index 9d384c7d6a..625d8cd742 100644 --- a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp +++ b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp @@ -1,16 +1,13 @@ /* Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved. - Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -21,142 +18,134 @@ THE SOFTWARE. */ #include "hip/hip_runtime.h" +#ifdef __HIP_PLATFORM_HCC__ +#include "hip/hip_ext.h" +#endif #include -#include -#include "ResultDatabase.h" - -#define PRINT_PROGRESS 0 - -#define check(cmd) \ - { \ - hipError_t status = cmd; \ - if (status != hipSuccess) { \ - printf("error: '%s'(%d) from %s at %s:%d\n", hipGetErrorString(status), status, #cmd, \ - __FILE__, __LINE__); \ - abort(); \ - } \ - } - -#define LEN 1024 * 1024 +#include +#include #define NUM_GROUPS 1 -#define GROUP_SIZE 64 -#define TEST_ITERS 20 -#define DISPATCHES_PER_TEST 100 +#define GROUP_SIZE 1 +#define WARMUP_RUN_COUNT 10 +#define TIMING_RUN_COUNT 100 +#define TOTAL_RUN_COUNT WARMUP_RUN_COUNT + TIMING_RUN_COUNT +#define BATCH_SIZE 1000 -const unsigned p_tests = 0xfffffff; +#define FILE_NAME "test_kernel.code" +#define KERNEL_NAME "test" +__global__ void EmptyKernel() { } -// HCC optimizes away fully NULL kernel calls, so run one that is nearly null: -__global__ void NearlyNull(float* Ad) { - if (Ad) { - Ad[0] = 42; - } +void print_timing(std::string test, const std::array &results, int batch = 1) { + + float total_us = 0.0f, mean_us = 0.0f, stddev_us = 0.0f; + + // skip warm-up runs + auto start_iter = std::next(results.begin(), WARMUP_RUN_COUNT); + auto end_iter = results.end(); + + // mean + std::for_each(start_iter, end_iter, [&](const float &run_ms) { + total_us += (run_ms * 1000) / batch; + }); + mean_us = total_us / TIMING_RUN_COUNT; + + // stddev + total_us = 0; + std::for_each(start_iter, end_iter, [&](const float &run_ms) { + float dev_us = ((run_ms * 1000) / batch) - mean_us; + total_us += dev_us * dev_us; + }); + stddev_us = sqrt(total_us / TIMING_RUN_COUNT); + + // display + printf("\n %s: %.1f us, std: %.1f us\n", test.c_str(), mean_us, stddev_us); } - -ResultDatabase resultDB; - - -void stopTest(hipEvent_t start, hipEvent_t stop, const char* msg, int iters) { - float mS = 0; - check(hipEventRecord(stop)); - check(hipDeviceSynchronize()); - check(hipEventElapsedTime(&mS, start, stop)); - resultDB.AddResult(std::string(msg), "", "uS", mS * 1000 / iters); - if (PRINT_PROGRESS & 0x1) { - std::cout << msg << "\t\t" << mS * 1000 / iters << " uS" << std::endl; - } - if (PRINT_PROGRESS & 0x2) { - resultDB.DumpSummary(std::cout); - } -} - - -int main() { - hipError_t err; - float* Ad; - check(hipMalloc(&Ad, 4)); - - - hipStream_t stream; - check(hipStreamCreate(&stream)); - - - hipEvent_t start, sync, stop; - check(hipEventCreate(&start)); - check(hipEventCreateWithFlags(&sync, hipEventBlockingSync)); - check(hipEventCreate(&stop)); - - +int main() { hipStream_t stream0 = 0; + hipDevice_t device; + hipDeviceGet(&device, 0); + hipCtx_t context; + hipCtxCreate(&context, 0, device); + hipModule_t module; + hipFunction_t function; + hipModuleLoad(&module, FILE_NAME); + hipModuleGetFunction(&function, module, KERNEL_NAME); + void* params = nullptr; + + std::array results; + hipEvent_t start, stop; + hipEventCreate(&start); + hipEventCreate(&stop); + /************************************************************************************/ + /* HIP kernel launch enqueue rate: */ + /* Measure time taken to enqueue a kernel on the GPU */ + /************************************************************************************/ - if (p_tests & 0x1) { - hipEventRecord(start); - hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad); - stopTest(start, stop, "FirstKernelLaunch", 1); + // Timing hipModuleLaunchKernel + for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + hipModuleLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, ¶ms, nullptr); + auto stop = std::chrono::high_resolution_clock::now(); + results[i] = std::chrono::duration(stop - start).count(); } + print_timing("hipModuleLaunchKernel enqueue rate", results); - - if (p_tests & 0x2) { - hipEventRecord(start); - hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad); - stopTest(start, stop, "SecondKernelLaunch", 1); + // Timing hipLaunchKernelGGL + for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0); + auto stop = std::chrono::high_resolution_clock::now(); + results[i] = std::chrono::duration(stop - start).count(); } + print_timing("hipLaunchKernelGGL enqueue rate", results); + /***********************************************************************************/ + /* Single dispatch execution latency using HIP events: */ + /* Measures latency to start & finish executing a kernel with GPU-scope visibility */ + /***********************************************************************************/ - if (p_tests & 0x4) { - for (int t = 0; t < TEST_ITERS; t++) { - hipEventRecord(start); - for (int i = 0; i < DISPATCHES_PER_TEST; i++) { - hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad); - hipEventRecord(sync); - hipEventSynchronize(sync); - } - stopTest(start, stop, "NullStreamASyncDispatchWait", DISPATCHES_PER_TEST); - } - } - - - if (p_tests & 0x10) { - for (int t = 0; t < TEST_ITERS; t++) { - hipEventRecord(start); - for (int i = 0; i < DISPATCHES_PER_TEST; i++) { - hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad); - hipEventRecord(sync); - hipEventSynchronize(sync); - } - stopTest(start, stop, "StreamASyncDispatchWait", DISPATCHES_PER_TEST); - } - } - -#if 1 - - if (p_tests & 0x40) { - for (int t = 0; t < TEST_ITERS; t++) { - hipEventRecord(start); - for (int i = 0; i < DISPATCHES_PER_TEST; i++) { - hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad); - } - stopTest(start, stop, "NullStreamASyncDispatchNoWait", DISPATCHES_PER_TEST); - } - } - - if (p_tests & 0x80) { - for (int t = 0; t < TEST_ITERS; t++) { - hipEventRecord(start); - for (int i = 0; i < DISPATCHES_PER_TEST; i++) { - hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad); - } - stopTest(start, stop, "StreamASyncDispatchNoWait", DISPATCHES_PER_TEST); - } + //Timing directly the dispatch +#ifdef __HIP_PLATFORM_HCC__ + for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { + hipExtLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, start, stop, 0); + hipEventSynchronize(stop); + hipEventElapsedTime(&results[i], start, stop); } + print_timing("Timing directly single dispatch latency", results); #endif - resultDB.DumpSummary(std::cout); + //Timing around the dispatch + for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { + hipEventRecord(start, 0); + hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0); + hipEventRecord(stop, 0); + hipEventSynchronize(stop); + hipEventElapsedTime(&results[i], start, stop); + } + print_timing("Timing around single dispatch latency", results); - check(hipEventDestroy(start)); - check(hipEventDestroy(sync)); - check(hipEventDestroy(stop)); + /*********************************************************************************/ + /* Batch dispatch execution latency using HIP events: */ + /* Measures latency to start & finish executing each dispatch in a batch */ + /*********************************************************************************/ + + for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { + hipEventRecord(start, 0); + for (int j = 0; j < BATCH_SIZE; j++) { + hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0); + } + hipEventRecord(stop, 0); + hipEventSynchronize(stop); + hipEventElapsedTime(&results[i], start, stop); + } + print_timing("Batch dispatch latency", results, BATCH_SIZE); + + hipEventDestroy(start); + hipEventDestroy(stop); + hipCtxDestroy(context); } + diff --git a/samples/1_Utils/hipDispatchLatency/test_kernel.cpp b/samples/1_Utils/hipDispatchLatency/test_kernel.cpp new file mode 100644 index 0000000000..23ef426730 --- /dev/null +++ b/samples/1_Utils/hipDispatchLatency/test_kernel.cpp @@ -0,0 +1,24 @@ +/* +Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "hip/hip_runtime.h" + +extern "C" __global__ void test() { +} + diff --git a/samples/2_Cookbook/13_occupancy/occupancy.cpp b/samples/2_Cookbook/13_occupancy/occupancy.cpp index a9f4e198b0..605c7724b2 100644 --- a/samples/2_Cookbook/13_occupancy/occupancy.cpp +++ b/samples/2_Cookbook/13_occupancy/occupancy.cpp @@ -86,8 +86,8 @@ void launchKernel(float* C, float* A, float* B, bool manual){ printf("kernel Execution time = %6.3fms\n", eventMs); //Calculate Occupancy - uint32_t numBlock = 0; - HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply, blockSize, 0)); + int numBlock = 0; + HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply,(int)blockSize, 0)); if(devProp.maxThreadsPerMultiProcessor){ std::cout << "Theoretical Occupancy is " << (double)numBlock* blockSize/devProp.maxThreadsPerMultiProcessor * 100 << "%" << std::endl; diff --git a/src/hip_device.cpp b/src/hip_device.cpp index 403194483a..aa89e62271 100644 --- a/src/hip_device.cpp +++ b/src/hip_device.cpp @@ -312,9 +312,12 @@ hipError_t ihipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device case hipDeviceAttributeMaxPitch: *pi = prop->memPitch; break; - case hipDeviceAttributeTextureAlignment: + case hipDeviceAttributeTextureAlignment: *pi = prop->textureAlignment; break; + case hipDeviceAttributeTexturePitchAlignment: + *pi = prop->texturePitchAlignment; + break; case hipDeviceAttributeKernelExecTimeout: *pi = prop->kernelExecTimeoutEnabled; break; diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 63bc8fe14f..175d301ee1 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -925,6 +925,7 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) { prop->memPitch = INT_MAX; //Maximum pitch in bytes allowed by memory copies (hardcoded 128 bytes in hipMallocPitch) prop->textureAlignment = 0; //Alignment requirement for textures + prop->texturePitchAlignment = IMAGE_PITCH_ALIGNMENT; //Alignment requirment for texture pitch prop->kernelExecTimeoutEnabled = 0; //no run time limit for running kernels on device hsa_isa_t isa; diff --git a/src/hip_module.cpp b/src/hip_module.cpp index b11197703f..a8255ea725 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -1368,7 +1368,7 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block } hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( - TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) + TlsData *tls, int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk) { using namespace hip_impl; @@ -1408,35 +1408,41 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS)); // Calculate blocks occupancy per CU based on SGPR usage - *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts)); + *numBlocks = std::min(*numBlocks, (int) (sgprs_alu_occupancy / numWavefronts)); size_t total_used_lds = usedLDS + dynSharedMemPerBlk; if (total_used_lds != 0) { // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds) size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds; - *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy); + *numBlocks = std::min(*numBlocks, (int) lds_occupancy); } return hipSuccess; } hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) + int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk) { HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk); + + auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f), + hip_impl::target_agent(0)); return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( - tls, numBlocks, f, blockSize, dynSharedMemPerBlk)); + tls, numBlocks, F, blockSize, dynSharedMemPerBlk)); } hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, + int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags) { HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags); + auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f), + hip_impl::target_agent(0)); + return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( - tls, numBlocks, f, blockSize, dynSharedMemPerBlk)); + tls, numBlocks, F, blockSize, dynSharedMemPerBlk)); } hipError_t hipLaunchKernel( diff --git a/src/program_state.cpp b/src/program_state.cpp index dbd7d3ebc4..5e9f9976be 100644 --- a/src/program_state.cpp +++ b/src/program_state.cpp @@ -61,7 +61,7 @@ namespace hip_impl { if (it == impl->get_globals().end()) return nullptr; else - return it->second; + return it->second.first; } hsa_executable_t program_state::load_executable(const char* data, diff --git a/src/program_state.inl b/src/program_state.inl index 993418de96..fcc1eb762a 100644 --- a/src/program_state.inl +++ b/src/program_state.inl @@ -18,6 +18,7 @@ #include #include #include +#include "hc.hpp" #include @@ -193,7 +194,8 @@ public: std::tuple< std::once_flag, std::mutex, - std::unordered_map> globals; + // map from string to pair + std::unordered_map>> globals; using RAII_code_reader = std::unique_ptr& get_globals() { + std::unordered_map>& get_globals() { std::call_once(std::get<0>(globals), [this]() { std::get<2>(globals).reserve(get_symbol_addresses().size()); }); @@ -349,30 +351,52 @@ public: auto& g_mutex = get_globals_mutex(); for (auto&& x : undefined_symbols) { - if (g.find(x) != g.cend()) return; - const auto it1 = get_symbol_addresses().find(x); - if (it1 == get_symbol_addresses().cend()) { hip_throw(std::runtime_error{ "Global symbol: " + x + " is undefined."}); } - std::lock_guard lck{g_mutex}; + hsa_status_t status; + auto check_hsa_global_var_define_error = [&x](hsa_status_t s) { + if (s != HSA_STATUS_SUCCESS) { + const char* es; + hsa_status_string(s, &es); + hip_throw(std::runtime_error{ "Error when defining symbol " + x + " : " + es}); + } + }; - if (g.find(x) != g.cend()) return; + auto retrieve_pinned_address_from_cache = [](decltype(g) g, decltype(x) x) { + const auto& global_addr = g.find(x); + if (global_addr != g.cend()) { + return global_addr->second.second; + } + return (void*)nullptr; + }; - g.emplace(x, (void*)(it1->second.first)); - void* p = nullptr; - hsa_amd_memory_lock( - reinterpret_cast(it1->second.first), - it1->second.second, - nullptr, // All agents. - 0, - &p); - - hsa_executable_agent_global_variable_define( - executable, agent, x.c_str(), p); + void* p = retrieve_pinned_address_from_cache(g, x); + if (p == nullptr) { + std::lock_guard lck{g_mutex}; + p = retrieve_pinned_address_from_cache(g, x); + if (p == nullptr) { + if (x == "_ZN2hc13printf_bufferE") { + // This is the printf buffer, get the pinned address from HCC + p = Kalmar::getContext()->getPrintfBufferPointerVA(); + } + else { + status = hsa_amd_memory_lock(reinterpret_cast(it1->second.first), + it1->second.second, + nullptr, // All agents. + 0, &p); + check_hsa_global_var_define_error(status); + } + // cache the global address and its pinned address + g.emplace(x, std::make_pair(reinterpret_cast(it1->second.first), p)); + } + } + status = hsa_executable_agent_global_variable_define( + executable, agent, x.c_str(), p); + check_hsa_global_var_define_error(status); } } @@ -398,13 +422,19 @@ public: move(file), move(tmp)); } - hsa_code_object_reader_create_from_memory( - it->first.data(), it->first.size(), it->second.get()); + auto check_hsa_error = [](hsa_status_t s) { + if (s != HSA_STATUS_SUCCESS) { + hip_throw(std::runtime_error{"error when loading code object"}); + } + }; - hsa_executable_load_agent_code_object( - executable, agent, *it->second, nullptr, nullptr); + check_hsa_error(hsa_code_object_reader_create_from_memory( + it->first.data(), it->first.size(), it->second.get())); - hsa_executable_freeze(executable, nullptr); + check_hsa_error(hsa_executable_load_agent_code_object( + executable, agent, *it->second, nullptr, nullptr)); + + check_hsa_error(hsa_executable_freeze(executable, nullptr)); } diff --git a/tests/src/kernel/hipPrintfKernel.cpp b/tests/src/kernel/hipPrintfKernel.cpp index 1d4fa5fe30..5675f2e6bd 100644 --- a/tests/src/kernel/hipPrintfKernel.cpp +++ b/tests/src/kernel/hipPrintfKernel.cpp @@ -30,7 +30,12 @@ THE SOFTWARE. __global__ void run_printf() { printf("Hello World\n"); } int main() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(run_printf), dim3(1), dim3(1), 0, 0); - hipDeviceSynchronize(); + int device_count = 0; + hipGetDeviceCount(&device_count); + for (int i = 0; i < device_count; ++i) { + hipSetDevice(i); + hipLaunchKernelGGL(HIP_KERNEL_NAME(run_printf), dim3(1), dim3(1), 0, 0); + hipDeviceSynchronize(); + } passed(); } diff --git a/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp b/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp new file mode 100644 index 0000000000..ea0f41d631 --- /dev/null +++ b/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp @@ -0,0 +1,212 @@ +/* +Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +// Simple test for hipLaunchCooperativeKernelMultiDevice API. + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM all + * TEST: %t + * HIT_END + */ + +#include "hip/hip_runtime.h" +#include "hip/hip_runtime_api.h" +#include +#include +#include +#include +#include +#include "hip/hip_cooperative_groups.h" +#include "test_common.h" + +using namespace std::chrono; + +const static uint NumOfLoopIterrations = 16 * 1024; +const static uint BufferSizeInDwords = 28672 * NumOfLoopIterrations; +const static uint numQueues = 4; +const static uint numIter = 100; +constexpr uint NumKernelArgs = 4; +constexpr uint MaxGPUs = 8; + +#include +/* +namespace cg = cooperative_groups; +using namespace cooperative_groups; +*/ + +__global__ void test_gws(uint* buf, uint bufSize, long* tmpBuf, long* result) +{ + extern __shared__ long tmp[]; + uint groups = gridDim.x; + uint group_id = blockIdx.x; + uint local_id = threadIdx.x; + uint chunk = gridDim.x * blockDim.x; + + uint i = group_id * blockDim.x + local_id; + long sum = 0; + while (i < bufSize) { + sum += buf[i]; + i += chunk; + } + tmp[local_id] = sum; + __syncthreads(); + i = 0; + if (local_id == 0) { + sum = 0; + while (i < blockDim.x) { + sum += tmp[i]; + i++; + } + tmpBuf[group_id] = sum; + } + + // wait + cooperative_groups::this_grid().sync(); + + if (((blockIdx.x * blockDim.x) + threadIdx.x) == 0) { + for (uint i = 1; i < groups; ++i) { + sum += tmpBuf[i]; + } + //*result = sum; + result[1 + cooperative_groups::this_multi_grid().grid_rank()] = sum; + } + cooperative_groups::this_multi_grid().sync(); + if (cooperative_groups::this_multi_grid().grid_rank() == 0) { + sum = 0; + for (uint i = 1; i <= cooperative_groups::this_multi_grid().num_grids(); ++i) { + sum += result[i]; + } + *result = sum; + } +} + +int main() { + float *A, *B; + uint* dA[MaxGPUs]; + long* dB[MaxGPUs]; + long* dC; + hipModule_t Module; + hipStream_t stream[MaxGPUs]; + + uint32_t* init = new uint32_t[BufferSizeInDwords]; + for (uint32_t i = 0; i < BufferSizeInDwords; ++i) { + init[i] = i; + } + + int nGpu = 0; + HIPCHECK(hipGetDeviceCount(&nGpu)); + size_t copySizeInDwords = BufferSizeInDwords / nGpu; + hipDeviceProp_t deviceProp[MaxGPUs]; + + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipSetDevice(i)); + + // Calculate the device occupancy to know how many blocks can be run concurrently + hipGetDeviceProperties(&deviceProp[i], 0); + if (!deviceProp[i].cooperativeMultiDeviceLaunch) { + printf("Device doesn't support cooperative launch!"); + passed(); + return 0; + } + size_t SIZE = copySizeInDwords * sizeof(uint); + + HIPCHECK(hipMalloc((void**)&dA[i], SIZE)); + if (i == 0) { + HIPCHECK(hipHostMalloc((void**)&dC, (nGpu + 1) * sizeof(long), hipHostMallocCoherent)); + } + HIPCHECK(hipMemcpy(dA[i], &init[i * copySizeInDwords] , SIZE, hipMemcpyHostToDevice)); + HIPCHECK(hipStreamCreate(&stream[i])); + } + + dim3 dimBlock; + dim3 dimGrid; + dimGrid.x = 1; + dimGrid.y = 1; + dimGrid.z = 1; + dimBlock.x = 64; + dimBlock.y = 1; + dimBlock.z = 1; + + int numBlocks = 0; + uint workgroups[3] = {64, 128, 256}; + + hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu]; + + system_clock::time_point start = system_clock::now(); + + for (uint set = 0; set < 3; ++set) { + void* args[MaxGPUs * NumKernelArgs]; + std::cout << "---------- Test#" << set << "---------------\n"; + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipSetDevice(i)); + dimBlock.x = workgroups[set]; + HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, + (hipFunction_t)test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long))); + + std::cout << "GPU(" << i << ") Block size: " << dimBlock.x << " Num blocks per CU: " << numBlocks << "\n"; + + dimGrid.x = deviceProp[i].multiProcessorCount * std::min(numBlocks, 32); + HIPCHECK(hipMalloc((void**)&dB[i], dimGrid.x * sizeof(long))); + + args[i * NumKernelArgs] = (void*)&dA[i]; + args[i * NumKernelArgs + 1] = (void*)©SizeInDwords; + args[i * NumKernelArgs + 2] = (void*)&dB[i]; + args[i * NumKernelArgs + 3] = (void*)&dC; + + launchParamsList[i].func = reinterpret_cast(test_gws); + launchParamsList[i].gridDim = dimGrid; + launchParamsList[i].blockDim = dimBlock; + launchParamsList[i].sharedMem = dimBlock.x * sizeof(long); + launchParamsList[i].stream = stream[i]; + launchParamsList[i].args = &args[i * NumKernelArgs]; + } + + hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0); + + HIPCHECK(hipMemcpy(init, dC, sizeof(long), hipMemcpyDeviceToHost)); + + if (*dC != (((long)(BufferSizeInDwords) * (BufferSizeInDwords - 1)) / 2)) { + std::cout << "Data validation failed for grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n"; + std::cout << "Test failed! \n"; + } + for (int i = 0; i < nGpu; i++) { + hipFree(dB[i]); + } + } + system_clock::time_point end = system_clock::now(); + + delete [] launchParamsList; + + std::chrono::duration elapsed_seconds = end - start; + + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + + std::cout << "finished computation at " << std::ctime(&end_time) << + "elapsed time: " << elapsed_seconds.count() << "s\n"; + + hipSetDevice(0); + hipFree(dC); + for (int i = 0; i < nGpu; i++) { + hipFree(dA[i]); + HIPCHECK(hipStreamDestroy(stream[i])); + } + delete [] init; + passed(); + return 0; +} diff --git a/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp b/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp index c76685fa89..896738892d 100644 --- a/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp +++ b/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp @@ -116,7 +116,7 @@ int main() { dimBlock.x = workgroups[i]; // Calculate the device occupancy to know how many blocks can be run concurrently - hipOccupancyMaxActiveBlocksPerMultiprocessor(reinterpret_cast(&numBlocks), + hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long)); dimGrid.x = deviceProp.multiProcessorCount * std::min(numBlocks, 32); diff --git a/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp b/tests/src/runtimeApi/occupancy/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp similarity index 80% rename from tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp rename to tests/src/runtimeApi/occupancy/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp index ebf656b72f..8e0dd033bc 100644 --- a/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp +++ b/tests/src/runtimeApi/occupancy/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp @@ -30,10 +30,6 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" -#define fileName "vcpy_kernel.code" -#define kernel_name "hello_world" - - __global__ void f1(float *a) { *a = 1.0; } template @@ -49,11 +45,10 @@ int main(int argc, char* argv[]) { hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0); assert(gridSize != 0 && blockSize != 0); - uint32_t numBlock = 0; - hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, blockSize, 0); + int numBlock = 0; + hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, (int)blockSize, 0); assert(numBlock != 0); - // test case for using kernel function pointer with template gridSize = 0; blockSize = 0; @@ -61,17 +56,7 @@ int main(int argc, char* argv[]) { assert(gridSize != 0 && blockSize != 0); numBlock = 0; - hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f2, blockSize, 0); - assert(numBlock != 0); - - - // test case for using kernel with hipFunction_t type - numBlock = 0; - hipModule_t Module; - hipFunction_t Function; - HIPCHECK(hipModuleLoad(&Module, fileName)); - HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); - HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, Function, blockSize, 0)); + hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f2, (int)blockSize, 0); assert(numBlock != 0); passed(); diff --git a/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp b/tests/src/runtimeApi/occupancy/hipOccupancyMaxPotentialBlockSize.cpp similarity index 99% rename from tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp rename to tests/src/runtimeApi/occupancy/hipOccupancyMaxPotentialBlockSize.cpp index a81862952d..d29100d9a9 100644 --- a/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp +++ b/tests/src/runtimeApi/occupancy/hipOccupancyMaxPotentialBlockSize.cpp @@ -33,7 +33,6 @@ THE SOFTWARE. #define fileName "vcpy_kernel.code" #define kernel_name "hello_world" - __global__ void f1(float *a) { *a = 1.0; } template diff --git a/tests/src/texture/hipTexObjPitch.cpp b/tests/src/texture/hipTexObjPitch.cpp index 47648d5e73..b11e7408ae 100644 --- a/tests/src/texture/hipTexObjPitch.cpp +++ b/tests/src/texture/hipTexObjPitch.cpp @@ -76,8 +76,7 @@ void texture2Dtest() texDescr.readMode = hipReadModeElementType; hipTextureObject_t texObj; - hipResourceViewDesc resDesc; - HIPCHECK( hipCreateTextureObject(&texObj, &texRes, &texDescr, &resDesc)); + HIPCHECK( hipCreateTextureObject(&texObj, &texRes, &texDescr, NULL)); HIPCHECK(hipMalloc((void**)&devPtrB, SIZE_W*sizeof(TYPE_t)*SIZE_H)) ;