Merge branch 'master' into amd-master-next

Conflicts:
	CMakeLists.txt
	tests/src/texture/simpleTexture2DLayered.cpp
	tests/src/texture/simpleTexture3D.cpp

Change-Id: I4aa4754d391b5f37ddf15fa0bcfc84d9da020119
Tá an tiomantas seo le fáil i:
Lad, Aditya
2020-03-06 14:10:04 -05:00
tuismitheoir 1216153c66 cf61ad2830
tiomantas e7fdb3d796
D'athraigh 42 comhad le 1410 breiseanna agus 640 scriosta
+1
Féach ar an gComhad
@@ -377,6 +377,7 @@ if(HIP_PLATFORM STREQUAL "hcc")
target_link_libraries(hip_hcc_static PRIVATE hc_am)
add_library(hiprtc SHARED src/hiprtc.cpp src/code_object_bundle.cpp)
target_compile_options(hiprtc PRIVATE -DDISABLE_REDUCED_GPU_BLOB_COPY)
set_property ( TARGET hiprtc PROPERTY VERSION "${HIP_LIB_VERSION_STRING}" )
set_property ( TARGET hiprtc PROPERTY SOVERSION "${HIP_LIB_VERSION_MAJOR}" )
+5 -5
Féach ar an gComhad
@@ -177,7 +177,7 @@ def docker_build_inside_image( def build_image, String inside_args, String platf
cd ${build_dir_rel}
make install -j\$(nproc)
make build_tests -i -j\$(nproc)
ctest -E "(hipMultiThreadDevice-pyramid|hipMemoryAllocateCoherentDriver)"
ctest --output-on-failure -E "(hipMultiThreadDevice-pyramid|hipMemoryAllocateCoherentDriver)"
"""
// If unit tests output a junit or xunit file in the future, jenkins can parse that file
// to display test results on the dashboard
@@ -295,13 +295,13 @@ def docker_upload_dockerhub( String local_org, String image_name, String remote_
String build_config = 'Release'
String job_name = env.JOB_NAME.toLowerCase( )
// The following launches 3 builds in parallel: rocm-head, rocm-3.0.x and cuda-10.x
parallel rocm_3_0:
// The following launches 3 builds in parallel: rocm-head, rocm-3.1.x and cuda-10.x
parallel rocm_3_1:
{
node('hip-rocm')
{
String hcc_ver = 'rocm-3.0.x'
String from_image = 'ci_test_nodes/rocm-3.0.x/ubuntu-16.04:latest'
String hcc_ver = 'rocm-3.1.x'
String from_image = 'ci_test_nodes/rocm-3.1.x/ubuntu-16.04:latest'
String inside_args = '--device=/dev/kfd --device=/dev/dri --group-add=video'
// Checkout source code, dependencies and version files
+6
Féach ar an gComhad
@@ -726,6 +726,12 @@ if ($HIP_PLATFORM eq "clang") {
$HIPCXXFLAGS .= " -O3";
$HIPLDFLAGS .= " -O3";
}
if ($optArg ne "-O0") {
$HIPCXXFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false";
if ($needLDFLAGS and not $needCXXFLAGS) {
$HIPLDFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false";
}
}
$HIP_DEVLIB_FLAGS = " --hip-device-lib-path=$DEVICE_LIB_PATH";
$HIPCXXFLAGS .= " $HIP_DEVLIB_FLAGS";
if (not $isWindows) {
+1 -1
Féach ar an gComhad
@@ -1,7 +1,7 @@
#!/usr/bin/perl -w
$HIP_BASE_VERSION_MAJOR = "3";
$HIP_BASE_VERSION_MINOR = "1";
$HIP_BASE_VERSION_MINOR = "2";
# Need perl > 5.10 to use logic-defined or
use 5.006; use v5.10.1;
+8
Féach ar an gComhad
@@ -1172,6 +1172,7 @@ sub simpleSubstitutions {
$ft{'numeric_literal'} += s/\bCUDA_ERROR_ASSERT\b/hipErrorAssert/g;
$ft{'numeric_literal'} += s/\bCUDA_ERROR_CONTEXT_ALREADY_CURRENT\b/hipErrorContextAlreadyCurrent/g;
$ft{'numeric_literal'} += s/\bCUDA_ERROR_CONTEXT_ALREADY_IN_USE\b/hipErrorContextAlreadyInUse/g;
$ft{'numeric_literal'} += s/\bCUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE\b/hipErrorCooperativeLaunchTooLarge/g;
$ft{'numeric_literal'} += s/\bCUDA_ERROR_DEINITIALIZED\b/hipErrorDeinitialized/g;
$ft{'numeric_literal'} += s/\bCUDA_ERROR_ECC_UNCORRECTABLE\b/hipErrorECCNotCorrectable/g;
$ft{'numeric_literal'} += s/\bCUDA_ERROR_FILE_NOT_FOUND\b/hipErrorFileNotFound/g;
@@ -1528,6 +1529,7 @@ sub simpleSubstitutions {
$ft{'numeric_literal'} += s/\bCU_MEMORYTYPE_UNIFIED\b/hipMemoryTypeUnified/g;
$ft{'numeric_literal'} += s/\bCU_MEM_ATTACH_GLOBAL\b/hipMemAttachGlobal/g;
$ft{'numeric_literal'} += s/\bCU_MEM_ATTACH_HOST\b/hipMemAttachHost/g;
$ft{'numeric_literal'} += s/\bCU_OCCUPANCY_DEFAULT\b/hipOccupancyDefault/g;
$ft{'numeric_literal'} += s/\bCU_RESOURCE_TYPE_ARRAY\b/hipResourceTypeArray/g;
$ft{'numeric_literal'} += s/\bCU_RESOURCE_TYPE_LINEAR\b/hipResourceTypeLinear/g;
$ft{'numeric_literal'} += s/\bCU_RESOURCE_TYPE_MIPMAPPED_ARRAY\b/hipResourceTypeMipmappedArray/g;
@@ -1636,6 +1638,7 @@ sub simpleSubstitutions {
$ft{'numeric_literal'} += s/\bcudaErrorAlreadyMapped\b/hipErrorAlreadyMapped/g;
$ft{'numeric_literal'} += s/\bcudaErrorArrayIsMapped\b/hipErrorArrayIsMapped/g;
$ft{'numeric_literal'} += s/\bcudaErrorAssert\b/hipErrorAssert/g;
$ft{'numeric_literal'} += s/\bcudaErrorCooperativeLaunchTooLarge\b/hipErrorCooperativeLaunchTooLarge/g;
$ft{'numeric_literal'} += s/\bcudaErrorCudartUnloading\b/hipErrorDeinitialized/g;
$ft{'numeric_literal'} += s/\bcudaErrorDeviceAlreadyInUse\b/hipErrorContextAlreadyInUse/g;
$ft{'numeric_literal'} += s/\bcudaErrorDeviceUninitialized\b/hipErrorInvalidContext/g;
@@ -1749,6 +1752,8 @@ sub simpleSubstitutions {
$ft{'define'} += s/\bCUDA_ARRAY3D_LAYERED\b/hipArrayLayered/g;
$ft{'define'} += s/\bCUDA_ARRAY3D_SURFACE_LDST\b/hipArraySurfaceLoadStore/g;
$ft{'define'} += s/\bCUDA_ARRAY3D_TEXTURE_GATHER\b/hipArrayTextureGather/g;
$ft{'define'} += s/\bCUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC\b/hipCooperativeLaunchMultiDeviceNoPostSync/g;
$ft{'define'} += s/\bCUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC\b/hipCooperativeLaunchMultiDeviceNoPreSync/g;
$ft{'define'} += s/\bCU_LAUNCH_PARAM_BUFFER_POINTER\b/HIP_LAUNCH_PARAM_BUFFER_POINTER/g;
$ft{'define'} += s/\bCU_LAUNCH_PARAM_BUFFER_SIZE\b/HIP_LAUNCH_PARAM_BUFFER_SIZE/g;
$ft{'define'} += s/\bCU_LAUNCH_PARAM_END\b/HIP_LAUNCH_PARAM_END/g;
@@ -1769,6 +1774,8 @@ sub simpleSubstitutions {
$ft{'define'} += s/\bcudaArrayLayered\b/hipArrayLayered/g;
$ft{'define'} += s/\bcudaArraySurfaceLoadStore\b/hipArraySurfaceLoadStore/g;
$ft{'define'} += s/\bcudaArrayTextureGather\b/hipArrayTextureGather/g;
$ft{'define'} += s/\bcudaCooperativeLaunchMultiDeviceNoPostSync\b/hipCooperativeLaunchMultiDeviceNoPostSync/g;
$ft{'define'} += s/\bcudaCooperativeLaunchMultiDeviceNoPreSync\b/hipCooperativeLaunchMultiDeviceNoPreSync/g;
$ft{'define'} += s/\bcudaDeviceBlockingSync\b/hipDeviceScheduleBlockingSync/g;
$ft{'define'} += s/\bcudaDeviceLmemResizeToMax\b/hipDeviceLmemResizeToMax/g;
$ft{'define'} += s/\bcudaDeviceMapHost\b/hipDeviceMapHost/g;
@@ -1792,6 +1799,7 @@ sub simpleSubstitutions {
$ft{'define'} += s/\bcudaIpcMemLazyEnablePeerAccess\b/hipIpcMemLazyEnablePeerAccess/g;
$ft{'define'} += s/\bcudaMemAttachGlobal\b/hipMemAttachGlobal/g;
$ft{'define'} += s/\bcudaMemAttachHost\b/hipMemAttachHost/g;
$ft{'define'} += s/\bcudaOccupancyDefault\b/hipOccupancyDefault/g;
$ft{'define'} += s/\bcudaStreamDefault\b/hipStreamDefault/g;
$ft{'define'} += s/\bcudaStreamNonBlocking\b/hipStreamNonBlocking/g;
$ft{'define'} += s/\bcudaTextureType1D\b/hipTextureType1D/g;
@@ -447,6 +447,7 @@
| 717 |*`CUDA_ERROR_INVALID_ADDRESS_SPACE`* | |
| 718 |*`CUDA_ERROR_INVALID_PC`* | |
| 719 |*`CUDA_ERROR_LAUNCH_FAILED`* |*`hipErrorLaunchFailure`* |
| 720 |*`CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE`* |*`hipErrorCooperativeLaunchTooLarge`* |
| 800 |*`CUDA_ERROR_NOT_PERMITTED`* | |
| 801 |*`CUDA_ERROR_NOT_SUPPORTED`* |*`hipErrorNotSupported`* |
| 802 |*`CUDA_ERROR_SYSTEM_NOT_READY`* | | 10.0 |
@@ -1125,10 +1126,10 @@
| **CUDA** | **HIP** |**CUDA version\***|
|-----------------------------------------------------------|---------------------------------------------------------|------------------|
| `cuOccupancyMaxActiveBlocksPerMultiprocessor` |`hipOccupancyMaxActiveBlocksPerMultiprocessor` |
| `cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` |`hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` |
| `cuOccupancyMaxPotentialBlockSize` |`hipOccupancyMaxPotentialBlockSize` |
| `cuOccupancyMaxPotentialBlockSizeWithFlags` | |
| `cuOccupancyMaxActiveBlocksPerMultiprocessor` |`hipDrvOccupancyMaxActiveBlocksPerMultiprocessor` |
| `cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` |`hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` |
| `cuOccupancyMaxPotentialBlockSize` |`hipOccupancyMaxPotentialBlockSize` |
| `cuOccupancyMaxPotentialBlockSizeWithFlags` | |
## **22. Texture Reference Management [DEPRECATED]**
@@ -800,7 +800,7 @@
| 717 |*`cudaErrorInvalidAddressSpace`* | | |
| 718 |*`cudaErrorInvalidPc`* | | |
| 719 |*`cudaErrorLaunchFailure`* | |*`hipErrorLaunchFailure`* |
| 720 |*`cudaErrorCooperativeLaunchTooLarge`* | 9.0 | |
| 720 |*`cudaErrorCooperativeLaunchTooLarge`* | 9.0 |*`hipErrorCooperativeLaunchTooLarge`* |
| 800 |*`cudaErrorNotPermitted`* | | |
| 801 |*`cudaErrorNotSupported`* | |*`hipErrorNotSupported`* |
| 802 |*`cudaErrorSystemNotReady`* | 10.0 | |
@@ -1045,7 +1045,7 @@
| define |`cudaMemAttachGlobal` | |`hipMemAttachGlobal` |
| define |`cudaMemAttachHost` | |`hipMemAttachHost` |
| define |`cudaMemAttachSingle` | | |
| define |`cudaOccupancyDefault` | | |
| define |`cudaOccupancyDefault` | |`hipOccupancyDefault` |
| define |`cudaOccupancyDisableCachingOverride` | | |
| define |`cudaPeerAccessDefault` | | |
| define |`cudaStreamDefault` | |`hipStreamDefault` |
+72
Féach ar an gComhad
@@ -0,0 +1,72 @@
# Profiling HIP Code
This section describes the tracing and debugging capabilities that HIP provides.
<!-- toc -->
- [Tracing and Debug](#tracing-and-debug)
* [Tracing HIP APIs](#tracing-hip-apis)
+ [Color](#color)
<!-- tocstop -->
## Tracing and Debug
### Tracing HIP APIs
The HIP runtime can print the HIP function strings to stderr using HIP_TRACE_API environment variable.
The trace prints two messages for each API - one at the beginning of the API call (line starts with "<<") and one at the end of the API call (line ends with ">>").
Here's an example for one API followed by a description for the sections of the trace:
```
<<hip-api tid:1.6 hipMemcpy (0x7f32154db010, 0x50446e000, 4000000, hipMemcpyDeviceToHost)
hip-api tid:1.6 hipMemcpy ret= 0 (hipSuccess)>>
```
- `<<hip-api` is the header used for all HIP API debug messages. The message is also shown in a specific color. This can be used to distinguish this API from other HIP or application messages.
- `tid:1.6` indicates that this API call came from thread #1 and is the 6th API call in that thread. When the first API in a new thread is called, HIP will associates a short sequential ID with that thread. You can see the full thread ID (reported by C++) as 0x7f6183b097c0 in the example below.
- `hipMemcpy` is the name of the API.
- The first line then prints a comma-separated list of the arguments to the function. APIs which return values to the caller by writing to pointers will show the pointer addresses rather than the pointer contents. This behavior may change in the future.
- The second line shows the completion of the API, including the numeric return value (`ret= 0`) as well as an string representation for the error code (`hipSuccess`). If the returned error code is non-zero, then the csecond line message is shown in red (unless HIP_TRACE_API_COLOR is "none" - see below).
Heres a specific example showing the output of the [square](https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/0_Intro/square) program running on HIP:
```
$ HIP_TRACE_API=1 ./square.hip.out
hip-api tid:1:HIP initialized short_tid#1 (maps to full_tid: 0x7f6183b097c0)
<<hip-api tid:1.1 hipGetDeviceProperties (0x7ffddb673e08, 0)
hip-api tid:1.1 hipGetDeviceProperties ret= 0 (hipSuccess)>>
info: running on device gfx803
info: allocate host mem ( 7.63 MB)
info: allocate device mem ( 7.63 MB)
<<hip-api tid:1.2 hipMalloc (0x7ffddb673fb8, 4000000)
hip-api tid:1.2 hipMalloc ret= 0 (hipSuccess)>>
<<hip-api tid:1.3 hipMalloc (0x7ffddb673fb0, 4000000)
hip-api tid:1.3 hipMalloc ret= 0 (hipSuccess)>>
info: copy Host2Device
<<hip-api tid:1.4 hipMemcpy (0x50409d000, 0x7f32158ac010, 4000000, hipMemcpyHostToDevice)
hip-api tid:1.4 hipMemcpy ret= 0 (hipSuccess)>>
info: launch 'vector_square' kernel
1.5 hipLaunchKernel 'HIP_KERNEL_NAME(vector_square)' gridDim:{512,1,1} groupDim:{256,1,1} sharedMem:+0 stream#0.0
info: copy Device2Host
<<hip-api tid:1.6 hipMemcpy (0x7f32154db010, 0x50446e000, 4000000, hipMemcpyDeviceToHost)
hip-api tid:1.6 hipMemcpy ret= 0 (hipSuccess)>>
info: check result
PASSED!
```
HIP_TRACE_API supports multiple levels of debug information:
- 0x1 = print all HIP APIs. This is the most verbose setting; the flags below allow selecting a subset.
- 0x2 = print HIP APIs which initiate GPU kernel commands. Includes hipLaunchKernel, hipLaunchModuleKernel
- 0x4 = print HIP APIs which initiate GPU memory commands. Includes hipMemcpy*, hipMemset*.
- 0x8 = print HIP APIs which allocate or free memory. Includes hipMalloc, hipHostMalloc, hipFree, hipHostFree.
These can be combined. For example, HIP_TRACE_API=6 shows a concise view of the HIP commands (both kernel and memory) that are sent to the GPU.
#### Color
Note this trace mode uses colors. "less -r" can handle raw control characters and will display the debug output in proper colors.
You can change the color used for the trace mode with the HIP_TRACE_API_COLOR environment variable. Possible values are None/Red/Green/Yellow/Blue/Magenta/Cyan/White.
None will disable use of color control codes for both the opening and closing and may be useful when saving the trace file or when a pure text trace is desired.
+2 -1
Féach ar an gComhad
@@ -447,7 +447,8 @@ if len(sys.argv) > 3: OUTPUT = sys.argv[3]
# API declaration map
api_map = {
'hipHccModuleLaunchKernel': ''
'hipHccModuleLaunchKernel': '',
'hipExtModuleLaunchKernel': ''
}
# API options map
opts_map = {}
+19 -17
Féach ar an gComhad
@@ -42,10 +42,10 @@ After applying all the matchers, the output HIP source is produced.
`hipify-clang` requires:
1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1), the latest release candidate: [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1).
1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1), the latest release candidate: [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3).
2. [**CUDA**](https://developer.nvidia.com/cuda-downloads) of at least version [7.0](https://developer.nvidia.com/cuda-toolkit-70), the latest supported version is [**10.1 Update 2**](https://developer.nvidia.com/cuda-10.1-download-archive-base).
To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-downloads) please use the latest `LLVM` release candidate: [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1).
To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-downloads) please use the latest `LLVM` release candidate: [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3).
| **LLVM release version** | **CUDA latest supported version** | **Windows** | **Linux** |
|:----------------------------------------------------------:|:------------------------------------------------------------------------:|:-----------:|:---------:|
@@ -67,7 +67,7 @@ To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-download
| [8.0.1](http://releases.llvm.org/download.html#8.0.1) | [10.0](https://developer.nvidia.com/cuda-10.0-download-archive) | - <br/> not working due to <br/> the clang's bug [38811](https://bugs.llvm.org/show_bug.cgi?id=38811) <br/>+<br/>[patch](patches/patch_for_clang_8.0.1_bug_38811.zip)*</br> | + |
| [9.0.0](http://releases.llvm.org/download.html#9.0.0) | [10.1](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + | + |
| [**9.0.1**](http://releases.llvm.org/download.html#9.0.1) | [**10.1**](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + <br/> **LATEST STABLE RELEASE** | + <br/> **LATEST STABLE RELEASE** |
| [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1) | [10.2](https://developer.nvidia.com/cuda-downloads) | + | + |
| [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3) | [10.2](https://developer.nvidia.com/cuda-downloads) | + | + |
`*` Download the patch and unpack it into your `LLVM` distributive directory; a few header files will be overwritten; rebuilding of `LLVM` is not needed.
@@ -158,7 +158,7 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro
**LLVM 10.0.0 or newer:**
1. download [`LLVM project`](https://github.com/llvm/llvm-project/archive/llvmorg-10.0.0-rc1.tar.gz) sources;
1. download [`LLVM project`](https://github.com/llvm/llvm-project/archive/llvmorg-10.0.0-rc3.tar.gz) sources;
2. build [`LLVM project`](http://llvm.org/docs/CMake.html):
**Linux**:
@@ -168,6 +168,7 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro
-DLLVM_SOURCE_DIR=../llvm-project \
-DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \
-DLLVM_ENABLE_PROJECTS="clang" \
-DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON
-DCMAKE_BUILD_TYPE=Release \
../llvm-project/llvm
make -j install
@@ -181,6 +182,7 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro
-DLLVM_SOURCE_DIR=../llvm-project \
-DLLVM_TARGETS_TO_BUILD="NVPTX" \
-DLLVM_ENABLE_PROJECTS="clang" \
-DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON
-DCMAKE_BUILD_TYPE=Release \
-Thost=x64 \
../llvm-project/llvm
@@ -247,7 +249,7 @@ On Linux the following configurations are tested:
Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.5.32
Ubuntu 16-18: LLVM 8.0.0 - 10.0.0-rc1, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32
Ubuntu 16-18: LLVM 8.0.0 - 10.0.0-rc3, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32
Minimum build system requirements for the above configurations:
@@ -393,19 +395,19 @@ Testing Time: 3.07s
```
### <a name="windows"></a > hipify-clang: Windows
On Windows 10 the following configurations are tested:
*Tested configurations:*
LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn 5.1.10 - 7.1.4.18
| **LLVM** | **CUDA** | **cuDNN** | **Visual Studio** | **cmake** | **Python** |
|:--------------:|---------:|--------------------:|--------------------------:|----------:|-----------:|
| 5.0.0 - 5.0.2 | 8.0 | 5.1.10 - 7.1.4.18 | 2017.15.5.2 | 3.5.1 | 3.6.4 |
| 6.0.0 - 6.0.1 | 9.0 | 7.0.5.15 - 7.6.5.32 | 2017.15.5.5 | 3.6.0 | 3.7.2 |
| 7.0.0 - 7.1.0 | 9.2 | 7.6.5.32 | 2017.15.9.11 | 3.13.3 | 3.7.3 |
| 8.0.0 - 8.0.1 | 10.0 | 7.6.5.32 | 2017.15.9.15 | 3.14.2 | 3.7.4 |
| 9.0.0 - 9.0.1 | 10.1 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.4 | 3.8.0 |
| 10.0.0-rc1-rc3 | 10.2 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.4 | 3.8.1 |
| 11.0.0git | 10.2 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.5 | 3.8.2 |
LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn 7.0.5.15 - 7.6.5.32
LLVM 7.0.0 - 10.0.0-rc1, CUDA 7.5 - 10.2, cudnn 7.0.5.15 - 7.6.5.32
Build system requirements for the latest stable configuration LLVM 9.0.1/CUDA 10.1 Update 2:
Python 3.6.0 - 3.8.1, cmake 3.5.1 - 3.16.3, Visual Studio 2017 (15.5.2) - 2019 (16.4.4).
Here is an example of building `hipify-clang` with testing support on `Windows 10` by `Visual Studio 16 2019`:
*Building with testing support on `Windows 10` by `Visual Studio 16 2019`:*
```shell
cmake
@@ -429,7 +431,7 @@ cmake
-- - CMake module path: F:/LLVM/9.0.1/dist/lib/cmake/llvm
-- - Include path : F:/LLVM/9.0.1/dist/include
-- - Binary path : F:/LLVM/9.0.1/dist/bin
-- Found PythonInterp: C:/Program Files/Python38/python.exe (found suitable version "3.8.1", minimum required is "3.6")
-- Found PythonInterp: C:/Program Files/Python38/python.exe (found suitable version "3.8.2", minimum required is "3.6")
-- Found lit: C:/Program Files/Python38/Scripts/lit.exe
-- Found FileCheck: F:/LLVM/9.0.1/dist/bin/FileCheck.exe
-- Found CUDA: C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1 (found version "10.1")
@@ -545,9 +545,9 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DRIVER_FUNCTION_MAP{
// 5.21. Occupancy
// cudaOccupancyMaxActiveBlocksPerMultiprocessor
{"cuOccupancyMaxActiveBlocksPerMultiprocessor", {"hipOccupancyMaxActiveBlocksPerMultiprocessor", "", CONV_OCCUPANCY, API_DRIVER}},
{"cuOccupancyMaxActiveBlocksPerMultiprocessor", {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessor", "", CONV_OCCUPANCY, API_DRIVER}},
// cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
{"cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", {"hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "", CONV_OCCUPANCY, API_DRIVER}},
{"cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "", CONV_OCCUPANCY, API_DRIVER}},
// cudaOccupancyMaxPotentialBlockSize
{"cuOccupancyMaxPotentialBlockSize", {"hipOccupancyMaxPotentialBlockSize", "", CONV_OCCUPANCY, API_DRIVER}},
// cudaOccupancyMaxPotentialBlockSizeWithFlags
@@ -1047,7 +1047,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DRIVER_TYPE_NAME_MAP{
{"CUoccupancy_flags_enum", {"hipOccupancyFlags", "", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
// CUoccupancy_flags enum values
// cudaOccupancyDefault
{"CU_OCCUPANCY_DEFAULT", {"hipOccupancyDefault", "", CONV_NUMERIC_LITERAL, API_DRIVER, HIP_UNSUPPORTED}}, // 0x00
{"CU_OCCUPANCY_DEFAULT", {"hipOccupancyDefault", "", CONV_NUMERIC_LITERAL, API_DRIVER}}, // 0x00
// cudaOccupancyDisableCachingOverride
{"CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE", {"hipOccupancyDisableCachingOverride", "", CONV_NUMERIC_LITERAL, API_DRIVER, HIP_UNSUPPORTED}}, // 0x01
@@ -1284,6 +1284,8 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DRIVER_TYPE_NAME_MAP{
{"CUDA_ERROR_INVALID_PC", {"hipErrorInvalidPc", "", CONV_NUMERIC_LITERAL, API_DRIVER, HIP_UNSUPPORTED}}, // 718
// cudaErrorLaunchFailure
{"CUDA_ERROR_LAUNCH_FAILED", {"hipErrorLaunchFailure", "", CONV_NUMERIC_LITERAL, API_DRIVER}}, // 719
// cudaErrorCooperativeLaunchTooLarge
{"CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE", {"hipErrorCooperativeLaunchTooLarge", "", CONV_NUMERIC_LITERAL, API_DRIVER}}, // 720
// cudaErrorNotPermitted
{"CUDA_ERROR_NOT_PERMITTED", {"hipErrorNotPermitted", "", CONV_NUMERIC_LITERAL, API_DRIVER, HIP_UNSUPPORTED}}, // 800
// cudaErrorNotSupported
@@ -1598,9 +1600,9 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DRIVER_TYPE_NAME_MAP{
// cudaArrayColorAttachment
{"CUDA_ARRAY3D_COLOR_ATTACHMENT", {"hipArrayColorAttachment", "", CONV_DEFINE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x20
// cudaCooperativeLaunchMultiDeviceNoPreSync
{"CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC", {"hipCooperativeLaunchMultiDeviceNoPreSync", "", CONV_DEFINE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x01
{"CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC", {"hipCooperativeLaunchMultiDeviceNoPreSync", "", CONV_DEFINE, API_DRIVER}}, // 0x01
// cudaCooperativeLaunchMultiDeviceNoPostSync
{"CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC", {"hipCooperativeLaunchMultiDeviceNoPostSync", "", CONV_DEFINE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x02
{"CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC", {"hipCooperativeLaunchMultiDeviceNoPostSync", "", CONV_DEFINE, API_DRIVER}}, // 0x02
// cudaExternalMemoryDedicated
{"CUDA_EXTERNAL_MEMORY_DEDICATED", {"hipExternalMemoryDedicated", "", CONV_DEFINE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x1
// cudaExternalSemaphoreSignalSkipNvSciBufMemSync
@@ -787,8 +787,8 @@ const std::map<llvm::StringRef, hipCounter> CUDA_RUNTIME_TYPE_NAME_MAP {
{"cudaErrorInvalidPc", {"hipErrorInvalidPc", "", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 718
// CUDA_ERROR_LAUNCH_FAILED
{"cudaErrorLaunchFailure", {"hipErrorLaunchFailure", "", CONV_NUMERIC_LITERAL, API_RUNTIME}}, // 719
// no analogue
{"cudaErrorCooperativeLaunchTooLarge", {"hipErrorCooperativeLaunchTooLarge", "", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 720
// CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE
{"cudaErrorCooperativeLaunchTooLarge", {"hipErrorCooperativeLaunchTooLarge", "", CONV_NUMERIC_LITERAL, API_RUNTIME}}, // 720
// CUDA_ERROR_NOT_PERMITTED
{"cudaErrorNotPermitted", {"hipErrorNotPermitted", "", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 800
// CUDA_ERROR_NOT_SUPPORTED
@@ -1327,9 +1327,9 @@ const std::map<llvm::StringRef, hipCounter> CUDA_RUNTIME_TYPE_NAME_MAP {
// CUDA_ARRAY3D_COLOR_ATTACHMENT
{"cudaArrayColorAttachment", {"hipArrayColorAttachment", "", CONV_DEFINE, API_RUNTIME, HIP_UNSUPPORTED}}, // 0x20
// CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC
{"cudaCooperativeLaunchMultiDeviceNoPreSync", {"hipCooperativeLaunchMultiDeviceNoPreSync", "", CONV_DEFINE, API_RUNTIME, HIP_UNSUPPORTED}}, // 0x01
{"cudaCooperativeLaunchMultiDeviceNoPreSync", {"hipCooperativeLaunchMultiDeviceNoPreSync", "", CONV_DEFINE, API_RUNTIME}}, // 0x01
// CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC
{"cudaCooperativeLaunchMultiDeviceNoPostSync", {"hipCooperativeLaunchMultiDeviceNoPostSync", "", CONV_DEFINE, API_RUNTIME, HIP_UNSUPPORTED}}, // 0x02
{"cudaCooperativeLaunchMultiDeviceNoPostSync", {"hipCooperativeLaunchMultiDeviceNoPostSync", "", CONV_DEFINE, API_RUNTIME}}, // 0x02
// CU_DEVICE_CPU ((CUdevice)-1)
{"cudaCpuDeviceId", {"hipCpuDeviceId", "", CONV_DEFINE, API_RUNTIME, HIP_UNSUPPORTED}}, // ((int)-1)
// CU_DEVICE_INVALID ((CUdevice)-2)
@@ -1412,7 +1412,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_RUNTIME_TYPE_NAME_MAP {
// no analogue
{"cudaTextureTypeCubemapLayered", {"hipTextureTypeCubemapLayered", "", CONV_DEFINE, API_RUNTIME}}, // 0xFC
// CU_OCCUPANCY_DEFAULT
{"cudaOccupancyDefault", {"hipOccupancyDefault", "", CONV_DEFINE, API_RUNTIME, HIP_UNSUPPORTED}}, // 0x00
{"cudaOccupancyDefault", {"hipOccupancyDefault", "", CONV_DEFINE, API_RUNTIME}}, // 0x00
// CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE
{"cudaOccupancyDisableCachingOverride", {"hipOccupancyDisableCachingOverride", "", CONV_DEFINE, API_RUNTIME, HIP_UNSUPPORTED}}, // 0x01
// CU_STREAM_DEFAULT
+12 -8
Féach ar an gComhad
@@ -31,9 +31,11 @@ THE SOFTWARE.
#include <string>
#include <utility>
#include <vector>
#include <unordered_set>
namespace hip_impl {
#if !defined(DISABLE_REDUCED_GPU_BLOB_COPY)
std::unordered_set<std::string>& get_all_gpuarch();
#endif
inline
std::string transmogrify_triple(const std::string& triple)
{
@@ -43,7 +45,6 @@ std::string transmogrify_triple(const std::string& triple)
if (triple.find(old_prefix) == 0) {
return new_prefix + triple.substr(sizeof(old_prefix) - 1);
}
return (triple.find(new_prefix) == 0) ? triple : "";
}
@@ -114,9 +115,7 @@ class Bundled_code_header {
friend inline bool read(RandomAccessIterator f, RandomAccessIterator l,
Bundled_code_header& x) {
if (f == l) return false;
std::copy_n(f, sizeof(x.header_.cbuf_), x.header_.cbuf_);
if (valid(x)) {
x.bundles_.resize(x.header_.bundle_cnt_);
@@ -126,11 +125,16 @@ class Bundled_code_header {
it += sizeof(y.header.cbuf);
y.triple.assign(it, it + y.header.triple_sz);
#ifdef DISABLE_REDUCED_GPU_BLOB_COPY
std::copy_n(f + y.header.offset, y.header.bundle_sz, std::back_inserter(y.blob));
#else
auto& gpuArch = get_all_gpuarch();
auto itgpuArch = std::find(gpuArch.begin(),gpuArch.end(),y.triple);
if (itgpuArch != gpuArch.end()){
std::copy_n(f + y.header.offset, y.header.bundle_sz, std::back_inserter(y.blob));
}
#endif
it += y.header.triple_sz;
x.bundled_code_size = std::max(x.bundled_code_size,
y.header.offset + y.header.bundle_sz);
}
+134
Féach ar an gComhad
@@ -319,6 +319,41 @@ double __shfl(double var, int src_lane, int width = warpSize) {
double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
long __shfl(long var, int src_lane, int width = warpSize)
{
#ifndef _MSC_VER
static_assert(sizeof(long) == 2 * sizeof(int), "");
static_assert(sizeof(long) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl(tmp[0], src_lane, width);
tmp[1] = __shfl(tmp[1], src_lane, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
static_assert(sizeof(long) == sizeof(int), "");
return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
#endif
}
__device__
inline
long long __shfl(long long var, int src_lane, int width = warpSize)
{
static_assert(sizeof(long long) == 2 * sizeof(int), "");
static_assert(sizeof(long long) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl(tmp[0], src_lane, width);
tmp[1] = __shfl(tmp[1], src_lane, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
@@ -356,6 +391,39 @@ double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
{
#ifndef _MSC_VER
static_assert(sizeof(long) == 2 * sizeof(int), "");
static_assert(sizeof(long) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
static_assert(sizeof(long) == sizeof(int), "");
return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
#endif
}
__device__
inline
long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
{
static_assert(sizeof(long long) == 2 * sizeof(int), "");
static_assert(sizeof(long long) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
@@ -393,6 +461,39 @@ double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
{
#ifndef _MSC_VER
static_assert(sizeof(long) == 2 * sizeof(int), "");
static_assert(sizeof(long) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
static_assert(sizeof(long) == sizeof(int), "");
return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
#endif
}
__device__
inline
long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
{
static_assert(sizeof(long long) == 2 * sizeof(int), "");
static_assert(sizeof(long long) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
@@ -430,6 +531,39 @@ double __shfl_xor(double var, int lane_mask, int width = warpSize) {
double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
long __shfl_xor(long var, int lane_mask, int width = warpSize)
{
#ifndef _MSC_VER
static_assert(sizeof(long) == 2 * sizeof(int), "");
static_assert(sizeof(long) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
static_assert(sizeof(long) == sizeof(int), "");
return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
#endif
}
__device__
inline
long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
{
static_assert(sizeof(long long) == 2 * sizeof(int), "");
static_assert(sizeof(long long) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00
+3 -6
Féach ar an gComhad
@@ -255,14 +255,14 @@ typedef struct hipMemcpy3DParms {
hipArray_t srcArray;
struct hipPos srcPos;
struct hipPitchedPtr srcPtr;
hipArray_t dstArray;
struct hipPos dstPos;
struct hipPitchedPtr dstPtr;
struct hipExtent extent;
enum hipMemcpyKind kind;
} hipMemcpy3DParms;
typedef struct HIP_MEMCPY3D {
size_t Depth;
size_t Height;
size_t WidthInBytes;
@@ -283,10 +283,7 @@ typedef struct hipMemcpy3DParms {
size_t srcLOD;
hipMemoryType srcMemoryType;
size_t srcPitch;
size_t srcXInBytes;
size_t srcY;
size_t srcZ;
}hipMemcpy3DParms;
} HIP_MEMCPY3D;
static inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t xsz,
size_t ysz) {
+32 -4
Féach ar an gComhad
@@ -212,6 +212,11 @@ enum hipLimit_t {
#define hipArrayCubemap 0x04
#define hipArrayTextureGather 0x08
#define hipOccupancyDefault 0x00
#define hipCooperativeLaunchMultiDeviceNoPreSync 0x01
#define hipCooperativeLaunchMultiDeviceNoPostSync 0x02
/*
* @brief hipJitOption
* @enum
@@ -2903,7 +2908,7 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigne
* @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th
* default stream is used with associated synchronization rules.
*
* @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
* @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue, hipErrorCooperativeLaunchTooLarge
*/
hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX,
void** kernelParams, unsigned int sharedMemBytes,
@@ -2917,7 +2922,7 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim
* @param [in] numDevices Size of the launchParamsList array.
* @param [in] flags Flags to control launch behavior.
*
* @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
* @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue, hipErrorCooperativeLaunchTooLarge
*/
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
int numDevices, unsigned int flags);
@@ -2954,13 +2959,36 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
* @brief Returns occupancy for a device function.
*
* @param [out] numBlocks Returned occupancy
* @param [in] func Kernel function for which occupancy is calulated
* @param [in] func Kernel function (hipFunction) for which occupancy is calulated
* @param [in] blockSize Block size the kernel is intended to be launched with
* @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
*/
hipError_t hipDrvOccupancyMaxActiveBlocksPerMultiprocessor(
int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk);
/**
* @brief Returns occupancy for a device function.
*
* @param [out] numBlocks Returned occupancy
* @param [in] f Kernel function for which occupancy is calulated
* @param [in] blockSize Block size the kernel is intended to be launched with
* @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
* @param [in] flags Extra flags for occupancy calculation (currently ignored)
*/
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags __dparm(hipOccupancyDefault));
/**
* @brief Returns occupancy for a device function.
*
* @param [out] numBlocks Returned occupancy
* @param [in] f Kernel function(hipFunction_t) for which occupancy is calulated
* @param [in] blockSize Block size the kernel is intended to be launched with
* @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
* @param [in] flags Extra flags for occupancy calculation (currently ignored)
*/
hipError_t hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
/**
* @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched
+20 -46
Féach ar an gComhad
@@ -19,10 +19,14 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#ifndef HIPRTC_H
#define HIPRTC_H
#include <cstddef>
#include <string>
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
#include <stdlib.h>
enum hiprtcResult {
HIPRTC_SUCCESS = 0,
@@ -41,29 +45,22 @@ enum hiprtcResult {
const char* hiprtcGetErrorString(hiprtcResult result);
inline
hiprtcResult hiprtcVersion(int* major, int* minor) noexcept
{ // TODO: NVRTC versioning is somewhat unclear.
if (!major || !minor) return HIPRTC_ERROR_INVALID_INPUT;
// TODO: this should be generic / set by the build infrastructure.
*major = 9;
*minor = 0;
hiprtcResult hiprtcVersion(int* major, int* minor);
return HIPRTC_SUCCESS;
}
struct _hiprtcProgram;
using hiprtcProgram = _hiprtcProgram*;
typedef struct _hiprtcProgram* hiprtcProgram;
hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog,
const char* name_expression);
hiprtcResult hiprtcCompileProgram(hiprtcProgram prog, int numOptions,
hiprtcResult hiprtcCompileProgram(hiprtcProgram prog,
int numOptions,
const char** options);
hiprtcResult hiprtcCreateProgram(hiprtcProgram* prog, const char* src,
const char* name, int numHeaders,
hiprtcResult hiprtcCreateProgram(hiprtcProgram* prog,
const char* src,
const char* name,
int numHeaders,
const char** headers,
const char** includeNames);
@@ -76,37 +73,14 @@ hiprtcResult hiprtcGetLoweredName(hiprtcProgram prog,
hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log);
hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog,
std::size_t* logSizeRet);
size_t* logSizeRet);
hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code);
hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, std::size_t* codeSizeRet);
hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet);
namespace hip_impl
{
char* demangle(const char* mangled_expression);
#ifdef __cplusplus
}
#endif /* __cplusplus */
#if defined(HIPRTC_GET_TYPE_NAME)
#include <typeinfo>
#if defined(_WIN32)
#include <dbghelp.h>
template<typename>
hiprtcResult hiprtcGetTypeName(std::string*) = delete;
#else
template<typename T>
inline
hiprtcResult hiprtcGetTypeName(std::string* result)
{
if (!result) return HIPRTC_ERROR_INVALID_INPUT;
char * res= hip_impl::demangle(typeid(T).name());
result->assign(res == nullptr ? "" : res);
std::free(res);
return (result->empty()) ? HIPRTC_ERROR_INTERNAL_ERROR :
HIPRTC_SUCCESS;
}
#endif
#endif
#endif //HIPRTC_H
-18
Féach ar an gComhad
@@ -1,16 +1,13 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -23,23 +20,8 @@ THE SOFTWARE.
#ifndef HIP_INCLUDE_HIP_HIP_PROFILE_H
#define HIP_INCLUDE_HIP_HIP_PROFILE_H
#warning "HIP Profiling through markers is deprecated, please check roctrace/rocTX support."
#if not defined(ENABLE_HIP_PROFILE)
#define ENABLE_HIP_PROFILE 1
#endif
#if defined(__HIP_PLATFORM_HCC__) and (ENABLE_HIP_PROFILE == 1)
#warning "HIP Markers are deprecated and would be removed soon."
#include <CXLActivityLogger.h>
#define HIP_SCOPED_MARKER(markerName, group) \
amdtScopedMarker __scopedMarker(markerName, group, nullptr);
#define HIP_BEGIN_MARKER(markerName, group) amdtBeginMarker(markerName, group, nullptr);
#define HIP_END_MARKER() amdtEndMarker();
#else
#define HIP_SCOPED_MARKER(markerName, group)
#define HIP_BEGIN_MARKER(markerName, group)
#define HIP_END_MARKER()
#endif
#endif
+4
Féach ar an gComhad
@@ -255,6 +255,10 @@ typedef enum __HIP_NODISCARD hipError_t {
713, ///< Produced when trying to unlock a non-page-locked memory.
hipErrorLaunchFailure =
719, ///< An exception occurred on the device while executing a kernel.
hipErrorCooperativeLaunchTooLarge =
720, ///< This error indicates that the number of blocks launched per grid for a kernel
///< that was launched via cooperative launch APIs exceeds the maximum number of
///< allowed blocks for the current device
hipErrorNotSupported = 801, ///< Produced when the hip API is not supported/implemented
hipErrorUnknown = 999, //< Unknown error.
// HSA Runtime Error Codes start here.
+14
Féach ar an gComhad
@@ -140,6 +140,14 @@ typedef enum cudaChannelFormatKind hipChannelFormatKind;
#define hipLimitMallocHeapSize cudaLimitMallocHeapSize
#define hipIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess
#define hipOccupancyDefault cudaOccupancyDefault
#define hipCooperativeLaunchMultiDeviceNoPreSync \
cudaCooperativeLaunchMultiDeviceNoPreSync
#define hipCooperativeLaunchMultiDeviceNoPostSync \
cudaCooperativeLaunchMultiDeviceNoPostSync
// enum CUjit_option redefines
#define hipJitOptionMaxRegisters CU_JIT_MAX_REGISTERS
#define hipJitOptionThreadsPerBlock CU_JIT_THREADS_PER_BLOCK
@@ -267,6 +275,8 @@ inline static hipError_t hipCUDAErrorTohipError(cudaError_t cuError) {
return hipErrorNotInitialized;
case cudaErrorLaunchFailure:
return hipErrorLaunchFailure;
case cudaErrorCooperativeLaunchTooLarge:
return hipErrorCooperativeLaunchTooLarge;
case cudaErrorPriorLaunchFailure:
return hipErrorPriorLaunchFailure;
case cudaErrorLaunchOutOfResources:
@@ -445,6 +455,8 @@ inline static hipError_t hipCUResultTohipError(CUresult cuError) {
return hipErrorHostMemoryNotRegistered;
case CUDA_ERROR_LAUNCH_FAILED:
return hipErrorLaunchFailure;
case CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE:
return hipErrorCooperativeLaunchTooLarge;
case CUDA_ERROR_NOT_SUPPORTED:
return hipErrorNotSupported;
case CUDA_ERROR_UNKNOWN:
@@ -601,6 +613,8 @@ inline static cudaError_t hipErrorToCudaError(hipError_t hError) {
return cudaErrorSetOnActiveProcess;
case hipErrorLaunchFailure:
return cudaErrorLaunchFailure;
case hipErrorCooperativeLaunchTooLarge:
return cudaErrorCooperativeLaunchTooLarge;
case hipErrorNotSupported:
return cudaErrorNotSupported;
// HSA: does not exist in CUDA
+2 -2
Féach ar an gComhad
@@ -26,7 +26,7 @@ target_include_directories(ca PUBLIC ${PROJECT_SOURCE_DIR}/src)
find_library(
hsart NAMES libhsa-runtime64.so libhsa-runtime64.so.1 HINTS ${HSA_PATH}/lib)
target_link_libraries(ca PUBLIC ${hsart})
target_compile_options(ca PUBLIC -Wall)
target_compile_options(ca PUBLIC -DDISABLE_REDUCED_GPU_BLOB_COPY -Wall)
install(TARGETS ca RUNTIME DESTINATION bin)
#-------------------------------------CA---------------------------------------#
#-------------------------------------CA---------------------------------------#
+2 -10
Féach ar an gComhad
@@ -36,11 +36,7 @@ set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}
set(CPACK_GENERATOR "TGZ;DEB;RPM")
set(CPACK_BINARY_DEB "ON")
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
if(@COMPILE_HIP_ATP_MARKER@)
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), rocm-profiler, comgr (>= 1.1)")
else()
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)")
endif()
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)")
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_hcc")
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_hcc")
set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_hcc")
@@ -50,11 +46,7 @@ set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
if(@COMPILE_HIP_ATP_MARKER@)
set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, rocm-profiler, comgr >= 1.1")
else()
set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, comgr >= 1.1")
endif()
set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, comgr >= 1.1")
set(CPACK_RPM_PACKAGE_OBSOLETES "hip_hcc")
set(CPACK_RPM_PACKAGE_CONFLICTS "hip_hcc")
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
+6 -5
Féach ar an gComhad
@@ -4,16 +4,17 @@ ifeq (,$(HIP_PATH))
endif
HIPCC=$(HIP_PATH)/bin/hipcc -std=c++11
EXE=hipDispatchLatency
CXXFLAGS = -O3
all: test_kernel.code ${EXE}
all: test_kernel.code hipDispatchLatency.out hipDispatchEnqueueRateMT.out
$(EXE): hipDispatchLatency.cpp
hipDispatchLatency.out: hipDispatchLatency.cpp
$(HIPCC) $(CXXFLAGS) hipDispatchLatency.cpp -o $@
hipDispatchEnqueueRateMT.out: hipDispatchEnqueueRateMT.cpp
$(HIPCC) $(CXXFLAGS) hipDispatchEnqueueRateMT.cpp -o $@
test_kernel.code: test_kernel.cpp
$(HIP_PATH)/bin/hipcc --genco $(GENCO_FLAGS) $^ -o $@
clean:
rm -f *.o $(EXE)
rm -f *.o *.out
@@ -0,0 +1,167 @@
/*
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <stdio.h>
#include "hip/hip_runtime.h"
#ifdef __HIP_PLATFORM_HCC__
#include "hip/hip_ext.h"
#endif
#include <iostream>
#include <chrono>
#include <algorithm>
#include <atomic>
#include <thread>
#include <future>
#include <functional>
#define NUM_GROUPS 1
#define GROUP_SIZE 1
#define WARMUP_RUN_COUNT 10
#define TIMING_RUN_COUNT 100
#define TOTAL_RUN_COUNT WARMUP_RUN_COUNT + TIMING_RUN_COUNT
__global__ void EmptyKernel() {}
// Helper to print various timing metrics
void print_timing(std::string test, std::array<float, TOTAL_RUN_COUNT> &results, int batch = 1)
{
float total_us = 0.0f, mean_us = 0.0f, stddev_us = 0.0f;
// remove top outliers due to nature of variability across large number of multi-threaded runs
std::sort(results.begin(), results.end(), std::greater<float>());
auto start_iter = std::next(results.begin(), WARMUP_RUN_COUNT);
auto end_iter = results.end();
// mean
std::for_each(start_iter, end_iter, [&](const float &run_ms) {
total_us += (run_ms * 1000) / batch;
});
mean_us = total_us / TIMING_RUN_COUNT;
// stddev
total_us = 0;
std::for_each(start_iter, end_iter, [&](const float &run_ms) {
float dev_us = ((run_ms * 1000) / batch) - mean_us;
total_us += dev_us * dev_us;
});
stddev_us = sqrt(total_us / TIMING_RUN_COUNT);
printf("\n %s: %.1f us, std: %.1f us\n", test.c_str(), mean_us, stddev_us);
}
// Measure time taken to enqueue a kernel on the GPU using hipModuleLaunchKernel
void hipModuleLaunchKernel_enqueue_rate(std::atomic_int* shared, int max_threads)
{
//resources necessary for this thread
hipStream_t stream;
hipStreamCreate(&stream);
hipModule_t module;
hipFunction_t function;
hipModuleLoad(&module, "test_kernel.code");
hipModuleGetFunction(&function, module, "test");
void* kernel_params = nullptr;
std::array<float, TOTAL_RUN_COUNT> results;
//synchronize all threads, before running
int tid = shared->fetch_add(1, std::memory_order_release);
while (max_threads != shared->load(std::memory_order_acquire)) {}
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
auto start = std::chrono::high_resolution_clock::now();
hipModuleLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, stream, &kernel_params, nullptr);
auto stop = std::chrono::high_resolution_clock::now();
results[i] = std::chrono::duration<double, std::milli>(stop - start).count();
}
print_timing("Thread ID : " + std::to_string(tid) + " , " + "hipModuleLaunchKernel enqueue rate", results);
}
// Measure time taken to enqueue a kernel on the GPU using hipLaunchKernelGGL
void hipLaunchKernelGGL_enqueue_rate(std::atomic_int* shared, int max_threads)
{
//resources necessary for this thread
hipStream_t stream;
hipStreamCreate(&stream);
std::array<float, TOTAL_RUN_COUNT> results;
//synchronize all threads, before running
int tid = shared->fetch_add(1, std::memory_order_release);
while (max_threads != shared->load(std::memory_order_acquire)) {}
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
auto start = std::chrono::high_resolution_clock::now();
hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream);
auto stop = std::chrono::high_resolution_clock::now();
results[i] = std::chrono::duration<double, std::milli>(stop - start).count();
}
print_timing("Thread ID : " + std::to_string(tid) + " , " + "hipLaunchKernelGGL enqueue rate", results);
}
// Simple thread pool
struct thread_pool {
thread_pool(int total_threads) : max_threads(total_threads) {}
void start(std::function<void(std::atomic_int*, int)> f) {
for (int i = 0; i < max_threads; ++i) {
threads.push_back(std::async(std::launch::async, f, &shared, max_threads));
}
}
void finish() {
for (auto&&thread : threads) {
thread.get();
}
threads.clear();
shared = {0};
}
~thread_pool() {
finish();
}
private:
std::atomic_int shared {0};
std::vector<std::future<void>> threads;
int max_threads = 1;
};
int main(int argc, char* argv[])
{
if (argc != 3) {
std::cerr << "Run test as 'hipDispatchEnqueueRateMT <num_threads> <0-hipModuleLaunchKernel /1-hipLaunchKernelGGL>'\n";
return -1;
}
int max_threads = atoi(argv[1]);
int run_module_test = atoi(argv[2]);
if(max_threads < 1 || run_module_test < 0 || run_module_test > 1) {
std::cerr << "Invalid Input.\n";
std::cerr << "Run test as 'hipDispatchEnqueueRateMT <num_threads> <0-hipModuleLaunchKernel /1-hipLaunchKernelGGL>'\n";
return -1;
}
thread_pool task(max_threads);
if(run_module_test == 0) {
task.start(hipModuleLaunchKernel_enqueue_rate);
task.finish();
} else {
task.start(hipLaunchKernelGGL_enqueue_rate);
task.finish();
}
return 0;
}
+14 -43
Féach ar an gComhad
@@ -39,6 +39,7 @@ THE SOFTWARE.
#include <algorithm>
#include <atomic>
#include <mutex>
#include <unordered_set>
#include <hc.hpp>
#include <hc_am.hpp>
@@ -71,7 +72,6 @@ int HIP_API_BLOCKING = 0;
int HIP_PRINT_ENV = 0;
int HIP_TRACE_API = 0;
std::string HIP_TRACE_API_COLOR("green");
int HIP_PROFILE_API = 0;
// TODO - DB_START/STOP need more testing.
std::string HIP_DB_START_API;
@@ -149,12 +149,10 @@ uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::string& a
if ((tid < g_dbStartTriggers.size()) && (apiSeqNum >= g_dbStartTriggers[tid].nextTrigger())) {
printf("info: resume profiling at %lu\n", apiSeqNum);
RESUME_PROFILING;
g_dbStartTriggers.pop_back();
};
if ((tid < g_dbStopTriggers.size()) && (apiSeqNum >= g_dbStopTriggers[tid].nextTrigger())) {
printf("info: stop profiling at %lu\n", apiSeqNum);
STOP_PROFILING;
g_dbStopTriggers.pop_back();
};
@@ -1294,9 +1292,6 @@ void HipReadEnv() {
"executes.");
READ_ENV_S(release, HIP_TRACE_API_COLOR, 0,
"Color to use for HIP_API. None/Red/Green/Yellow/Blue/Magenta/Cyan/White");
READ_ENV_I(release, HIP_PROFILE_API, 0,
"Add HIP API markers to ATP file generated with CodeXL. 0x1=short API name, "
"0x2=full API name including args.");
READ_ENV_S(release, HIP_DB_START_API, 0,
"Comma-separated list of tid.api_seq_num for when to start debug and profiling.");
READ_ENV_S(release, HIP_DB_STOP_API, 0,
@@ -1372,14 +1367,6 @@ void HipReadEnv() {
HIP_DB |= 0x1;
}
if (HIP_PROFILE_API && !COMPILE_HIP_ATP_MARKER) {
fprintf(stderr,
"warning: env var HIP_PROFILE_API=0x%x but COMPILE_HIP_ATP_MARKER=0. (perhaps "
"enable COMPILE_HIP_ATP_MARKER in src code before compiling?)\n",
HIP_PROFILE_API);
HIP_PROFILE_API = 0;
}
if (HIP_DB) {
fprintf(stderr, "HIP_DB=0x%x [%s]\n", HIP_DB, HIP_DB_string(HIP_DB).c_str());
}
@@ -1423,11 +1410,6 @@ void HipReadEnv() {
// This function creates a vector with only the GPU accelerators.
// It is called with C++11 call_once, which provided thread-safety.
void ihipInit() {
#if COMPILE_HIP_ATP_MARKER
amdtInitializeActivityLogger();
amdtScopedMarker("ihipInit", "HIP", NULL);
#endif
HipReadEnv();
@@ -1617,7 +1599,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream, bool lockAcquired) {
void ihipPrintKernelLaunch(const char* kernelName, const grid_launch_parm* lp,
const hipStream_t stream) {
if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || HIP_PROFILE_API ||
if ((HIP_TRACE_API & (1 << TRACE_KCMD)) ||
(COMPILE_HIP_DB & HIP_TRACE_API)) {
GET_TLS();
std::stringstream os;
@@ -1630,14 +1612,6 @@ void ihipPrintKernelLaunch(const char* kernelName, const grid_launch_parm* lp,
std::string fullStr;
recordApiTrace(tls, &fullStr, os.str());
}
if (HIP_PROFILE_API == 0x1) {
std::string shortAtpString("hipLaunchKernel:");
shortAtpString += kernelName;
MARKER_BEGIN(shortAtpString.c_str(), "HIP");
} else if (HIP_PROFILE_API == 0x2) {
MARKER_BEGIN(os.str().c_str(), "HIP");
}
}
}
@@ -1696,9 +1670,6 @@ void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launc
tprintf(DB_SYNC, "ihipPostLaunchKernel, unlocking stream\n");
stream->lockclose_postKernelCommand(kernelName, lp.av, unlockPostponed);
if (HIP_PROFILE_API) {
MARKER_END();
}
}
//=================================================================================================
@@ -1796,6 +1767,8 @@ const char* ihipErrorString(hipError_t hip_error) {
return "hipErrorMissingConfiguration";
case hipErrorLaunchFailure:
return "hipErrorLaunchFailure";
case hipErrorCooperativeLaunchTooLarge:
return "hipErrorCooperativeLaunchTooLarge";
case hipErrorPriorLaunchFailure:
return "hipErrorPriorLaunchFailure";
case hipErrorLaunchTimeOut:
@@ -2478,29 +2451,17 @@ bool ihipStream_t::locked_copy2DAsync(void* dst, const void* src, size_t width,
return retStatus;
}
//-------------------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------
// Profiler, really these should live elsewhere:
hipError_t hipProfilerStart() {
HIP_INIT_API(hipProfilerStart);
#if COMPILE_HIP_ATP_MARKER
amdtResumeProfiling(AMDT_ALL_PROFILING);
#endif
return ihipLogStatus(hipSuccess);
};
hipError_t hipProfilerStop() {
HIP_INIT_API(hipProfilerStop);
#if COMPILE_HIP_ATP_MARKER
amdtStopProfiling(AMDT_ALL_PROFILING);
#endif
return ihipLogStatus(hipSuccess);
};
//-------------------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------
// HCC-specific accessor functions:
@@ -2540,6 +2501,16 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view** a
// TODO - add a contect sequence number for debug. Print operator<< ctx:0.1 (device.ctx)
namespace hip_impl {
std::unordered_set<std::string>& get_all_gpuarch() {
static std::unordered_set<std::string> r{};
static std::once_flag init;
std::call_once(init, []() {
for (int i=0; i < g_deviceCnt; i++){
r.insert("hcc-amdgcn-amd-amdhsa--gfx"+std::to_string(g_deviceArray[i]->_props.gcnArch));
}});
return r;
}
std::vector<hsa_agent_t> all_hsa_agents() {
std::vector<hsa_agent_t> r{};
std::vector<hc::accelerator> visible_accelerators;
+2 -39
Féach ar an gComhad
@@ -63,7 +63,6 @@ extern int HIP_LAUNCH_BLOCKING;
extern int HIP_API_BLOCKING;
extern int HIP_PRINT_ENV;
extern int HIP_PROFILE_API;
// extern int HIP_TRACE_API;
extern int HIP_ATP;
extern int HIP_DB;
@@ -250,34 +249,6 @@ extern const char* API_COLOR_END;
// Must be enabled at runtime with HIP_TRACE_API
#define COMPILE_HIP_TRACE_API 0x3
// Compile code that generates trace markers for CodeXL ATP at HIP function begin/end.
// ATP is standard CodeXL format that includes timestamps for kernels, HSA RT APIs, and HIP APIs.
#ifndef COMPILE_HIP_ATP_MARKER
#define COMPILE_HIP_ATP_MARKER 0
#endif
// Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function
// boundary.
// TODO - currently we print the trace message at the beginning. if we waited, we could also
// tls->tidInfo return codes, and any values returned through ptr-to-args (ie the pointers allocated
// by hipMalloc).
#if COMPILE_HIP_ATP_MARKER
#include "CXLActivityLogger.h"
#define MARKER_BEGIN(markerName, group) amdtBeginMarker(markerName, group, nullptr);
#define MARKER_END() amdtEndMarker();
#define RESUME_PROFILING amdtResumeProfiling(AMDT_ALL_PROFILING);
#define STOP_PROFILING amdtStopProfiling(AMDT_ALL_PROFILING);
#else
// Swallow scoped markers:
#define MARKER_BEGIN(markerName, group)
#define MARKER_END()
#define RESUME_PROFILING
#define STOP_PROFILING
#endif
//---
// HIP Trace modes - use with HIP_TRACE_API=...
#define TRACE_ALL 0 // 0x01
@@ -336,22 +307,17 @@ static inline uint64_t getTicks() { return hc::get_system_ticks(); }
//---
extern uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::string& apiStr);
#if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
#if (COMPILE_HIP_TRACE_API & 0x1)
#define API_TRACE(forceTrace, ...) \
GET_TLS(); \
uint64_t hipApiStartTick = 0; \
{ \
tls->tidInfo.incApiSeqNum(); \
if (forceTrace || \
(HIP_PROFILE_API || (COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL))))) { \
(COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL)))) { \
std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')'; \
std::string fullStr; \
hipApiStartTick = recordApiTrace(tls, &fullStr, apiStr); \
if (HIP_PROFILE_API == 0x1) { \
MARKER_BEGIN(__func__, "HIP") \
} else if (HIP_PROFILE_API == 0x2) { \
MARKER_BEGIN(fullStr.c_str(), "HIP"); \
} \
} \
}
@@ -398,9 +364,6 @@ extern uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::st
tls->tidInfo.apiSeqNum(), __func__, localHipStatus, \
ihipErrorString(localHipStatus), ticks, API_COLOR_END); \
} \
if (HIP_PROFILE_API) { \
MARKER_END(); \
} \
localHipStatus; \
})
+175 -160
Féach ar an gComhad
@@ -37,7 +37,6 @@ __device__ uint32_t __hip_device_page_flag[__HIP_NUM_PAGES];
namespace hip_internal {
namespace {
inline
const char* hsa_to_string(hsa_status_t err) noexcept
{
@@ -140,6 +139,8 @@ namespace {
return r;
}()};
constexpr std::uint32_t is_cpu_owned{UINT32_MAX};
inline
hsa_amd_pointer_info_t info(const void* p)
{
@@ -149,13 +150,14 @@ namespace {
const_cast<void*>(p), &r, nullptr, nullptr, nullptr),
__FILE__, __func__, __LINE__);
r.size = is_large_BAR || (type(r.agentOwner) == HSA_DEVICE_TYPE_CPU) ?
UINT32_MAX : sizeof(hsa_amd_pointer_info_t);
if (type(r.agentOwner) == HSA_DEVICE_TYPE_CPU) r.size = is_cpu_owned;
return r;
}
constexpr size_t staging_sz{4 * 1024 * 1024}; // 2 Pages.
constexpr size_t staging_sz{4 * 1024 * 1024}; // 2 Pages.
constexpr size_t max_h2d_std_memcpy_sz{8 * 1024}; // 8 KiB.
constexpr size_t max_d2h_std_memcpy_sz{64}; // 1 cacheline.
thread_local const std::unique_ptr<void, void (*)(void *)> staging_buffer{
[]() {
@@ -202,7 +204,7 @@ namespace {
} // Unnamed namespace.
inline
void do_copy(void* __restrict dst, const void* __restrict src, std::size_t n,
void do_copy(void* __restrict dst, const void* __restrict src, size_t n,
hsa_agent_t da, hsa_agent_t sa) {
hsa_signal_silent_store_relaxed(copy_signal, 1);
throwing_result_check(
@@ -224,14 +226,20 @@ void do_std_memcpy(
inline
void d2h_copy(void* __restrict dst, const void* __restrict src, size_t n,
hsa_amd_pointer_info_t si) {
// TODO: characterise direct largeBAR reads from agent-allocated memory.
// if (si.size == UINT32_MAX) {
// return do_std_memcpy(dst, src, n);
// }
const auto di{info(dst)};
const auto is_locked{di.type == HSA_EXT_POINTER_TYPE_LOCKED};
if (di.type == HSA_EXT_POINTER_TYPE_LOCKED) {
if (!is_locked && si.size == is_cpu_owned) {
return do_std_memcpy(dst, src, n);
}
if (!is_locked && is_large_BAR && n <= max_d2h_std_memcpy_sz) {
return do_std_memcpy(dst, src, n);
}
if (di.type == HSA_EXT_POINTER_TYPE_HSA) {
return do_copy(dst, src, n, si.agentOwner, si.agentOwner);
}
if (is_locked) {
dst = static_cast<char*>(di.agentBaseAddress) +
(static_cast<char*>(dst) -
static_cast<char*>(di.hostBaseAddress));
@@ -247,7 +255,7 @@ void d2h_copy(void* __restrict dst, const void* __restrict src, size_t n,
throwing_result_check(hsa_amd_memory_lock(dst, n, &si.agentOwner, 1,
const_cast<void**>(&dst)),
__FILE__, __func__, __LINE__);
__FILE__, __func__, __LINE__);
do_copy(dst, src, n, si.agentOwner, si.agentOwner);
}
@@ -256,16 +264,23 @@ void d2h_copy(void* __restrict dst, const void* __restrict src, size_t n,
inline
void h2d_copy(void* __restrict dst, const void* __restrict src, size_t n,
hsa_amd_pointer_info_t di) {
if (di.size == UINT32_MAX) {
const auto si{info(const_cast<void*>(src))};
const auto is_locked{si.type == HSA_EXT_POINTER_TYPE_LOCKED};
if (!is_locked && di.size == is_cpu_owned) {
return do_std_memcpy(dst, src, n);
}
if (!is_locked && is_large_BAR && n <= max_h2d_std_memcpy_sz) {
return do_std_memcpy(dst, src, n);
}
if (si.type == HSA_EXT_POINTER_TYPE_HSA) {
return do_copy(dst, src, n, di.agentOwner, di.agentOwner);
}
const auto si{info(const_cast<void*>(src))};
if (si.type == HSA_EXT_POINTER_TYPE_LOCKED) {
if (is_locked) {
src = static_cast<char*>(si.agentBaseAddress) +
(static_cast<const char*>(src) -
static_cast<char*>(si.hostBaseAddress));
(static_cast<const char*>(src) -
static_cast<char*>(si.hostBaseAddress));
do_copy(dst, src, n, di.agentOwner, di.agentOwner);
}
else if (n <= staging_sz) {
@@ -279,7 +294,7 @@ void h2d_copy(void* __restrict dst, const void* __restrict src, size_t n,
throwing_result_check(hsa_amd_memory_lock(const_cast<void*>(src), n,
&di.agentOwner, 1,
const_cast<void**>(&src)),
__FILE__, __func__, __LINE__);
__FILE__, __func__, __LINE__);
do_copy(dst, src, n, di.agentOwner, di.agentOwner);
}
@@ -288,69 +303,36 @@ void h2d_copy(void* __restrict dst, const void* __restrict src, size_t n,
inline
void generic_copy(void* __restrict dst, const void* __restrict src, size_t n,
hsa_amd_pointer_info_t di, hsa_amd_pointer_info_t si) {
if (di.size == UINT32_MAX && si.size == UINT32_MAX) {
if (di.size == is_cpu_owned && si.size == is_cpu_owned) {
return do_std_memcpy(dst, src, n);
}
if (di.size == is_cpu_owned) return d2h_copy(dst, src, n, si);
if (si.size == is_cpu_owned) return h2d_copy(dst, src, n, di);
std::unique_ptr<void, void (*)(void*)> lck0{
nullptr, [](void* p) { hsa_amd_memory_unlock(p); }};
std::unique_ptr<void, void (*)(void*)> lck1{nullptr, lck0.get_deleter()};
throwing_result_check(hsa_amd_agents_allow_access(1u, &si.agentOwner,
nullptr,
di.agentBaseAddress),
__FILE__, __func__, __LINE__);
switch (si.type) {
case HSA_EXT_POINTER_TYPE_HSA:
if (di.type == HSA_EXT_POINTER_TYPE_HSA) {
hsa_memory_copy(dst, src, n);
return; // TODO: do_copy(dst, src, n, di.agentOwner, si.agentOwner);
}
if (di.type == HSA_EXT_POINTER_TYPE_UNKNOWN ||
di.type == HSA_EXT_POINTER_TYPE_LOCKED) {
return d2h_copy(dst, src, n, si);
}
break;
case HSA_EXT_POINTER_TYPE_LOCKED:
if (di.type == HSA_EXT_POINTER_TYPE_UNKNOWN) {
std::memcpy(dst, si.hostBaseAddress, n);
return;
}
if (di.type == HSA_EXT_POINTER_TYPE_LOCKED) {
std::memcpy(di.hostBaseAddress, si.hostBaseAddress, n);
return;
}
src = si.agentBaseAddress;
si.agentOwner = di.agentOwner;
break;
case HSA_EXT_POINTER_TYPE_UNKNOWN:
if (di.type == HSA_EXT_POINTER_TYPE_UNKNOWN) {
std::memcpy(dst, src, n);
return;
}
if (di.type == HSA_EXT_POINTER_TYPE_LOCKED) {
std::memcpy(di.hostBaseAddress, src, n);
return;
}
return h2d_copy(dst, src, n, di);
default: do_copy(dst, src, n, di.agentOwner, si.agentOwner); break;
}
return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
}
inline
void memcpy_impl(void* __restrict dst, const void* __restrict src, size_t n,
hipMemcpyKind k) noexcept {
hipMemcpyKind k) {
switch (k) {
case hipMemcpyHostToHost: std::memcpy(dst, src, n); break;
case hipMemcpyHostToDevice:
return is_large_BAR ? do_std_memcpy(dst, src, n)
: h2d_copy(dst, src, n, info(dst));
case hipMemcpyDeviceToHost:
// TODO: characterise direct largeBAR reads from agent-allocated memory.
return /*is_large_BAR ? do_std_memcpy(dst, src, n)
: */d2h_copy(dst, src, n, info(src));
case hipMemcpyDeviceToDevice: hsa_memory_copy(dst, src, n); break;
case hipMemcpyHostToDevice: return h2d_copy(dst, src, n, info(dst));
case hipMemcpyDeviceToHost: return d2h_copy(dst, src, n, info(src));
case hipMemcpyDeviceToDevice: {
const auto di{info(dst)};
const auto si{info(src)};
throwing_result_check(hsa_amd_agents_allow_access(1u, &si.agentOwner,
nullptr,
di.agentBaseAddress),
__FILE__, __func__, __LINE__);
return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
}
default: return generic_copy(dst, src, n, info(dst), info(src));
}
}
@@ -1290,7 +1272,7 @@ hipError_t hipMemcpyToSymbolAsync(void* dst, const void* src, size_t count,
if (dst == nullptr) {
return ihipLogStatus(hipErrorInvalidSymbol);
}
if (kind == hipMemcpyDeviceToHost || kind == hipMemcpyHostToHost) {
return ihipLogStatus(hipErrorInvalidMemcpyDirection);
} else if (kind == hipMemcpyDeviceToDevice) {
@@ -1322,7 +1304,7 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* src, size_t count,
if (src == nullptr || dst == nullptr) {
return ihipLogStatus(hipErrorInvalidSymbol);
}
if (kind == hipMemcpyHostToDevice || kind == hipMemcpyHostToHost) {
return ihipLogStatus(hipErrorInvalidMemcpyDirection);
} else if (kind == hipMemcpyDeviceToDevice) {
@@ -1540,111 +1522,144 @@ hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset, size_t
return ihipLogStatus(e);
}
int getByteSizeFromFormat(const hipChannelFormatDesc& desc){
int byteSize =0;
switch (desc.f) {
case hipChannelFormatKindUnsigned:
switch (desc.x) {
case 32:
byteSize = sizeof(uint32_t);
break;
case 16:
byteSize = sizeof(uint16_t);
break;
case 8:
byteSize = sizeof(uint8_t);
break;
default:
byteSize = sizeof(uint32_t);
}
break;
case hipChannelFormatKindSigned:
switch (desc.x) {
case 32:
byteSize = sizeof(int32_t);
break;
case 16:
byteSize = sizeof(int16_t);
break;
case 8:
byteSize = sizeof(int8_t);
break;
default:
byteSize = sizeof(int32_t);
}
break;
case hipChannelFormatKindFloat:
switch (desc.x) {
case 32:
byteSize = sizeof(float);
break;
case 16:
byteSize = sizeof(_Float16);
break;
default:
byteSize = sizeof(float);
}
break;
case hipChannelFormatKindNone:
default:
break;
}
return byteSize;
}
hipError_t ihipMemcpy3D(const struct hipMemcpy3DParms* p, hipStream_t stream, bool isAsync) {
hipError_t e = hipSuccess;
if(p) {
size_t byteSize, width, height, depth, widthInBytes, srcPitch, dstPitch, ySize;
hipChannelFormatDesc desc;
void* srcPtr;void* dstPtr;
size_t dstByteSize, srcByteSize, copyWidth, copyHeight, copyDepth, widthInBytes, srcPitch, dstPitch, srcYsize, dstYsize;
size_t srcXoffset, srcYoffset, srcZoffset, dstXoffset, dstYoffset, dstZoffset;
size_t srcWidth, srcHeight, srcDepth, dstWidth, dstHeight, dstDepth;
void* srcPtr, *dstPtr;
bool copyWidthUpdate= false;
copyDepth = p->extent.depth;
copyHeight = p->extent.height;
copyWidth = p->extent.width; // in bytes ?
dstXoffset = p->dstPos.x;
dstYoffset = p->dstPos.y;
dstZoffset = p->dstPos.z;
srcXoffset = p->srcPos.x;
srcYoffset = p->srcPos.y;
srcZoffset = p->srcPos.z;
if (p->dstArray != nullptr) {
if (p->dstArray->isDrv == false) {
switch (p->dstArray->desc.f) {
case hipChannelFormatKindSigned:
byteSize = sizeof(int);
break;
case hipChannelFormatKindUnsigned:
byteSize = sizeof(unsigned int);
break;
case hipChannelFormatKindFloat:
byteSize = sizeof(float);
break;
case hipChannelFormatKindNone:
byteSize = sizeof(size_t);
break;
default:
byteSize = 0;
break;
}
depth = p->extent.depth;
height = p->extent.height;
width = p->extent.width;
widthInBytes = p->extent.width * byteSize;
srcPitch = p->srcPtr.pitch;
srcPtr = p->srcPtr.ptr;
ySize = p->srcPtr.ysize;
desc = p->dstArray->desc;
dstPtr = p->dstArray->data;
hsa_ext_image_data_info_t imageInfo;
if(hipTextureType2DLayered == p->dstArray->textureType)
GetImageInfo(HSA_EXT_IMAGE_GEOMETRY_2DA, width, height, 0, desc, imageInfo, depth);
else
GetImageInfo(HSA_EXT_IMAGE_GEOMETRY_3D, width, height, depth, desc, imageInfo);
dstPitch = imageInfo.size/(height == 0 ? 1 : height)/(depth == 0 ? 1 : depth);
} else {
depth = p->Depth;
height = p->Height;
widthInBytes = p->WidthInBytes;
width = p->dstArray->width;
hsa_ext_image_channel_order_t channelOrder;
switch(p->dstArray->NumChannels) {
case 2:
channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RG;
break;
case 3:
channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RGB;
break;
case 4:
channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
break;
case 1:
default:
channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_R;
break;
}
hsa_ext_image_channel_type_t channelType;
e = ihipArrayToImageFormat(p->dstArray->Format,channelType);
srcPitch = p->srcPitch;
srcPtr = (void*)p->srcHost;
ySize = p->srcHeight;
dstPtr = p->dstArray->data;
hsa_ext_image_data_info_t imageInfo;
if(hipTextureType2DLayered == p->dstArray->textureType)
GetImageInfo(HSA_EXT_IMAGE_GEOMETRY_2DA, width, height, 0, channelOrder, channelType, imageInfo, depth);
else
GetImageInfo(HSA_EXT_IMAGE_GEOMETRY_3D, width, height, depth, channelOrder, channelType, imageInfo);
dstPitch = imageInfo.size/(height == 0 ? 1 : height)/(depth == 0 ? 1 : depth);
if ((p->dstArray->isDrv == true) ||( p->dstPtr.ptr!= nullptr)){
return hipErrorInvalidValue;
}
// Array destination
dstByteSize = getByteSizeFromFormat(p->dstArray->desc);
hipChannelFormatDesc desc;
desc = p->dstArray->desc;
dstPtr = p->dstArray->data;
dstWidth = p->dstArray->width;
dstHeight = p->dstArray->height;
dstDepth = p->dstArray->depth;
dstPitch = dstByteSize * alignUp(dstWidth, IMAGE_PITCH_ALIGNMENT);
if(!copyWidthUpdate) {
copyWidth = copyWidth * dstByteSize;
copyWidthUpdate = true;
}
} else {
// Non array destination
depth = p->extent.depth;
height = p->extent.height;
widthInBytes = p->extent.width;
srcPitch = p->srcPtr.pitch;
srcPtr = p->srcPtr.ptr;
//Non Array destination
dstPtr = p->dstPtr.ptr;
ySize = p->srcPtr.ysize;
dstWidth = p->dstPtr.xsize;
dstHeight = p->dstPtr.ysize;
dstPitch = p->dstPtr.pitch;
}
if (p->srcArray != nullptr) {
if ((p->srcArray->isDrv == true) ||( p->srcPtr.ptr!= nullptr)){
return hipErrorInvalidValue;
}
// Array source
srcByteSize = getByteSizeFromFormat(p->srcArray->desc);
hipChannelFormatDesc desc;
desc = p->srcArray->desc;
srcPtr = p->srcArray->data;
srcWidth = p->srcArray->width;
srcHeight = p->srcArray->height;
srcDepth = p->srcArray->depth;
srcPitch = srcByteSize * alignUp(srcWidth, IMAGE_PITCH_ALIGNMENT);
if(!copyWidthUpdate) {
copyWidth = copyWidth * srcByteSize;
copyWidthUpdate = true;
}
} else {
//Non Array source
srcPtr = p->srcPtr.ptr;
srcWidth = p->srcPtr.xsize;
srcHeight = p->srcPtr.ysize;
srcPitch = p->srcPtr.pitch;
}
stream = ihipSyncAndResolveStream(stream);
try {
if((widthInBytes == dstPitch) && (widthInBytes == srcPitch)) {
if((copyWidth == dstPitch) && (copyWidth == srcPitch)&& (copyHeight == dstHeight) &&(copyHeight == srcHeight)) {
if(isAsync)
stream->locked_copyAsync((void*)dstPtr, (void*)srcPtr, widthInBytes*height*depth, p->kind);
stream->locked_copyAsync((void*)dstPtr, (void*)srcPtr, copyWidth*copyHeight*copyDepth, p->kind);
else
stream->locked_copySync((void*)dstPtr, (void*)srcPtr, widthInBytes*height*depth, p->kind, false);
stream->locked_copySync((void*)dstPtr, (void*)srcPtr, copyWidth*copyHeight*copyDepth, p->kind, false);
} else {
for (int i = 0; i < depth; i++) {
for (int j = 0; j < height; j++) {
// TODO: p->srcPos or p->dstPos are not 0.
for (int i = 0; i < copyDepth; i++) {
for (int j = 0; j < copyHeight; j++) {
unsigned char* src =
(unsigned char*)srcPtr + i * ySize * srcPitch + j * srcPitch;
(unsigned char*)srcPtr + (i + srcZoffset) * srcHeight * srcPitch + (j + srcYoffset) * srcPitch + srcXoffset;
unsigned char* dst =
(unsigned char*)dstPtr + i * height * dstPitch + j * dstPitch;
(unsigned char*)dstPtr + (i + dstZoffset) * dstHeight * dstPitch + (j + dstYoffset) * dstPitch + dstXoffset;
if(isAsync)
stream->locked_copyAsync(dst, src, widthInBytes, p->kind);
stream->locked_copyAsync(dst, src, copyWidth, p->kind);
else
stream->locked_copySync(dst, src, widthInBytes, p->kind);
stream->locked_copySync(dst, src, copyWidth, p->kind);
}
}
}
+78 -35
Féach ar an gComhad
@@ -134,10 +134,10 @@ extern hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, int device);
return ihipLogStatus(hipStatus); \
}
hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSizeX,
uint32_t gridSizeY, uint32_t gridSizeZ,
uint32_t blockSizeX, uint32_t blockSizeY,
uint32_t blockSizeZ, size_t sharedMemBytes,
hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t globalWorkSizeX,
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
uint32_t localWorkSizeZ, size_t sharedMemBytes,
hipStream_t hStream, void** kernelParams, void** extra,
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0,
void** impCoopParams = 0) {
@@ -146,14 +146,6 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSi
auto ctx = ihipGetTlsDefaultCtx();
hipError_t ret = hipSuccess;
size_t globalWorkSizeX = (size_t)gridSizeX * (size_t)blockSizeX;
size_t globalWorkSizeY = (size_t)gridSizeY * (size_t)blockSizeY;
size_t globalWorkSizeZ = (size_t)gridSizeZ * (size_t)blockSizeZ;
if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
{
return hipErrorInvalidConfiguration;
}
if (ctx == nullptr) {
ret = hipErrorInvalidDevice;
@@ -211,8 +203,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSi
lp.dynamic_group_mem_bytes =
sharedMemBytes; // TODO - this should be part of preLaunchKernel.
hStream = ihipPreLaunchKernel(
hStream, dim3(globalWorkSizeX/blockSizeX, globalWorkSizeY/blockSizeY, globalWorkSizeZ/blockSizeZ),
dim3(blockSizeX, blockSizeY, blockSizeZ), &lp, f->_name.c_str(), isStreamLocked);
hStream, dim3(globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ),
dim3(localWorkSizeX, localWorkSizeY, localWorkSizeZ), &lp, f->_name.c_str(), isStreamLocked);
hsa_kernel_dispatch_packet_t aql;
@@ -221,9 +213,9 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSi
// aql.completion_signal._handle = 0;
// aql.kernarg_address = 0;
aql.workgroup_size_x = blockSizeX;
aql.workgroup_size_y = blockSizeY;
aql.workgroup_size_z = blockSizeZ;
aql.workgroup_size_x = localWorkSizeX;
aql.workgroup_size_y = localWorkSizeY;
aql.workgroup_size_z = localWorkSizeZ;
aql.grid_size_x = globalWorkSizeX;
aql.grid_size_y = globalWorkSizeY;
aql.grid_size_z = globalWorkSizeZ;
@@ -283,8 +275,17 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, uint32_t gr
void** kernelParams, void** extra) {
HIP_INIT_API(hipModuleLaunchKernel, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes,
hStream, kernelParams, extra);
size_t globalWorkSizeX = (size_t)gridDimX * (size_t)blockDimX;
size_t globalWorkSizeY = (size_t)gridDimY * (size_t)blockDimY;
size_t globalWorkSizeZ = (size_t)gridDimZ * (size_t)blockDimZ;
if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
{
return hipErrorInvalidConfiguration;
}
return ihipLogStatus(ihipModuleLaunchKernel(tls,
f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY,
blockDimZ, sharedMemBytes, hStream, kernelParams, extra, nullptr, nullptr, 0));
}
@@ -297,11 +298,8 @@ hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
HIP_INIT_API(hipExtModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX,
localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra);
if(localWorkSizeX == 0 || localWorkSizeY == 0 || localWorkSizeZ == 0)
return hipErrorInvalidValue;
return ihipLogStatus(ihipModuleLaunchKernel(tls,
f, globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ, localWorkSizeX, localWorkSizeY,
f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags));
}
@@ -314,11 +312,8 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
HIP_INIT_API(hipHccModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX,
localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra);
if(localWorkSizeX == 0 || localWorkSizeY == 0 || localWorkSizeZ == 0)
return hipErrorInvalidValue;
return ihipLogStatus(ihipModuleLaunchKernel(tls,
f, globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ, localWorkSizeX, localWorkSizeY,
f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, 0));
}
@@ -364,14 +359,26 @@ hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList
}
GET_TLS();
size_t globalWorkSizeX = 0, globalWorkSizeY = 0, globalWorkSizeZ = 0;
// launch kernels for each device
for (int i = 0; i < numDevices; ++i) {
const hipLaunchParams& lp = launchParamsList[i];
globalWorkSizeX = (size_t)lp.gridDim.x * (size_t)lp.blockDim.x;
globalWorkSizeY = (size_t)lp.gridDim.y * (size_t)lp.blockDim.y;
globalWorkSizeZ = (size_t)lp.gridDim.z * (size_t)lp.blockDim.z;
if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
{
return hipErrorInvalidConfiguration;
}
result = ihipModuleLaunchKernel(tls, kds[i],
lp.gridDim.x,
lp.gridDim.y,
lp.gridDim.z,
lp.gridDim.x * lp.blockDim.x,
lp.gridDim.y * lp.blockDim.y,
lp.gridDim.z * lp.blockDim.z,
lp.blockDim.x, lp.blockDim.y,
lp.blockDim.z, lp.sharedMem,
lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
@@ -424,6 +431,14 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
return hipErrorInvalidConfiguration;
}
size_t globalWorkSizeX = (size_t)gridDim.x * (size_t)blockDimX.x;
size_t globalWorkSizeY = (size_t)gridDim.y * (size_t)blockDimX.y;
size_t globalWorkSizeZ = (size_t)gridDim.z * (size_t)blockDimX.z;
if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
{
return hipErrorInvalidConfiguration;
}
// Prepare the kernel descriptor for initializing the GWS
hipFunction_t gwsKD = ps.kernel_descriptor(
reinterpret_cast<std::uintptr_t>(&init_gws),
@@ -483,9 +498,9 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
// launch the main kernel
result = ihipModuleLaunchKernel(tls, kd,
gridDim.x,
gridDim.y,
gridDim.z,
gridDim.x * blockDimX.x,
gridDim.y * blockDimX.y,
gridDim.z * blockDimX.z,
blockDimX.x, blockDimX.y, blockDimX.z,
sharedMemBytes, stream, kernelParams, nullptr, nullptr,
nullptr, 0, true, impCoopParams);
@@ -620,6 +635,8 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
void* impCoopParams[1];
ulong prev_sum = 0;
size_t globalWorkSizeX = 0, globalWorkSizeY = 0, globalWorkSizeZ = 0;
// launch the main kernels for each device
for (int i = 0; i < numDevices; ++i) {
const hipLaunchParams& lp = launchParamsList[i];
@@ -636,10 +653,18 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
impCoopParams[0] = &mg_info_ptr[i];
globalWorkSizeX = (size_t)lp.gridDim.x * (size_t)lp.blockDim.x;
globalWorkSizeY = (size_t)lp.gridDim.y * (size_t)lp.blockDim.y;
globalWorkSizeZ = (size_t)lp.gridDim.z * (size_t)lp.blockDim.z;
if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
{
return hipErrorInvalidConfiguration;
}
result = ihipModuleLaunchKernel(tls, kds[i],
lp.gridDim.x,
lp.gridDim.y,
lp.gridDim.z,
lp.gridDim.x * lp.blockDim.x,
lp.gridDim.y * lp.blockDim.y,
lp.gridDim.z * lp.blockDim.z,
lp.blockDim.x, lp.blockDim.y,
lp.blockDim.z, lp.sharedMem,
lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
@@ -1469,6 +1494,15 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
}
hipError_t hipDrvOccupancyMaxActiveBlocksPerMultiprocessor(
int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk)
{
HIP_INIT_API(hipDrvOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk);
return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
tls, (uint32_t*) numBlocks, f, blockSize, dynSharedMemPerBlk));
}
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk,
unsigned int flags)
@@ -1479,6 +1513,15 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
}
hipError_t hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk,
unsigned int flags)
{
HIP_INIT_API(hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags);
return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
tls, (uint32_t*) numBlocks, f, blockSize, dynSharedMemPerBlk));
}
hipError_t hipLaunchKernel(
const void* func_addr, dim3 numBlocks, dim3 dimBlocks, void** args,
size_t sharedMemBytes, hipStream_t stream)
+17 -10
Féach ar an gComhad
@@ -128,17 +128,24 @@ hipError_t ihipEnablePeerAccess(TlsData* tls, hipCtx_t peerCtx, unsigned int fla
if (thisCtx == peerCtx) {
err = hipErrorInvalidDevice; // Can't enable peer access to self.
} else if ((thisCtx != NULL) && (peerCtx != NULL)) {
LockedAccessor_CtxCrit_t peerCrit(peerCtx->criticalData());
// Add thisCtx to peerCtx's access list so that new allocations on peer will be made
// visible to this device:
bool isNewPeer = peerCrit->addPeerWatcher(peerCtx, thisCtx);
if (isNewPeer) {
tprintf(DB_MEM, "device=%s can now see all memory allocated on peer=%s\n",
thisCtx->toString().c_str(), peerCtx->toString().c_str());
am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(),
peerCrit->peerAgents());
int canAccess = 0;
if ((hipSuccess != ihipDeviceCanAccessPeer(&canAccess,thisCtx,peerCtx)) || (canAccess == 0)){
tprintf(DB_MEM, "device=%s can't access peer=%s\n",thisCtx->toString().c_str(), peerCtx->toString().c_str());
err = hipErrorInvalidDevice;
} else {
err = hipErrorPeerAccessAlreadyEnabled;
LockedAccessor_CtxCrit_t peerCrit(peerCtx->criticalData());
// Add thisCtx to peerCtx's access list so that new allocations on peer will be made
// visible to this device:
bool isNewPeer = peerCrit->addPeerWatcher(peerCtx, thisCtx);
if (isNewPeer) {
tprintf(DB_MEM, "device=%s can now see all memory allocated on peer=%s\n",
thisCtx->toString().c_str(), peerCtx->toString().c_str());
am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(),
peerCrit->peerAgents());
} else {
err = hipErrorPeerAccessAlreadyEnabled;
}
}
} else {
err = hipErrorInvalidDevice;
+38 -26
Féach ar an gComhad
@@ -50,7 +50,7 @@ THE SOFTWARE.
#include <iostream>
#include <sys/stat.h>
const char* hiprtcGetErrorString(hiprtcResult x)
extern "C" const char* hiprtcGetErrorString(hiprtcResult x)
{
switch (x) {
case HIPRTC_SUCCESS:
@@ -95,6 +95,21 @@ inline bool fileExists (const std::string& name) {
}
} // namespace hip_impl
namespace
{
char* demangle(const char* x)
{
if (!x) return nullptr;
int s{};
char* tmp = abi::__cxa_demangle(x, nullptr, nullptr, &s);
if (s != 0) return nullptr;
return tmp;
}
} // Unnamed namespace.
namespace
{
struct Symbol {
@@ -158,7 +173,7 @@ struct _hiprtcProgram {
{
using namespace std;
char* demangled = hip_impl::demangle(name.c_str());
char* demangled = demangle(name.c_str());
name.assign(demangled == nullptr ? "" : demangled);
free(demangled);
@@ -352,7 +367,7 @@ namespace
}
} // Unnamed namespace.
hiprtcResult hiprtcAddNameExpression(hiprtcProgram p, const char* n)
extern "C" hiprtcResult hiprtcAddNameExpression(hiprtcProgram p, const char* n)
{
if (!n) return HIPRTC_ERROR_INVALID_INPUT;
if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM;
@@ -413,21 +428,6 @@ namespace
};
} // Unnamed namespace.
namespace hip_impl
{
char* demangle(const char* x)
{
if (!x) return nullptr;
int s{};
char* tmp = abi::__cxa_demangle(x, nullptr, nullptr, &s);
if (s != 0) return nullptr;
return tmp;
}
} // Namespace hip_impl.
namespace
{
const std::string& defaultTarget()
@@ -492,7 +492,7 @@ namespace
}
} // Unnamed namespace.
hiprtcResult hiprtcCompileProgram(hiprtcProgram p, int n, const char** o)
extern "C" hiprtcResult hiprtcCompileProgram(hiprtcProgram p, int n, const char** o)
{
using namespace std;
@@ -530,7 +530,7 @@ hiprtcResult hiprtcCompileProgram(hiprtcProgram p, int n, const char** o)
return HIPRTC_SUCCESS;
}
hiprtcResult hiprtcCreateProgram(hiprtcProgram* p, const char* src,
extern "C" hiprtcResult hiprtcCreateProgram(hiprtcProgram* p, const char* src,
const char* name, int n, const char** hdrs,
const char** incs)
{
@@ -548,14 +548,14 @@ hiprtcResult hiprtcCreateProgram(hiprtcProgram* p, const char* src,
return HIPRTC_SUCCESS;
}
hiprtcResult hiprtcDestroyProgram(hiprtcProgram* p)
extern "C" hiprtcResult hiprtcDestroyProgram(hiprtcProgram* p)
{
if (!p) return HIPRTC_SUCCESS;
return _hiprtcProgram::destroy(*p);
}
hiprtcResult hiprtcGetLoweredName(hiprtcProgram p, const char* n,
extern "C" hiprtcResult hiprtcGetLoweredName(hiprtcProgram p, const char* n,
const char** ln)
{
using namespace std;
@@ -576,7 +576,7 @@ hiprtcResult hiprtcGetLoweredName(hiprtcProgram p, const char* n,
return HIPRTC_SUCCESS;
}
hiprtcResult hiprtcGetProgramLog(hiprtcProgram p, char* l)
extern "C" hiprtcResult hiprtcGetProgramLog(hiprtcProgram p, char* l)
{
if (!l) return HIPRTC_ERROR_INVALID_INPUT;
if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM;
@@ -588,7 +588,7 @@ hiprtcResult hiprtcGetProgramLog(hiprtcProgram p, char* l)
return HIPRTC_SUCCESS;
}
hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram p, std::size_t* sz)
extern "C" hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram p, std::size_t* sz)
{
if (!sz) return HIPRTC_ERROR_INVALID_INPUT;
if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM;
@@ -599,7 +599,7 @@ hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram p, std::size_t* sz)
return HIPRTC_SUCCESS;
}
hiprtcResult hiprtcGetCode(hiprtcProgram p, char* c)
extern "C" hiprtcResult hiprtcGetCode(hiprtcProgram p, char* c)
{
if (!c) return HIPRTC_ERROR_INVALID_INPUT;
if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM;
@@ -610,7 +610,7 @@ hiprtcResult hiprtcGetCode(hiprtcProgram p, char* c)
return HIPRTC_SUCCESS;
}
hiprtcResult hiprtcGetCodeSize(hiprtcProgram p, std::size_t* sz)
extern "C" hiprtcResult hiprtcGetCodeSize(hiprtcProgram p, std::size_t* sz)
{
if (!sz) return HIPRTC_ERROR_INVALID_INPUT;
if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM;
@@ -620,3 +620,15 @@ hiprtcResult hiprtcGetCodeSize(hiprtcProgram p, std::size_t* sz)
return HIPRTC_SUCCESS;
}
extern "C" hiprtcResult hiprtcVersion(int* major, int* minor)
{
if (major == nullptr || minor == nullptr) {
return HIPRTC_ERROR_INVALID_INPUT;
}
*major = 9;
*minor = 0;
return HIPRTC_SUCCESS;
}
+11 -5
Féach ar an gComhad
@@ -26,6 +26,7 @@
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <deque>
#include <memory>
#include <mutex>
#include <stdexcept>
@@ -202,7 +203,7 @@ public:
std::function<void(hsa_code_object_reader_t*)>>;
std::pair<
std::mutex,
std::vector<std::pair<std::string, RAII_code_reader>>> code_readers;
std::deque<std::pair<std::string, RAII_code_reader>>> code_readers;
program_state_impl() {
// Create placeholder for each agent for the per-agent members.
@@ -244,7 +245,8 @@ public:
if (!valid(tmp)) break;
for (auto&& bundle : bundles(tmp)) {
impl.code_object_blobs.second[elf][triple_to_hsa_isa(bundle.triple)].push_back(bundle.blob);
if(bundle.blob.size())
impl.code_object_blobs.second[elf][triple_to_hsa_isa(bundle.triple)].push_back(bundle.blob);
}
blob_it += tmp.bundled_code_size;
@@ -418,13 +420,17 @@ public:
decltype(code_readers.second)::iterator it;
{
std::lock_guard<std::mutex> lck{code_readers.first};
it = code_readers.second.emplace(code_readers.second.end(),
move(file), move(tmp));
code_readers.second.emplace_back(move(file), move(tmp));
it = std::prev(code_readers.second.end());
}
auto check_hsa_error = [](hsa_status_t s) {
if (s != HSA_STATUS_SUCCESS) {
hip_throw(std::runtime_error{"error when loading code object"});
const char* hsa_err_msg;
hsa_status_string(s, &hsa_err_msg);
hip_throw(std::runtime_error{
std::string("error when loading code object: ") +
hsa_err_msg});
}
};
+41 -31
Féach ar an gComhad
@@ -34,42 +34,52 @@ OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWA
using namespace std;
const string directed_dir = "directed_tests" + string(PATH_SEPERATOR_STR) + "hipEnvVar";
const string dir = "." + string(PATH_SEPERATOR_STR) + "hipEnvVar";
const string directed_dir = string(".") + PATH_SEPERATOR_STR + "directed_tests" + PATH_SEPERATOR_STR + "hipEnvVar";
const string dir = string(".") + PATH_SEPERATOR_STR + "hipEnvVar";
int getDeviceNumber() {
char buff[512];
std::this_thread::sleep_for(std::chrono::milliseconds(10));
FILE* in = popen((directed_dir + " -c").c_str(), "r");
if(fgets(buff, 512, in) == NULL){
pclose(in);
//Check at same level
in = popen((dir + " -c").c_str(), "r");
int readHipEnvVar(string flags, char* buff){
std::cout << "\nFinding hipEnvVar in " << directed_dir << "...\n";
FILE* directed_in = popen((directed_dir + flags).c_str(), "r");
if(fgets(buff, 512, directed_in) == NULL){
std::cout << "Finding hipEnvVar in " << dir << "...\n";
FILE* in = popen((dir + flags).c_str(), "r");
if(fgets(buff, 512, in) == NULL){
pclose(directed_in);
pclose(in);
return 1;
}
pclose(in);
}
std::cout << "hipEnvVar Found!\n";
pclose(directed_in);
return 0;
}
int getDeviceNumber(bool print_err=true) {
char buff[512];
std::this_thread::sleep_for(std::chrono::milliseconds(10));
if (readHipEnvVar(string(" -c"), buff)){
strncpy(buff, "1", 512);
if (print_err){
std::cerr << "The system cannot find hipEnvVar, using 1 as number of devices\n";
}
}
if (print_err) {
std::cout << buff;
}
cout << buff;
pclose(in);
return atoi(buff);
}
// Query the current device ID remotely to hipEnvVar
void getDevicePCIBusNumRemote(int deviceID, char* pciBusID) {
std::this_thread::sleep_for(std::chrono::milliseconds(10));
FILE* in = popen((directed_dir + " -d " + std::to_string(deviceID)).c_str(), "r");
if(fgets(pciBusID, 100, in) == NULL){
pclose(in);
//Check at same level
in = popen((dir + " -d").c_str(), "r");
if(fgets(pciBusID, 100, in) == NULL){
pclose(in);
return;
}
if (readHipEnvVar((" -d " + std::to_string(deviceID)), pciBusID)){
std::cerr << "The system cannot find hipEnvVar\n";
}
cout << pciBusID;
pclose(in);
return;
}
@@ -78,15 +88,15 @@ void getDevicePCIBusNum(int deviceID, char* pciBusID) {
hipDevice_t deviceT;
hipDeviceGet(&deviceT, deviceID);
memset(pciBusID, 0, 100);
hipDeviceGetPCIBusId(pciBusID, 100, deviceT);
memset(pciBusID, 0, 512);
hipDeviceGetPCIBusId(pciBusID, 512, deviceT);
}
int main() {
unsetenv(HIP_VISIBLE_DEVICES_STR);
unsetenv(CUDA_VISIBLE_DEVICES_STR);
std::vector<std::string> devPCINum;
char pciBusID[100];
char pciBusID[512];
// collect the device pci bus ID for all devices
int totalDeviceNum = getDeviceNumber();
std::cout << "The total number of available devices is " << totalDeviceNum << std::endl
@@ -116,27 +126,27 @@ int main() {
// check when set an invalid device number
setenv("HIP_VISIBLE_DEVICES", "1000,0,1", 1);
setenv("CUDA_VISIBLE_DEVICES", "1000,0,1", 1);
assert(getDeviceNumber() == 0);
assert(getDeviceNumber(false) == 0);
if (totalDeviceNum > 2) {
setenv("HIP_VISIBLE_DEVICES", "0,1,1000,2", 1);
setenv("CUDA_VISIBLE_DEVICES", "0,1,1000,2", 1);
assert(getDeviceNumber() == 2);
assert(getDeviceNumber(false) == 2);
setenv("HIP_VISIBLE_DEVICES", "0,1,2", 1);
setenv("CUDA_VISIBLE_DEVICES", "0,1,2", 1);
assert(getDeviceNumber() == 3);
assert(getDeviceNumber(false) == 3);
// test if CUDA_VISIBLE_DEVICES will be accepted by the runtime
unsetenv(HIP_VISIBLE_DEVICES_STR);
unsetenv(CUDA_VISIBLE_DEVICES_STR);
setenv("CUDA_VISIBLE_DEVICES", "0,1,2", 1);
assert(getDeviceNumber() == 3);
assert(getDeviceNumber(false) == 3);
}
setenv("HIP_VISIBLE_DEVICES", "-100,0,1", 1);
setenv("CUDA_VISIBLE_DEVICES", "-100,0,1", 1);
assert(getDeviceNumber() == 0);
assert(getDeviceNumber(false) == 0);
std::cout << "PASSED" << std::endl;
return 0;
}
}
-147
Féach ar an gComhad
@@ -1,147 +0,0 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc
* TEST: %t
* HIT_END
*/
#include <test_common.h>
#define HIPRTC_GET_TYPE_NAME
#include <hip/hiprtc.h>
#include <hip/hip_runtime.h>
#include <iostream>
#include <string>
#include <vector>
static constexpr auto gpu_program{
R"(
#include <hip/hip_runtime.h>
namespace N1 { struct S1_t { int i; double d; }; }
template<typename T>
__global__ void f3(int *result) { *result = sizeof(T); }
)"};
// note: this structure is also defined in GPU code string. Should ideally
// be in a header file included by both GPU code string and by CPU code.
namespace N1 { struct S1_t { int i; double d; }; };
template <typename T>
std::string getKernelNameForType(void)
{
std::string type_name;
hiprtcGetTypeName<T>(&type_name);
return std::string{"f3<"} + type_name + '>';
}
int main()
{
using namespace std;
hiprtcProgram prog;
hiprtcCreateProgram(&prog, gpu_program, "gpu_program.cu", 0, nullptr,
nullptr);
vector<string> name_vec;
vector<int> expected_result;
name_vec.push_back(getKernelNameForType<int>());
expected_result.push_back(sizeof(int));
name_vec.push_back(getKernelNameForType<double>());
expected_result.push_back(sizeof(double));
name_vec.push_back(getKernelNameForType<N1::S1_t>());
expected_result.push_back(sizeof(N1::S1_t));
for (auto&& x : name_vec) hiprtcAddNameExpression(prog, x.c_str());
hipDeviceProp_t props;
int device = 0;
hipGetDeviceProperties(&props, device);
std::string gfxName = "gfx" + std::to_string(props.gcnArch);
std::string sarg = "--gpu-architecture=" + gfxName;
const char* options[] = {
sarg.c_str()
};
hiprtcResult compileResult = hiprtcCompileProgram(prog, 1, options);
size_t logSize;
hiprtcGetProgramLogSize(prog, &logSize);
if (logSize) {
string log(logSize, '\0');
hiprtcGetProgramLog(prog, &log[0]);
cout << log << '\n';
}
if (compileResult != HIPRTC_SUCCESS) { failed("Compilation failed."); }
size_t codeSize;
hiprtcGetCodeSize(prog, &codeSize);
vector<char> code(codeSize);
hiprtcGetCode(prog, code.data());
hipModule_t module;
hipModuleLoadDataEx(&module, code.data(), 0, nullptr, nullptr);
hipDeviceptr_t dResult;
int hResult = 0;
hipMalloc(&dResult, sizeof(hResult));
hipMemcpyHtoD(dResult, &hResult, sizeof(hResult));
for (size_t i = 0; i < name_vec.size(); ++i) {
const char *name;
hiprtcGetLoweredName(prog, name_vec[i].c_str(), &name);
hipFunction_t kernel;
hipModuleGetFunction(&kernel, module, name);
struct { hipDeviceptr_t a_; } args{dResult};
auto size = sizeof(args);
void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
HIP_LAUNCH_PARAM_END};
hipModuleLaunchKernel(kernel,
1, 1, 1,
1, 1, 1,
0, nullptr,
nullptr, config);
hipMemcpyDtoH(&hResult, dResult, sizeof(hResult));
if (expected_result[i] != hResult) { failed("Validation failed."); }
}
hipFree(dResult);
hipModuleUnload(module);
hiprtcDestroyProgram(&prog);
passed();
}
+130
Féach ar an gComhad
@@ -0,0 +1,130 @@
/*
Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* TEST: %t
* HIT_END
*/
#include <iostream>
#include <hip/hip_runtime.h>
#include "test_common.h"
#define WIDTH 4
#define NUM (WIDTH * WIDTH)
#define THREADS_PER_BLOCK_X 4
#define THREADS_PER_BLOCK_Y 4
#define THREADS_PER_BLOCK_Z 1
// Device (Kernel) function, it must be void
template <typename T>
__global__ void matrixTranspose(T* out, T* in, const int width) {
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
T val = in[x];
for (int i = 0; i < width; i++) {
for (int j = 0; j < width; j++) out[i * width + j] = __shfl(val, j * width + i);
}
}
// CPU implementation of matrix transpose
template <typename T>
void matrixTransposeCPUReference(T* output, T* input, const unsigned int width) {
for (unsigned int j = 0; j < width; j++) {
for (unsigned int i = 0; i < width; i++) {
output[i * width + j] = input[j * width + i];
}
}
}
template<typename T>
void runTest() {
T* Matrix;
T* TransposeMatrix;
T* cpuTransposeMatrix;
T* gpuMatrix;
T* gpuTransposeMatrix;
hipDeviceProp_t devProp;
hipGetDeviceProperties(&devProp, 0);
int i;
int errors;
Matrix = (T*)malloc(NUM * sizeof(T));
TransposeMatrix = (T*)malloc(NUM * sizeof(T));
cpuTransposeMatrix = (T*)malloc(NUM * sizeof(T));
// initialize the input data
for (i = 0; i < NUM; i++) {
Matrix[i] = (T)i * 10l;
}
// allocate the memory on the device side
hipMalloc((void**)&gpuMatrix, NUM * sizeof(T));
hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(T));
// Memory transfer from host to device
hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(T), hipMemcpyHostToDevice);
// Lauching kernel from host
hipLaunchKernelGGL(matrixTranspose<T>, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
gpuTransposeMatrix, gpuMatrix, WIDTH);
// Memory transfer from device to host
hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(T), hipMemcpyDeviceToHost);
// CPU MatrixTranspose computation
matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
// verify the results
errors = 0;
double eps = 1.0E-6;
for (i = 0; i < NUM; i++) {
if (TransposeMatrix[i] != cpuTransposeMatrix[i]) {
errors++;
}
}
// free the resources on device side
hipFree(gpuMatrix);
hipFree(gpuTransposeMatrix);
// free the resources on host side
free(Matrix);
free(TransposeMatrix);
free(cpuTransposeMatrix);
if (errors != 0) {
failed("Mismatch present");
}
}
int main() {
runTest<int>();
runTest<float>();
runTest<long>();
runTest<long long>();
passed();
}
+102
Féach ar an gComhad
@@ -0,0 +1,102 @@
/*
Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* TEST: %t
* HIT_END
*/
#include <iostream>
#include <hip/hip_runtime.h>
#include "test_common.h"
template <typename T>
__global__ void shflDownSum(T* a, int size) {
T val = a[threadIdx.x];
for (int i = size / 2; i > 0; i /= 2) {
val += __shfl_down(val, i, size);
}
a[threadIdx.x] = val;
}
template <typename T>
__global__ void shflUpSum(T* a, int size) {
T val = a[threadIdx.x];
for (int i = size / 2; i > 0; i /= 2) {
val += __shfl_up(val, i, size);
}
a[threadIdx.x] = val;
}
template <typename T>
void runTestShflUp() {
const int size = 32;
T a[size];
T cpuSum = 0;
for (int i = 0; i < size; i++) {
a[i] = i;
cpuSum += a[i];
}
T* d_a;
hipMalloc(&d_a, sizeof(T) * size);
hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault);
hipLaunchKernelGGL(shflUpSum<T>, 1, size, 0, 0, d_a, size);
hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault);
if (a[size - 1] != cpuSum) {
hipFree(d_a);
failed("Shfl Up Sum did not match.");
}
hipFree(d_a);
}
template <typename T>
void runTestShflDown() {
const int size = 32;
T a[size];
T cpuSum = 0;
for (int i = 0; i < size; i++) {
a[i] = i;
cpuSum += a[i];
}
T* d_a;
hipMalloc(&d_a, sizeof(T) * size);
hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault);
hipLaunchKernelGGL(shflDownSum<T>, 1, size, 0, 0, d_a, size);
hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault);
if (a[0] != cpuSum) {
hipFree(d_a);
failed("Shfl Up Sum did not match.");
}
hipFree(d_a);
}
int main() {
runTestShflUp<int>();
runTestShflUp<float>();
runTestShflUp<long>();
runTestShflUp<long long>();
runTestShflDown<int>();
runTestShflDown<float>();
runTestShflDown<long>();
runTestShflDown<long long>();
passed();
}
+110
Féach ar an gComhad
@@ -0,0 +1,110 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
#include "test_common.h"
template <typename T>
void runTest(int width,int height,int depth, hipChannelFormatKind formatKind)
{
unsigned int size = width * height * depth * sizeof(T);
T* hData = (T*) malloc(size);
memset(hData, 0, size);
for (int i = 0; i < depth; i++) {
for (int j = 0; j < height; j++) {
for (int k = 0; k < width; k++) {
hData[i*width*height + j*width +k] = i*width*height + j*width + k;
}
}
}
printf("test- sizeof(T) =%d\n", sizeof(T));
hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(T)*8, 0, 0, 0, formatKind);
hipArray *arr,*arr1;
HIPCHECK(hipMalloc3DArray(&arr, &channelDesc, make_hipExtent(width, height, depth), hipArrayDefault));
HIPCHECK(hipMalloc3DArray(&arr1, &channelDesc, make_hipExtent(width, height, depth), hipArrayDefault));
hipMemcpy3DParms myparms = {0};
myparms.srcPos = make_hipPos(0,0,0);
myparms.dstPos = make_hipPos(0,0,0);
myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T), width, height);
myparms.dstArray = arr;
myparms.extent = make_hipExtent(width , height, depth);
#ifdef __HIP_PLATFORM_NVCC__
myparms.kind = cudaMemcpyHostToDevice;
#else
myparms.kind = hipMemcpyHostToDevice;
#endif
HIPCHECK(hipMemcpy3D(&myparms));
HIPCHECK(hipDeviceSynchronize());
//Array to Array
memset(&myparms,0x0, sizeof(hipMemcpy3DParms));
myparms.srcPos = make_hipPos(0,0,0);
myparms.dstPos = make_hipPos(0,0,0);
myparms.srcArray = arr;
myparms.dstArray = arr1;
myparms.extent = make_hipExtent(width, height, depth);
#ifdef __HIP_PLATFORM_NVCC__
myparms.kind = cudaMemcpyDeviceToDevice;
#else
myparms.kind = hipMemcpyDeviceToDevice;
#endif
HIPCHECK(hipMemcpy3D(&myparms));
HIPCHECK(hipDeviceSynchronize());
T *hOutputData = (T*) malloc(size);
memset(hOutputData, 0, size);
//Device to host
memset(&myparms,0x0, sizeof(hipMemcpy3DParms));
myparms.srcPos = make_hipPos(0,0,0);
myparms.dstPos = make_hipPos(0,0,0);
myparms.dstPtr = make_hipPitchedPtr(hOutputData, width * sizeof(T), width, height);
myparms.srcArray = arr1;
myparms.extent = make_hipExtent(width, height, depth);
#ifdef __HIP_PLATFORM_NVCC__
myparms.kind = cudaMemcpyDeviceToHost;
#else
myparms.kind = hipMemcpyDeviceToHost;
#endif
HIPCHECK(hipMemcpy3D(&myparms));
HIPCHECK(hipDeviceSynchronize());
// Check result
HipTest::checkArray(hData,hOutputData,width,height,depth);
hipFreeArray(arr);
hipFreeArray(arr1);
free(hData);
free(hOutputData);
}
int main(int argc, char **argv)
{
for(int i=1;i<25;i++)
{
runTest<float>(i,i,i, hipChannelFormatKindFloat);
runTest<int>(i+1,i,i, hipChannelFormatKindSigned);
runTest<char>(i,i+1,i, hipChannelFormatKindSigned);
}
passed();
}
@@ -0,0 +1,152 @@
/*
Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
* TEST: %t
* HIT_END
*/
#include "hip/hip_runtime.h"
#include "hip/hip_runtime_api.h"
#include <iostream>
#include <fstream>
#include <vector>
#include <thread>
#include <chrono>
#include "test_common.h"
#define LEN 64
#define SIZE LEN << 2
#define THREADS 2
#define MAX_THREADS 16
#define FILENAME "vcpy_kernel.code"
#define kernel_name "hello_world"
std::vector<char> load_file()
{
std::ifstream file(FILENAME, std::ios::binary | std::ios::ate);
std::streamsize fsize = file.tellg();
file.seekg(0, std::ios::beg);
std::vector<char> buffer(fsize);
if (!file.read(buffer.data(), fsize)) {
failed("could not open code object '%s'\n", FILENAME);
}
return buffer;
}
void run(const std::vector<char>& buffer) {
hipDevice_t device;
HIPCHECK(hipDeviceGet(&device, 0));
hipCtx_t context;
HIPCHECK(hipCtxCreate(&context, 0, device));
hipModule_t Module;
hipFunction_t Function;
HIPCHECK(hipModuleLoadData(&Module, &buffer[0]));
HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));
float *A, *B, *Ad, *Bd;
A = new float[LEN];
B = new float[LEN];
for (uint32_t i = 0; i < LEN; i++) {
A[i] = i * 1.0f;
B[i] = 0.0f;
}
HIPCHECK(hipMalloc((void**)&Ad, SIZE));
HIPCHECK(hipMalloc((void**)&Bd, SIZE));
HIPCHECK(hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice));
hipStream_t stream;
HIPCHECK(hipStreamCreate(&stream));
struct {
void* _Ad;
void* _Bd;
} args;
args._Ad = (void*) Ad;
args._Bd = (void*) Bd;
size_t size = sizeof(args);
void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
HIP_LAUNCH_PARAM_END};
HIPCHECK(hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, stream, NULL, (void**)&config));
HIPCHECK(hipStreamDestroy(stream));
HIPCHECK(hipModuleUnload(Module));
HIPCHECK(hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost));
for (uint32_t i = 0; i < LEN; i++) {
assert(A[i] == B[i]);
}
hipFree(Ad);
hipFree(Bd);
delete A;
delete B;
hipCtxDestroy(context);
}
struct joinable_thread : std::thread
{
template <class... Xs>
joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...) // NOLINT
{
}
joinable_thread& operator=(joinable_thread&& other) = default;
joinable_thread(joinable_thread&& other) = default;
~joinable_thread()
{
if(this->joinable())
this->join();
}
};
void run_multi_threads(uint32_t n, const std::vector<char>& buffer) {
std::vector<joinable_thread> threads;
for (uint32_t i = 0; i < n; i++) {
threads.emplace_back(std::thread{[&, buffer] {
run(buffer);
}});
}
}
int main() {
HIPCHECK(hipInit(0));
auto buffer = load_file();
run_multi_threads(min(THREADS * std::thread::hardware_concurrency(), MAX_THREADS), buffer);
passed();
}
+2
Féach ar an gComhad
@@ -37,10 +37,12 @@ int p_tests = -1; /*which tests to run. Interpretation is left to each test. de
const char* HIP_VISIBLE_DEVICES_STR = "HIP_VISIBLE_DEVICES=";
const char* CUDA_VISIBLE_DEVICES_STR = "CUDA_VISIBLE_DEVICES=";
const char* PATH_SEPERATOR_STR = "\\";
const char* NULL_DEVICE = "NUL:";
#else
const char* HIP_VISIBLE_DEVICES_STR = "HIP_VISIBLE_DEVICES";
const char* CUDA_VISIBLE_DEVICES_STR = "CUDA_VISIBLE_DEVICES";
const char* PATH_SEPERATOR_STR = "/";
const char* NULL_DEVICE = "/dev/null";
#endif
namespace HipTest {
+5
Féach ar an gComhad
@@ -105,6 +105,10 @@ THE SOFTWARE.
#define pclose(x) _pclose(x)
#define setenv(x,y,z) _putenv_s(x,y)
#define unsetenv _putenv
#define fileno(x) _fileno(x)
#define dup(x) _dup(x)
#define dup2(x,y) _dup2(x,y)
#define close(x) _close(x)
#else
#define aligned_free(x) free(x)
#endif
@@ -124,6 +128,7 @@ extern int p_tests;
extern const char* HIP_VISIBLE_DEVICES_STR;
extern const char* CUDA_VISIBLE_DEVICES_STR;
extern const char* PATH_SEPERATOR_STR;
extern const char* NULL_DEVICE;
// ********************* CPP section *********************
#ifdef __cplusplus
+2 -2
Féach ar an gComhad
@@ -21,7 +21,7 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM vdi
* TEST: %t
* HIT_END
*/
@@ -66,7 +66,7 @@ void runTest(int width,int height,int num_layers,texture<T, hipTextureType2DLaye
myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T), width, height);
myparms.dstArray = arr;
myparms.extent = make_hipExtent(width , height, num_layers);
myparms.kind = hipMemcpyHostToDevice;
//myparms.kind = hipMemcpyHostToDevice;
HIPCHECK(hipMemcpy3D(&myparms));
// set texture parameters
+2 -3
Féach ar an gComhad
@@ -84,10 +84,10 @@ void runTest(int width,int height,int depth,texture<T, hipTextureType3D, hipRead
}
// Allocate array and copy image data
hipChannelFormatDesc channelDesc = tex->channelDesc;
hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(T)*8, 0, 0, 0, hipChannelFormatKindSigned);
hipArray *arr;
HIPCHECK(hipMalloc3DArray(&arr, &channelDesc, make_hipExtent(width, height, depth), hipArrayDefault));
HIPCHECK(hipMalloc3DArray(&arr, &channelDesc, make_hipExtent(width, height, depth), hipArrayCubemap));
hipMemcpy3DParms myparms = {0};
myparms.srcPos = make_hipPos(0,0,0);
myparms.dstPos = make_hipPos(0,0,0);
@@ -100,7 +100,6 @@ void runTest(int width,int height,int depth,texture<T, hipTextureType3D, hipRead
// set texture parameters
tex->addressMode[0] = hipAddressModeWrap;
tex->addressMode[1] = hipAddressModeWrap;
tex->addressMode[2] = hipAddressModeWrap;
tex->filterMode = hipFilterModePoint;
tex->normalized = false;