Merge branch 'master' into implicit-kernarg

Этот коммит содержится в:
Maneesh Gupta
2019-06-04 13:24:19 +05:30
коммит произвёл GitHub
родитель 214ec53da3 48e657b099
Коммит d4fa74ff09
31 изменённых файлов: 888 добавлений и 290 удалений
+15
Просмотреть файл
@@ -271,6 +271,21 @@ if(HIP_PLATFORM STREQUAL "hcc")
target_link_libraries(hiprtc PUBLIC stdc++fs)
endif()
if(HIP_PLATFORM STREQUAL "hcc")
find_package(amd_comgr REQUIRED CONFIG
PATHS
/opt/rocm/
PATH_SUFFIXES
cmake/amd_comgr
lib/cmake/amd_comgr
)
MESSAGE(STATUS "Code Object Manager found at ${amd_comgr_DIR}.")
endif()
target_link_libraries(hip_hcc PRIVATE amd_comgr)
target_link_libraries(hip_hcc_static PRIVATE amd_comgr)
string(REPLACE " " ";" HCC_CXX_FLAGS_LIST ${HCC_CXX_FLAGS})
foreach(TARGET hip_hcc hip_hcc_static)
target_include_directories(${TARGET} SYSTEM INTERFACE $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>;${HSA_PATH}/include)
+33 -26
Просмотреть файл
@@ -107,24 +107,25 @@ $HIP_RUNTIME= $hipConfig{'HIP_RUNTIME'};
# If using VDI runtime, need to find HIP_VDI_HOME
if ($HIP_RUNTIME eq "VDI" and !defined $HIP_VDI_HOME) {
$HIP_VDI_HOME = "/opt/rocm/hip"
my $hipcc_dir = dirname($0);
if (-e "$hipcc_dir/../lib/bitcode") {
$HIP_VDI_HOME = abs_path($hipcc_dir . "/..");
} else {
$HIP_VDI_HOME = "/opt/rocm/hip";
}
}
if (defined $HIP_VDI_HOME) {
my $bits = "";
if (-d "$HIP_VDI_HOME/bin/x86_64") {
$bits = "/x86_64";
if (!defined $HIP_CLANG_PATH and (-e "$HIP_VDI_HOME/bin/clang" or -e "$HIP_VDI_HOME/bin/clang.exe")) {
$HIP_CLANG_PATH = "$HIP_VDI_HOME/bin";
$HIP_CLANG_INCLUDE_PATH = "$HIP_VDI_HOME/include/clang";
}
if (!defined $HIP_CLANG_PATH) {
$HIP_CLANG_PATH = "$HIP_VDI_HOME/bin" . $bits;
if (!defined $DEVICE_LIB_PATH and -e "$HIP_VDI_HOME/lib/bitcode") {
$DEVICE_LIB_PATH = "$HIP_VDI_HOME/lib/bitcode";
}
if (!defined $DEVICE_LIB_PATH) {
$DEVICE_LIB_PATH = "$HIP_VDI_HOME/lib" . $bits . "/bitcode";
}
$HIP_CLANG_INCLUDE_PATH = "$HIP_VDI_HOME/include/clang";
$HIP_INCLUDE_PATH = "$HIP_VDI_HOME/include";
if (!defined $HIP_LIB_PATH) {
$HIP_LIB_PATH = "$HIP_VDI_HOME/lib" . $bits;
$HIP_LIB_PATH = "$HIP_VDI_HOME/lib";
}
}
@@ -169,7 +170,7 @@ if ($HIP_PLATFORM eq "clang") {
$HIP_CLANG_VERSION=$1;
if (! defined $HIP_CLANG_INCLUDE_PATH) {
$HIP_CLANG_INCLUDE_PATH = "$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/include";
$HIP_CLANG_INCLUDE_PATH = abs_path("$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/include");
}
if (! defined $HIP_INCLUDE_PATH) {
$HIP_INCLUDE_PATH = "$HIP_PATH/include";
@@ -370,7 +371,7 @@ if($HIP_PLATFORM eq "nvcc"){
my $toolArgs = ""; # arguments to pass to the hcc or nvcc tool
my $optArg = ""; # -O args
my $rdc = 0;
my $gArg = ""; # -g args
foreach $arg (@ARGV)
{
@@ -466,13 +467,9 @@ foreach $arg (@ARGV)
{
$optArg = $arg;
}
if($arg =~ /-fgpu-rdc/)
if($arg =~ m/^-g/)
{
$rdc = 1;
}
if($arg =~ /-fno-gpu-rdc/)
{
$rdc = 0;
$gArg = $arg;
}
## process linker response file for hip-clang
@@ -799,16 +796,12 @@ if ($needHipHcc) {
if ($HIP_PLATFORM eq "clang") {
# Set default optimization level to -O3 for hip-clang.
if ($optArg eq "") {
if ($optArg eq "" and $gArg ne "-g") {
$HIPCXXFLAGS .= " -O3";
$HIPLDFLAGS .= " -O3";
}
$HIP_DEVLIB_FLAGS = " --hip-device-lib-path=$DEVICE_LIB_PATH";
if ($rdc eq 0) {
$HIPCXXFLAGS .= " $HIP_DEVLIB_FLAGS";
} else {
$HIPLDFLAGS .= " $HIP_DEVLIB_FLAGS";
}
$HIPCXXFLAGS .= " $HIP_DEVLIB_FLAGS";
if ($isWindows) {
$HIPCXXFLAGS .= " -std=c++14 -fms-extensions -fms-compatibility";
} else {
@@ -848,7 +841,21 @@ if ($runCmd) {
print ("HIP ($HIP_PATH) was built using hcc $hipConfig{'HCC_VERSION'}, but you are using $HCC_HOME/hcc with version $HCC_VERSION from hipcc. Please rebuild HIP including cmake or update HCC_HOME variable.\n") ;
die unless $ENV{'HIP_IGNORE_HCC_VERSION'};
}
system ("$CMD") or delete_temp_dirs () and die ();
system ("$CMD");
if ($? == -1) {
print "failed to execute: $!\n";
exit($?);
}
elsif ($? & 127) {
printf "child died with signal %d, %s coredump\n",
($? & 127), ($? & 128) ? 'with' : 'without';
exit($?);
}
else {
$CMD_EXIT_CODE = $? >> 8;
}
$? or delete_temp_dirs ();
exit($CMD_EXIT_CODE);
}
# vim: ts=4:sw=4:expandtab:smartindent
+1 -1
Просмотреть файл
@@ -861,7 +861,7 @@
| `cuMemAllocManaged` | |
| `cuMemAllocPitch` | |
| `cuMemcpy` | |
| `cuMemcpy2D` | |
| `cuMemcpy2D` | `hipMemcpyParam2D` |
| `cuMemcpy2DAsync` | |
| `cuMemcpy2DUnaligned` | |
| `cuMemcpy3D` | |
+113
Просмотреть файл
@@ -246,6 +246,99 @@
| enum |***`cudnnWgradMode_t`*** | |
| 0 |*`CUDNN_WGRAD_MODE_ADD`* | |
| 1 |*`CUDNN_WGRAD_MODE_SET`* | |
| enum |***`cudnnReorderType_t`*** | |
| 0 |*`CUDNN_DEFAULT_REORDER`* | |
| 1 |*`CUDNN_NO_REORDER`* | |
| enum |***`cudnnLossNormalizationMode_t`*** | |
| 0 |*`CUDNN_LOSS_NORMALIZATION_NONE`* | |
| 1 |*`CUDNN_LOSS_NORMALIZATION_SOFTMAX`* | |
| struct |`cudnnFusedOpsConstParamStruct` | |
| struct* |`cudnnFusedOpsConstParamPack_t` | |
| struct |`cudnnFusedOpsVariantParamStruct` | |
| struct* |`cudnnFusedOpsVariantParamPack_t` | |
| struct |`cudnnFusedOpsPlanStruct` | |
| struct* |`cudnnFusedOpsPlan_t` | |
| enum |***`cudnnFusedOps_t`*** | |
| 0 |*`CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS`* | |
| 1 |*`CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD`* | |
| 2 |*`CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING`* | |
| 3 |*`CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE`* | |
| 4 |*`CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION`* | |
| 5 |*`CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK`* | |
| 6 |*`CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM`* | |
| enum |***`cudnnFusedOpsConstParamLabel_t`*** | |
| 0 |*`CUDNN_PARAM_XDESC`* | |
| 1 |*`CUDNN_PARAM_XDATA_PLACEHOLDER`* | |
| 2 |*`CUDNN_PARAM_BN_MODE`* | |
| 3 |*`CUDNN_PARAM_BN_EQSCALEBIAS_DESC`* | |
| 4 |*`CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER`* | |
| 5 |*`CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER`* | |
| 6 |*`CUDNN_PARAM_ACTIVATION_DESC`* | |
| 7 |*`CUDNN_PARAM_CONV_DESC`* | |
| 8 |*`CUDNN_PARAM_WDESC`* | |
| 9 |*`CUDNN_PARAM_WDATA_PLACEHOLDER`* | |
| 10 |*`CUDNN_PARAM_DWDESC`* | |
| 11 |*`CUDNN_PARAM_DWDATA_PLACEHOLDER`* | |
| 12 |*`CUDNN_PARAM_YDESC`* | |
| 13 |*`CUDNN_PARAM_YDATA_PLACEHOLDER`* | |
| 14 |*`CUDNN_PARAM_DYDESC`* | |
| 15 |*`CUDNN_PARAM_DYDATA_PLACEHOLDER`* | |
| 16 |*`CUDNN_PARAM_YSTATS_DESC`* | |
| 17 |*`CUDNN_PARAM_YSUM_PLACEHOLDER`* | |
| 18 |*`CUDNN_PARAM_YSQSUM_PLACEHOLDER`* | |
| 19 |*`CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC`* | |
| 20 |*`CUDNN_PARAM_BN_SCALE_PLACEHOLDER`* | |
| 21 |*`CUDNN_PARAM_BN_BIAS_PLACEHOLDER`* | |
| 22 |*`CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER`* | |
| 23 |*`CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER`* | |
| 24 |*`CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER`* | |
| 25 |*`CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER`* | |
| 26 |*`CUDNN_PARAM_ZDESC`* | |
| 27 |*`CUDNN_PARAM_ZDATA_PLACEHOLDER`* | |
| 28 |*`CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC`* | |
| 29 |*`CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER`* | |
| 30 |*`CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER`* | |
| 31 |*`CUDNN_PARAM_ACTIVATION_BITMASK_DESC`* | |
| 32 |*`CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER`* | |
| 33 |*`CUDNN_PARAM_DXDESC`* | |
| 34 |*`CUDNN_PARAM_DXDATA_PLACEHOLDER`* | |
| 35 |*`CUDNN_PARAM_DZDESC`* | |
| 36 |*`CUDNN_PARAM_DZDATA_PLACEHOLDER`* | |
| 37 |*`CUDNN_PARAM_BN_DSCALE_PLACEHOLDER`* | |
| 38 |*`CUDNN_PARAM_BN_DBIAS_PLACEHOLDER`* | |
| enum |***`cudnnFusedOpsPointerPlaceHolder_t`*** | |
| 0 |*`CUDNN_PTR_NULL`* | |
| 1 |*`CUDNN_PTR_ELEM_ALIGNED`* | |
| 2 |*`CUDNN_PTR_16B_ALIGNED`* | |
| enum |***`cudnnFusedOpsVariantParamLabel_t`*** | |
| 0 |*`CUDNN_PTR_XDATA`* | |
| 1 |*`CUDNN_PTR_BN_EQSCALE`* | |
| 2 |*`CUDNN_PTR_BN_EQBIAS`* | |
| 3 |*`CUDNN_PTR_WDATA`* | |
| 4 |*`CUDNN_PTR_DWDATA`* | |
| 5 |*`CUDNN_PTR_YDATA`* | |
| 6 |*`CUDNN_PTR_DYDATA`* | |
| 7 |*`CUDNN_PTR_YSUM`* | |
| 8 |*`CUDNN_PTR_YSQSUM`* | |
| 9 |*`CUDNN_PTR_WORKSPACE`* | |
| 10 |*`CUDNN_PTR_BN_SCALE`* | |
| 11 |*`CUDNN_PTR_BN_BIAS`* | |
| 12 |*`CUDNN_PTR_BN_SAVED_MEAN`* | |
| 13 |*`CUDNN_PTR_BN_SAVED_INVSTD`* | |
| 14 |*`CUDNN_PTR_BN_RUNNING_MEAN`* | |
| 15 |*`CUDNN_PTR_BN_RUNNING_VAR`* | |
| 16 |*`CUDNN_PTR_ZDATA`* | |
| 17 |*`CUDNN_PTR_BN_Z_EQSCALE`* | |
| 18 |*`CUDNN_PTR_BN_Z_EQBIAS`* | |
| 19 |*`CUDNN_PTR_ACTIVATION_BITMASK`* | |
| 20 |*`CUDNN_PTR_DXDATA`* | |
| 21 |*`CUDNN_PTR_DZDATA`* | |
| 22 |*`CUDNN_PTR_BN_DSCALE`* | |
| 23 |*`CUDNN_PTR_BN_DBIAS`* | |
| 100 |*`CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES`* | |
| 101 |*`CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT`* | |
| 102 |*`CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR`* | |
| 103 |*`CUDNN_SCALAR_DOUBLE_BN_EPSILON`* | |
## **2. CUDNN API functions**
@@ -282,6 +375,7 @@
|`cudnnGetOpTensorDescriptor` |`hipdnnGetOpTensorDescriptor` |
|`cudnnDestroyOpTensorDescriptor` |`hipdnnDestroyOpTensorDescriptor` |
|`cudnnOpTensor` |`hipdnnOpTensor` |
|`cudnnGetFoldedConvBackwardDataDescriptors` | |
|`cudnnCreateReduceTensorDescriptor` |`hipdnnCreateReduceTensorDescriptor` |
|`cudnnSetReduceTensorDescriptor` |`hipdnnSetReduceTensorDescriptor` |
|`cudnnGetReduceTensorDescriptor` |`hipdnnGetReduceTensorDescriptor` |
@@ -296,12 +390,17 @@
|`cudnnGetFilter4dDescriptor` |`hipdnnGetFilter4dDescriptor` |
|`cudnnSetFilterNdDescriptor` |`hipdnnSetFilterNdDescriptor` |
|`cudnnGetFilterNdDescriptor` |`hipdnnGetFilterNdDescriptor` |
|`cudnnGetFilterSizeInBytes` | |
|`cudnnTransformFilter` | |
|`cudnnDestroyFilterDescriptor` |`hipdnnDestroyFilterDescriptor` |
|`cudnnReorderFilterAndBias` | |
|`cudnnCreateConvolutionDescriptor` |`hipdnnCreateConvolutionDescriptor` |
|`cudnnSetConvolutionMathType` |`hipdnnSetConvolutionMathType` |
|`cudnnGetConvolutionMathType` | |
|`cudnnSetConvolutionGroupCount` |`hipdnnSetConvolutionGroupCount` |
|`cudnnGetConvolutionGroupCount` | |
|`cudnnSetConvolutionReorderType` | |
|`cudnnGetConvolutionReorderType` | |
|`cudnnSetConvolution2dDescriptor` |`hipdnnSetConvolution2dDescriptor` |
|`cudnnGetConvolution2dDescriptor` |`hipdnnGetConvolution2dDescriptor` |
|`cudnnGetConvolution2dForwardOutputDim` |`hipdnnGetConvolution2dForwardOutputDim` |
@@ -424,7 +523,9 @@
|`cudnnGetRNNBiasMode` | |
|`cudnnCreateCTCLossDescriptor` | |
|`cudnnSetCTCLossDescriptor` | |
|`cudnnSetCTCLossDescriptorEx` | |
|`cudnnGetCTCLossDescriptor` | |
|`cudnnGetCTCLossDescriptorEx` | |
|`cudnnDestroyCTCLossDescriptor` | |
|`cudnnCTCLoss` | |
|`cudnnGetCTCLossWorkspaceSize` | |
@@ -462,3 +563,15 @@
|`cudnnMultiHeadAttnForward` | |
|`cudnnMultiHeadAttnBackwardData` | |
|`cudnnMultiHeadAttnBackwardWeights` | |
|`cudnnCreateFusedOpsConstParamPack` | |
|`cudnnDestroyFusedOpsConstParamPack` | |
|`cudnnSetFusedOpsConstParamPackAttribute` | |
|`cudnnGetFusedOpsConstParamPackAttribute` | |
|`cudnnCreateFusedOpsVariantParamPack` | |
|`cudnnDestroyFusedOpsVariantParamPack` | |
|`cudnnSetFusedOpsVariantParamPackAttribute` | |
|`cudnnGetFusedOpsVariantParamPackAttribute` | |
|`cudnnCreateFusedOpsPlan` | |
|`cudnnDestroyFusedOpsPlan` | |
|`cudnnMakeFusedOpsPlan` | |
|`cudnnFusedOpsExecute` | |
+67
Просмотреть файл
@@ -35,6 +35,9 @@
- [Warp Cross-Lane Functions](#warp-cross-lane-functions)
* [Warp Vote and Ballot Functions](#warp-vote-and-ballot-functions)
* [Warp Shuffle Functions](#warp-shuffle-functions)
- [Cooperative Groups Functions](#cooperative-groups-functions)
- [Warp Matrix Functions](#warp-matrix-functions)
- [Independent Thread Scheduling](#independent-thread-scheduling)
- [Profiler Counter Function](#profiler-counter-function)
- [Assert](#assert)
- [Printf](#printf)
@@ -599,6 +602,70 @@ float __shfl_xor (float var, int laneMask, int width=warpSize);
```
## Cooperative Groups Functions
Cooperative groups is a mechanism for forming and communicating between groups of threads at
a granularity different than the block. This feature was introduced in Cuda 9.
HIP does not support any of the kernel language cooperative groups
types or functions.
| **Function** | **Supported in HIP** | **Supported in CUDA** |
| --- | --- | --- |
| `void thread_group.sync()` | | ✓ |
| `unsigned thread_group.size()` | | ✓ |
| `unsigned thread_group.thread_rank()` | | ✓ |
| `bool thread_group.is_valid()` | | ✓ |
| `thread_group tiled_partition(thread_group, size)` | | ✓ |
| `thread_block_tile<N> tiled_partition<N>(thread_group)` | | ✓ |
| `thread_block this_thread_block()` | | ✓ |
| `T thread_block_tile.shfl()` | | ✓ |
| `T thread_block_tile.shfl_down()` | | ✓ |
| `T thread_block_tile.shfl_up()` | | ✓ |
| `T thread_block_tile.shfl_xor()` | | ✓ |
| `T thread_block_tile.any()` | | ✓ |
| `T thread_block_tile.all()` | | ✓ |
| `T thread_block_tile.ballot()` | | ✓ |
| `T thread_block_tile.match_any()` | | ✓ |
| `T thread_block_tile.match_all()` | | ✓ |
| `coalesced_group coalesced_threads()` | | ✓ |
| `grid_group this_grid()` | | ✓ |
| `void grid_group.sync()` | | ✓ |
| `unsigned grid_group.size()` | | ✓ |
| `unsigned grid_group.thread_rank()` | | ✓ |
| `bool grid_group.is_valid()` | | ✓ |
| `multi_grid_group this_multi_grid()` | | ✓ |
| `void multi_grid_group.sync()` | | ✓ |
| `unsigned multi_grid_group.size()` | | ✓ |
| `unsigned multi_grid_group.thread_rank()` | | ✓ |
| `bool multi_grid_group.is_valid()` | | ✓ |
## Warp Matrix Functions
Warp matrix functions allow a warp to cooperatively operate on small matrices
whose elements are spread over the lanes in an unspecified manner. This feature
was introduced in Cuda 9.
HIP does not support any of the kernel language warp matrix
types or functions.
| **Function** | **Supported in HIP** | **Supported in CUDA** |
| --- | --- | --- |
| `void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned lda)` | | ✓ |
| `void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned lda, layout_t layout)` | | ✓ |
| `void store_matrix_sync(T* mptr, fragment<...> &a, unsigned lda, layout_t layout)` | | ✓ |
| `void fill_fragment(fragment<...> &a, const T &value)` | | ✓ |
| `void mma_sync(fragment<...> &d, const fragment<...> &a, const fragment<...> &b, const fragment<...> &c , bool sat)` | | ✓ |
## Independent Thread Scheduling
The hardware support for independent thread scheduling introduced in certain architectures
supporting Cuda allows threads to progress independently of each other and enables
intra-warp synchronizations that were previously not allowed.
HIP does not support this type of scheduling.
## Profiler Counter Function
The Cuda `__prof_trigger()` instruction is not supported.
+8 -8
Просмотреть файл
@@ -142,9 +142,9 @@ To run it:
* Path to cuDNN should be specified by the `CUDA_DNN_ROOT_DIR` option:
- Linux: `-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.0-v7.5.1.10`
- Linux: `-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.0-v7.6.0.64`
- Windows: `-DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-9.0-windows10-x64-v7.5.1.10`
- Windows: `-DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-9.0-windows10-x64-v7.6.0.64`
5. Ensure [`python`](https://www.python.org/downloads) of minimum required version 2.7 is installed.
@@ -178,9 +178,9 @@ To run it:
On Linux the following configurations are tested:
Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.5.1.10
Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.0.64
Ubuntu 16-18: LLVM 8.0.0, CUDA 8.0 - 10.0, cudnn-5.1.10 - cudnn-7.5.1.10
Ubuntu 16-18: LLVM 8.0.0, CUDA 8.0 - 10.0, cudnn-5.1.10 - cudnn-7.6.0.64
Build system for the above configurations:
@@ -195,7 +195,7 @@ cmake
-DCMAKE_INSTALL_PREFIX=../dist \
-DCMAKE_PREFIX_PATH=/srv/git/LLVM/8.0.0/dist \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.0 \
-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.0-v7.5.1.10 \
-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.0-v7.6.0.64 \
-DLLVM_EXTERNAL_LIT=/srv/git/LLVM/8.0.0/build/bin/llvm-lit \
..
```
@@ -311,9 +311,9 @@ On Windows 10 the following configurations are tested:
LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn-5.1.10 - cudnn-7.1.4.18
LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn-7.0.5.15 - cudnn-7.5.1.10
LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn-7.0.5.15 - cudnn-7.6.0.64
LLVM 7.0.0 - 8.0.0 (with patch*), CUDA 7.5 - 10.0, cudnn-7.0.5.15 - cudnn-7.5.1.10
LLVM 7.0.0 - 8.0.0 (with patch*), CUDA 7.5 - 10.0, cudnn-7.0.5.15 - cudnn-7.6.0.64
Build system for the above configurations:
@@ -330,7 +330,7 @@ cmake
-DCMAKE_PREFIX_PATH=f:/LLVM/6.0.1/dist \
-DCUDA_TOOLKIT_ROOT_DIR="c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0" \
-DCUDA_SDK_ROOT_DIR="c:/ProgramData/NVIDIA Corporation/CUDA Samples/v9.0" \
-DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-9.0-windows10-x64-v7.5.1.10 \
-DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-9.0-windows10-x64-v7.6.0.64 \
-DLLVM_EXTERNAL_LIT=f:/LLVM/6.0.1/build/Release/bin/llvm-lit.py \
-Thost=x64
..
+1
Просмотреть файл
@@ -33,6 +33,7 @@ const std::map <llvm::StringRef, hipCounter> CUDA_INCLUDE_MAP{
{"driver_types.h", {"hip/driver_types.h", "", CONV_INCLUDE, API_RUNTIME}},
{"cuda_fp16.h", {"hip/hip_fp16.h", "", CONV_INCLUDE, API_RUNTIME}},
{"cuda_texture_types.h", {"hip/hip_texture_types.h", "", CONV_INCLUDE, API_RUNTIME}},
{"texture_fetch_functions.h", {"", "", CONV_INCLUDE, API_RUNTIME}},
{"vector_types.h", {"hip/hip_vector_types.h", "", CONV_INCLUDE, API_RUNTIME}},
{"cuda_profiler_api.h", {"hip/hip_profile.h", "", CONV_INCLUDE, API_RUNTIME}},
// cuComplex includes
+22
Просмотреть файл
@@ -61,6 +61,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DNN_FUNCTION_MAP{
{"cudnnGetOpTensorDescriptor", {"hipdnnGetOpTensorDescriptor", "", CONV_LIB_FUNC, API_DNN}},
{"cudnnDestroyOpTensorDescriptor", {"hipdnnDestroyOpTensorDescriptor", "", CONV_LIB_FUNC, API_DNN}},
{"cudnnOpTensor", {"hipdnnOpTensor", "", CONV_LIB_FUNC, API_DNN}},
{"cudnnGetFoldedConvBackwardDataDescriptors", {"hipdnnGetFoldedConvBackwardDataDescriptors", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
// cuDNN Reduce Tensor functions
{"cudnnCreateReduceTensorDescriptor", {"hipdnnCreateReduceTensorDescriptor", "", CONV_LIB_FUNC, API_DNN}},
@@ -79,7 +80,10 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DNN_FUNCTION_MAP{
{"cudnnGetFilter4dDescriptor", {"hipdnnGetFilter4dDescriptor", "", CONV_LIB_FUNC, API_DNN}},
{"cudnnSetFilterNdDescriptor", {"hipdnnSetFilterNdDescriptor", "", CONV_LIB_FUNC, API_DNN}},
{"cudnnGetFilterNdDescriptor", {"hipdnnGetFilterNdDescriptor", "", CONV_LIB_FUNC, API_DNN}},
{"cudnnGetFilterSizeInBytes", {"hipdnnGetFilterSizeInBytes", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnTransformFilter", {"hipdnnTransformFilter", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnDestroyFilterDescriptor", {"hipdnnDestroyFilterDescriptor", "", CONV_LIB_FUNC, API_DNN}},
{"cudnnReorderFilterAndBias", {"hipdnnReorderFilterAndBias", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
// cuDNN Convolution functions
{"cudnnCreateConvolutionDescriptor", {"hipdnnCreateConvolutionDescriptor", "", CONV_LIB_FUNC, API_DNN}},
@@ -87,6 +91,8 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DNN_FUNCTION_MAP{
{"cudnnGetConvolutionMathType", {"hipdnnGetConvolutionMathType", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnSetConvolutionGroupCount", {"hipdnnSetConvolutionGroupCount", "", CONV_LIB_FUNC, API_DNN}},
{"cudnnGetConvolutionGroupCount", {"hipdnnGetConvolutionGroupCount", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnSetConvolutionReorderType", {"hipdnnSetConvolutionReorderType", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnGetConvolutionReorderType", {"hipdnnGetConvolutionReorderType", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnSetConvolution2dDescriptor", {"hipdnnSetConvolution2dDescriptor", "", CONV_LIB_FUNC, API_DNN}},
{"cudnnGetConvolution2dDescriptor", {"hipdnnGetConvolution2dDescriptor", "", CONV_LIB_FUNC, API_DNN}},
{"cudnnGetConvolution2dForwardOutputDim", {"hipdnnGetConvolution2dForwardOutputDim", "", CONV_LIB_FUNC, API_DNN}},
@@ -235,7 +241,9 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DNN_FUNCTION_MAP{
// cuDNN Connectionist Temporal Classification loss functions
{"cudnnCreateCTCLossDescriptor", {"hipdnnCreateCTCLossDescriptor", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnSetCTCLossDescriptor", {"hipdnnSetCTCLossDescriptor", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnSetCTCLossDescriptorEx", {"hipdnnSetCTCLossDescriptorEx", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnGetCTCLossDescriptor", {"hipdnnGetCTCLossDescriptor", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnGetCTCLossDescriptorEx", {"hipdnnGetCTCLossDescriptorEx", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnDestroyCTCLossDescriptor", {"hipdnnDestroyCTCLossDescriptor", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnCTCLoss", {"hipdnnCTCLoss", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnGetCTCLossWorkspaceSize", {"hipdnnGetCTCLossWorkspaceSize", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
@@ -274,4 +282,18 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DNN_FUNCTION_MAP{
{"cudnnMultiHeadAttnForward", {"hipdnnMultiHeadAttnForward", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnMultiHeadAttnBackwardData", {"hipdnnMultiHeadAttnBackwardData", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnMultiHeadAttnBackwardWeights", {"hipdnnMultiHeadAttnBackwardWeights", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
// cuDNN Fuse functions
{"cudnnCreateFusedOpsConstParamPack", {"hipdnnCreateFusedOpsConstParamPack", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnDestroyFusedOpsConstParamPack", {"hipdnnDestroyFusedOpsConstParamPack", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnSetFusedOpsConstParamPackAttribute", {"hipdnnSetFusedOpsConstParamPackAttribute", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnGetFusedOpsConstParamPackAttribute", {"hipdnnGetFusedOpsConstParamPackAttribute", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnCreateFusedOpsVariantParamPack", {"hipdnnCreateFusedOpsVariantParamPack", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnDestroyFusedOpsVariantParamPack", {"hipdnnDestroyFusedOpsVariantParamPack", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnSetFusedOpsVariantParamPackAttribute", {"hipdnnSetFusedOpsVariantParamPackAttribute", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnGetFusedOpsVariantParamPackAttribute", {"hipdnnGetFusedOpsVariantParamPackAttribute", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnCreateFusedOpsPlan", {"hipdnnCreateFusedOpsPlan", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnDestroyFusedOpsPlan", {"hipdnnDestroyFusedOpsPlan", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnMakeFusedOpsPlan", {"hipdnnMakeFusedOpsPlan", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
{"cudnnFusedOpsExecute", {"hipdnnFusedOpsExecute", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
};
+93 -1
Просмотреть файл
@@ -240,6 +240,93 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DNN_TYPE_NAME_MAP{
{"cudnnWgradMode_t", {"hipdnnWgradMode_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
{"CUDNN_WGRAD_MODE_ADD", {"HIPDNN_WGRAD_MODE_ADD", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0
{"CUDNN_WGRAD_MODE_SET", {"HIPDNN_WGRAD_MODE_SET", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1
{"cudnnReorderType_t", {"hipdnnReorderType_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
{"CUDNN_DEFAULT_REORDER", {"HIPDNN_DEFAULT_REORDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0
{"CUDNN_NO_REORDER", {"HIPDNN_NO_REORDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1
{"cudnnLossNormalizationMode_t", {"hipdnnLossNormalizationMode_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
{"CUDNN_LOSS_NORMALIZATION_NONE", {"HIPDNN_LOSS_NORMALIZATION_NONE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0
{"CUDNN_LOSS_NORMALIZATION_SOFTMAX", {"HIPDNN_LOSS_NORMALIZATION_SOFTMAX", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1
{"cudnnFusedOps_t", {"hipdnnFusedOps_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
{"CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS", {"HIPDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0
{"CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD", {"HIPDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1
{"CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING", {"HIPDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 2
{"CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE", {"HIPDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 3
{"CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION", {"HIPDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 4
{"CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK", {"HIPDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 5
{"CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM", {"HIPDNN_FUSED_DACTIVATION_FORK_DBATCHNORM", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 6
{"cudnnFusedOpsConstParamLabel_t", {"hipdnnFusedOpsConstParamLabel_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
{"CUDNN_PARAM_XDESC", {"HIPDNN_PARAM_XDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0
{"CUDNN_PARAM_XDATA_PLACEHOLDER", {"HIPDNN_PARAM_XDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1
{"CUDNN_PARAM_BN_MODE", {"HIPDNN_PARAM_BN_MODE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 2
{"CUDNN_PARAM_BN_EQSCALEBIAS_DESC", {"HIPDNN_PARAM_BN_EQSCALEBIAS_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 3
{"CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER", {"HIPDNN_PARAM_BN_EQSCALE_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 4
{"CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER", {"HIPDNN_PARAM_BN_EQBIAS_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 5
{"CUDNN_PARAM_ACTIVATION_DESC", {"HIPDNN_PARAM_ACTIVATION_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 6
{"CUDNN_PARAM_CONV_DESC", {"HIPDNN_PARAM_CONV_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 7
{"CUDNN_PARAM_WDESC", {"HIPDNN_PARAM_WDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 8
{"CUDNN_PARAM_WDATA_PLACEHOLDER", {"HIPDNN_PARAM_WDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 9
{"CUDNN_PARAM_DWDESC", {"HIPDNN_PARAM_DWDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 10
{"CUDNN_PARAM_DWDATA_PLACEHOLDER", {"HIPDNN_PARAM_DWDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 11
{"CUDNN_PARAM_YDESC", {"HIPDNN_PARAM_YDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 12
{"CUDNN_PARAM_YDATA_PLACEHOLDER", {"HIPDNN_PARAM_YDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 13
{"CUDNN_PARAM_DYDESC", {"HIPDNN_PARAM_DYDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 14
{"CUDNN_PARAM_DYDATA_PLACEHOLDER", {"HIPDNN_PARAM_DYDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 15
{"CUDNN_PARAM_YSTATS_DESC", {"HIPDNN_PARAM_YSTATS_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 16
{"CUDNN_PARAM_YSUM_PLACEHOLDER", {"HIPDNN_PARAM_YSUM_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 17
{"CUDNN_PARAM_YSQSUM_PLACEHOLDER", {"HIPDNN_PARAM_YSQSUM_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 18
{"CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC", {"HIPDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 19
{"CUDNN_PARAM_BN_SCALE_PLACEHOLDER", {"HIPDNN_PARAM_BN_SCALE_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 20
{"CUDNN_PARAM_BN_BIAS_PLACEHOLDER", {"HIPDNN_PARAM_BN_BIAS_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 21
{"CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER", {"HIPDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 22
{"CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER", {"HIPDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 23
{"CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER", {"HIPDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 24
{"CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER", {"HIPDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 25
{"CUDNN_PARAM_ZDESC", {"HIPDNN_PARAM_ZDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 26
{"CUDNN_PARAM_ZDATA_PLACEHOLDER", {"HIPDNN_PARAM_ZDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 27
{"CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC", {"HIPDNN_PARAM_BN_Z_EQSCALEBIAS_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 28
{"CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER", {"HIPDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 29
{"CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER", {"HIPDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 30
{"CUDNN_PARAM_ACTIVATION_BITMASK_DESC", {"HIPDNN_PARAM_ACTIVATION_BITMASK_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 31
{"CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER", {"HIPDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 32
{"CUDNN_PARAM_DXDESC", {"HIPDNN_PARAM_DXDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 33
{"CUDNN_PARAM_DXDATA_PLACEHOLDER", {"HIPDNN_PARAM_DXDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 34
{"CUDNN_PARAM_DZDESC", {"HIPDNN_PARAM_DZDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 35
{"CUDNN_PARAM_DZDATA_PLACEHOLDER", {"HIPDNN_PARAM_DZDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 36
{"CUDNN_PARAM_BN_DSCALE_PLACEHOLDER", {"HIPDNN_PARAM_BN_DSCALE_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 37
{"CUDNN_PARAM_BN_DBIAS_PLACEHOLDER", {"HIPDNN_PARAM_BN_DBIAS_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 38
{"cudnnFusedOpsPointerPlaceHolder_t", {"hipdnnActivationMode_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
{"CUDNN_PTR_NULL", {"HIPDNN_ACTIVATION_SIGMOID", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0
{"CUDNN_PTR_ELEM_ALIGNED", {"HIPDNN_ACTIVATION_RELU", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1
{"CUDNN_PTR_16B_ALIGNED", {"HIPDNN_ACTIVATION_TANH", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 2
{"cudnnFusedOpsVariantParamLabel_t", {"hipdnnFusedOpsVariantParamLabel_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
{"CUDNN_PTR_XDATA", {"HIPDNN_PTR_XDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0
{"CUDNN_PTR_BN_EQSCALE", {"HIPDNN_PTR_BN_EQSCALE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1
{"CUDNN_PTR_BN_EQBIAS", {"HIPDNN_PTR_BN_EQBIAS", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 2
{"CUDNN_PTR_WDATA", {"HIPDNN_PTR_WDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 3
{"CUDNN_PTR_DWDATA", {"HIPDNN_PTR_DWDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 4
{"CUDNN_PTR_YDATA", {"HIPDNN_PTR_YDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 5
{"CUDNN_PTR_DYDATA", {"HIPDNN_PTR_DYDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 6
{"CUDNN_PTR_YSUM", {"HIPDNN_PTR_YSUM", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 7
{"CUDNN_PTR_YSQSUM", {"HIPDNN_PTR_YSQSUM", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 8
{"CUDNN_PTR_WORKSPACE", {"HIPDNN_PTR_WORKSPACE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 9
{"CUDNN_PTR_BN_SCALE", {"HIPDNN_PTR_BN_SCALE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 10
{"CUDNN_PTR_BN_BIAS", {"HIPDNN_PTR_BN_BIAS", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 11
{"CUDNN_PTR_BN_SAVED_MEAN", {"HIPDNN_PTR_BN_SAVED_MEAN", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 12
{"CUDNN_PTR_BN_SAVED_INVSTD", {"HIPDNN_PTR_BN_SAVED_INVSTD", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 13
{"CUDNN_PTR_BN_RUNNING_MEAN", {"HIPDNN_PTR_BN_RUNNING_MEAN", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 14
{"CUDNN_PTR_BN_RUNNING_VAR", {"HIPDNN_PTR_BN_RUNNING_VAR", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 15
{"CUDNN_PTR_ZDATA", {"HIPDNN_PTR_ZDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 16
{"CUDNN_PTR_BN_Z_EQSCALE", {"HIPDNN_PTR_BN_Z_EQSCALE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 17
{"CUDNN_PTR_BN_Z_EQBIAS", {"HIPDNN_PTR_BN_Z_EQBIAS", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 18
{"CUDNN_PTR_ACTIVATION_BITMASK", {"HIPDNN_PTR_ACTIVATION_BITMASK", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 19
{"CUDNN_PTR_DXDATA", {"HIPDNN_PTR_DXDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 20
{"CUDNN_PTR_DZDATA", {"HIPDNN_PTR_DZDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 21
{"CUDNN_PTR_BN_DSCALE", {"HIPDNN_PTR_BN_DSCALE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 22
{"CUDNN_PTR_BN_DBIAS", {"HIPDNN_PTR_BN_DBIAS", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 23
{"CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES", {"HIPDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 100
{"CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT", {"HIPDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 101
{"CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR", {"HIPDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 102
{"CUDNN_SCALAR_DOUBLE_BN_EPSILON", {"HIPDNN_SCALAR_DOUBLE_BN_EPSILON", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 103
// cuDNN types
{"cudnnContext", {"hipdnnContext", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
@@ -288,5 +375,10 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DNN_TYPE_NAME_MAP{
{"cudnnSeqDataDescriptor_t", {"hipdnnSeqDataDescriptor_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
{"cudnnAttnStruct", {"hipdnnAttnStruct", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
{"cudnnAttnDescriptor_t", {"hipdnnAttnDescriptor_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
{"cudnnFusedOpsConstParamStruct", {"hipdnnFusedOpsConstParamStruct", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
{"cudnnFusedOpsConstParamPack_t", {"hipdnnFusedOpsConstParamPack_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
{"cudnnFusedOpsVariantParamStruct", {"hipdnnFusedOpsVariantParamStruct", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
{"cudnnFusedOpsVariantParamPack_t", {"hipdnnFusedOpsVariantParamPack_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
{"cudnnFusedOpsPlanStruct", {"hipdnnFusedOpsPlanStruct", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
{"cudnnFusedOpsPlan_t", {"hipdnnFusedOpsPlan_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
};
+2 -2
Просмотреть файл
@@ -175,8 +175,8 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DRIVER_FUNCTION_MAP{
{"cuMemcpy", {"hipMemcpy_", "", CONV_MEMORY, API_DRIVER, HIP_UNSUPPORTED}},
// no analogue
// NOTE: Not equal to cudaMemcpy2D due to different signatures
{"cuMemcpy2D", {"hipMemcpy2D_", "", CONV_MEMORY, API_DRIVER, HIP_UNSUPPORTED}},
{"cuMemcpy2D_v2", {"hipMemcpy2D_", "", CONV_MEMORY, API_DRIVER, HIP_UNSUPPORTED}},
{"cuMemcpy2D", {"hipMemcpyParam2D", "", CONV_MEMORY, API_DRIVER}},
{"cuMemcpy2D_v2", {"hipMemcpyParam2D", "", CONV_MEMORY, API_DRIVER}},
// no analogue
// NOTE: Not equal to cudaMemcpy2DAsync due to different signatures
{"cuMemcpy2DAsync", {"hipMemcpy2DAsync_", "", CONV_MEMORY, API_DRIVER, HIP_UNSUPPORTED}},
+3
Просмотреть файл
@@ -189,6 +189,9 @@ bool HipifyAction::Exclude(const hipCounter & hipToken) {
}
return false;
case CONV_INCLUDE:
if (hipToken.hipName.empty()) {
return true;
}
switch (hipToken.apiType) {
case API_RAND:
if (hipToken.hipName == "hiprand_kernel.h") {
+5 -1
Просмотреть файл
@@ -59,7 +59,11 @@ void EnterPreprocessorTokenStream(clang::Preprocessor& _pp, const clang::Token *
#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR == 8)
_pp.EnterTokenStream(start, len, false, DisableMacroExpansion);
#else
_pp.EnterTokenStream(clang::ArrayRef<clang::Token>{start, len}, DisableMacroExpansion);
#if (LLVM_VERSION_MAJOR < 9)
_pp.EnterTokenStream(clang::ArrayRef<clang::Token>{start, len}, DisableMacroExpansion);
#else
_pp.EnterTokenStream(clang::ArrayRef<clang::Token>{start, len}, DisableMacroExpansion, false);
#endif
#endif
}
+8 -2
Просмотреть файл
@@ -1009,14 +1009,20 @@ void __syncthreads()
#define GETREG_IMMED(SZ,OFF,REG) (SZ << 11) | (OFF << 6) | REG
/*
__smid returns the wave's assigned Compute Unit and Shader Engine.
The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
Note: the results vary over time.
SZ minus 1 since SIZE is 1-based.
*/
__device__
inline
unsigned __smid(void)
{
unsigned cu_id = __builtin_amdgcn_s_getreg(
GETREG_IMMED(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID));
GETREG_IMMED(HW_ID_CU_ID_SIZE-1, HW_ID_CU_ID_OFFSET, HW_ID));
unsigned se_id = __builtin_amdgcn_s_getreg(
GETREG_IMMED(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID));
GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
/* Each shader engine has 16 CU */
return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
+14 -14
Просмотреть файл
@@ -80,22 +80,22 @@ typedef struct hipArray {
}hipArray;
typedef struct hip_Memcpy2D {
size_t height;
size_t widthInBytes;
hipArray* dstArray;
hipDeviceptr_t dstDevice;
void* dstHost;
hipMemoryType dstMemoryType;
size_t dstPitch;
size_t dstXInBytes;
size_t dstY;
hipArray* srcArray;
hipDeviceptr_t srcDevice;
const void* srcHost;
hipMemoryType srcMemoryType;
size_t srcPitch;
size_t srcXInBytes;
size_t srcY;
hipMemoryType srcMemoryType;
const void* srcHost;
hipDeviceptr_t srcDevice;
hipArray* srcArray;
size_t srcPitch;
size_t dstXInBytes;
size_t dstY;
hipMemoryType dstMemoryType;
void* dstHost;
hipDeviceptr_t dstDevice;
hipArray* dstArray;
size_t dstPitch;
size_t WidthInBytes;
size_t Height;
} hip_Memcpy2D;
+92 -128
Просмотреть файл
@@ -120,51 +120,102 @@ THE SOFTWARE.
ret.y = lhs.y * rhs; \
return ret; \
}
#define MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ComplexT, T) \
explicit __device__ __host__ ComplexT(T val) : x(val), y(val) {} \
__device__ __host__ ComplexT(T val1, T val2) : x(val1), y(val2) {}
#endif
struct hipFloatComplex {
#ifdef __cplusplus
public:
typedef float value_type;
__device__ __host__ hipFloatComplex() : x(0.0f), y(0.0f) {}
explicit __device__ __host__ hipFloatComplex(float x) : x(x), y(0.0f) {}
__device__ __host__ hipFloatComplex(float x, float y) : x(x), y(y) {}
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, unsigned short)
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, signed short)
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, unsigned int)
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, signed int)
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, double)
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, unsigned long)
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, signed long)
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, unsigned long long)
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, signed long long)
#endif
float x, y;
} __attribute__((aligned(8)));
typedef float2 hipFloatComplex;
__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return z.x; }
__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return z.y; }
__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
hipFloatComplex z;
z.x = a;
z.y = b;
return z;
}
__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
hipFloatComplex ret;
ret.x = z.x;
ret.y = -z.y;
return ret;
}
__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) {
return z.x * z.x + z.y * z.y;
}
__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
return make_hipFloatComplex(p.x + q.x, p.y + q.y);
}
__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
return make_hipFloatComplex(p.x - q.x, p.y - q.y);
}
__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
}
__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
float sqabs = hipCsqabsf(q);
hipFloatComplex ret;
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
ret.y = (p.y * q.x - p.x * q.y) / sqabs;
return ret;
}
__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }
typedef double2 hipDoubleComplex;
__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return z.x; }
__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return z.y; }
__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
hipDoubleComplex z;
z.x = a;
z.y = b;
return z;
}
__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
hipDoubleComplex ret;
ret.x = z.x;
ret.y = z.y;
return ret;
}
__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) {
return z.x * z.x + z.y * z.y;
}
__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
}
__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
}
__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
}
__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
double sqabs = hipCsqabs(q);
hipDoubleComplex ret;
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
ret.y = (p.y * q.x - p.x * q.y) / sqabs;
return ret;
}
__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return sqrtf(hipCsqabs(z)); }
struct hipDoubleComplex {
#ifdef __cplusplus
public:
typedef double value_type;
__device__ __host__ hipDoubleComplex() : x(0.0f), y(0.0f) {}
explicit __device__ __host__ hipDoubleComplex(double x) : x(x), y(0.0f) {}
__device__ __host__ hipDoubleComplex(double x, double y) : x(x), y(y) {}
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned short)
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed short)
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned int)
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed int)
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, float)
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned long)
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed long)
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned long long)
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed long long)
#endif
double x, y;
} __attribute__((aligned(16)));
#if __cplusplus
@@ -214,93 +265,6 @@ COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long)
#endif
__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return z.x; }
__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return z.y; }
__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
hipFloatComplex z;
z.x = a;
z.y = b;
return z;
}
__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
hipFloatComplex ret;
ret.x = z.x;
ret.y = -z.y;
return ret;
}
__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) {
return z.x * z.x + z.y * z.y;
}
__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
return make_hipFloatComplex(p.x + q.x, p.y + q.y);
}
__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
return make_hipFloatComplex(p.x - q.x, p.y - q.y);
}
__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
}
__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
float sqabs = hipCsqabsf(q);
hipFloatComplex ret;
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
ret.y = (p.y * q.x - p.x * q.y) / sqabs;
return ret;
}
__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }
__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return z.x; }
__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return z.y; }
__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
hipDoubleComplex z;
z.x = a;
z.y = b;
return z;
}
__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
hipDoubleComplex ret;
ret.x = z.x;
ret.y = z.y;
return ret;
}
__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) {
return z.x * z.x + z.y * z.y;
}
__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
}
__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
}
__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
}
__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
double sqabs = hipCsqabs(q);
hipDoubleComplex ret;
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
ret.y = (p.y * q.x - p.x * q.y) / sqabs;
return ret;
}
__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return sqrtf(hipCsqabs(z)); }
typedef hipFloatComplex hipComplex;
+1 -1
Просмотреть файл
@@ -34,7 +34,7 @@ THE SOFTWARE.
#include <utility>
#endif
#if defined(__clang__) && (__clang_major__ > 5)
#if __HCC_OR_HIP_CLANG__
typedef _Float16 _Float16_2 __attribute__((ext_vector_type(2)));
struct __half_raw {
+124
Просмотреть файл
@@ -275,6 +275,15 @@ typedef struct dim3 {
#endif
} dim3;
typedef struct hipLaunchParams_t {
void* func; ///< Device function symbol
dim3 gridDim; ///< Grid dimentions
dim3 blockDim; ///< Block dimentions
void **args; ///< Arguments
size_t sharedMem; ///< Shared memory
hipStream_t stream; ///< Stream identifier
} hipLaunchParams;
// Doxygen end group GlobalDefs
/** @} */
@@ -1858,6 +1867,16 @@ hipError_t hipMalloc3DArray(hipArray** array, const struct hipChannelFormatDesc*
*/
hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
size_t height, hipMemcpyKind kind);
/**
* @brief Copies memory for 2D arrays.
* @param[in] pCopy Parameters for the memory copy
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
* #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
*
* @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray,
* hipMemcpyToSymbol, hipMemcpyAsync
*/
hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy);
/**
@@ -2832,6 +2851,77 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigne
unsigned int sharedMemBytes, hipStream_t stream,
void** kernelParams, void** extra);
/**
* @brief launches kernel f with launch parameters and shared memory on stream with arguments passed
* to kernelparams or extra, where thread blocks can cooperate and synchronize as they execute
*
* @param [in] f Kernel to launch.
* @param [in] gridDim Grid dimensions specified as multiple of blockDim.
* @param [in] blockDim Block dimensions specified in work-items
* @param [in] kernelParams A list of kernel arguments
* @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The
* kernel can access this with HIP_DYNAMIC_SHARED.
* @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th
* default stream is used with associated synchronization rules.
*
* @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
*/
hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX,
void** kernelParams, unsigned int sharedMemBytes,
hipStream_t stream);
/**
* @brief Launches kernels on multiple devices where thread blocks can cooperate and
* synchronize as they execute.
*
* @param [in] hipLaunchParams List of launch parameters, one per device.
* @param [in] numDevices Size of the launchParamsList array.
* @param [in] flags Flags to control launch behavior.
*
* @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
*/
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
int numDevices, unsigned int flags);
/**
* @brief Returns occupancy for a device function.
*
* @param [out] numBlocks Returned occupancy
* @param [in] func Kernel function for which occupancy is calulated
* @param [in] blockSize Block size the kernel is intended to be launched with
* @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes
*/
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize);
/**
* @brief Returns occupancy for a device function.
*
* @param [out] numBlocks Returned occupancy
* @param [in] func Kernel function for which occupancy is calulated
* @param [in] blockSize Block size the kernel is intended to be launched with
* @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes
* @param [in] flags Extra flags for occupancy calculation (currently ignored)
*/
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize, unsigned int flags);
/**
* @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched
* on respective streams before enqueuing any other work on the specified streams from any other threads
*
*
* @param [in] hipLaunchParams List of launch parameters, one per device.
* @param [in] numDevices Size of the launchParamsList array.
* @param [in] flags Flags to control launch behavior.
*
* @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
*/
hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
int numDevices, unsigned int flags);
// doxygen end Version Management
/**
* @}
@@ -3160,6 +3250,40 @@ hipError_t hipBindTextureToMipmappedArray(const texture<T, dim, readMode>& tex,
return hipSuccess;
}
template <class T>
inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
int* numBlocks, T f, int blockSize, size_t dynamicSMemSize) {
return hipOccupancyMaxActiveBlocksPerMultiprocessor(
numBlocks, reinterpret_cast<const void*>(f), blockSize, dynamicSMemSize);
}
template <class T>
inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
int* numBlocks, T f, int blockSize, size_t dynamicSMemSize, unsigned int flags) {
return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
numBlocks, reinterpret_cast<const void*>(f), blockSize, dynamicSMemSize, flags);
}
template <class T>
inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) {
return hipLaunchCooperativeKernel(reinterpret_cast<const void*>(f), gridDim,
blockDim, kernelParams, sharedMemBytes, stream);
}
template <class T>
inline hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
unsigned int numDevices, unsigned int flags = 0) {
return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags);
}
template <class T>
inline hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
unsigned int numDevices, unsigned int flags = 0) {
return hipExtLaunchMultiKernelMultiDevice(launchParamsList, numDevices, flags);
}
/*
* @brief Unbinds the textuer bound to @p tex
*
+6 -1
Просмотреть файл
@@ -45,10 +45,15 @@ THE SOFTWARE.
* *
* *
*******************************************************************************/
#if __HIP__
#define __HIP_TEXTURE_ATTRIB __attribute__((device_builtin_texture_type))
#else
#define __HIP_TEXTURE_ATTRIB
#endif
template <class T, int texType = hipTextureType1D,
enum hipTextureReadMode mode = hipReadModeElementType>
struct texture : public textureReference {
struct __HIP_TEXTURE_ATTRIB texture : public textureReference {
texture(int norm = 0, enum hipTextureFilterMode fMode = hipFilterModePoint,
enum hipTextureAddressMode aMode = hipAddressModeClamp) {
normalized = norm;
+4
Просмотреть файл
@@ -115,6 +115,8 @@ typedef struct hipDeviceProp_t {
int canMapHostMemory; ///< Check whether HIP can map host memory
int gcnArch; ///< AMD GCN Arch Value. Eg: 803, 701
int integrated; ///< APU vs dGPU
int cooperativeLaunch; ///< HIP device supports cooperative launch
int cooperativeMultiDeviceLaunch; ///< HIP device supports cooperative launch on multiple devices
} hipDeviceProp_t;
@@ -291,6 +293,8 @@ typedef enum hipDeviceAttribute_t {
///< Multiprocessor.
hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices.
hipDeviceAttributeIntegrated, ///< iGPU
hipDeviceAttributeCooperativeLaunch, ///< Support cooperative launch
hipDeviceAttributeCooperativeMultiDeviceLaunch, ///< Support cooperative launch on multiple devices
} hipDeviceAttribute_t;
enum hipComputeMode {
+5
Просмотреть файл
@@ -162,6 +162,7 @@ typedef CUdeviceptr hipDeviceptr_t;
typedef struct cudaArray hipArray;
typedef struct cudaArray* hipArray_const_t;
typedef cudaFuncAttributes hipFuncAttributes;
#define hip_Memcpy2D CUDA_MEMCPY2D
#define hipMemcpy3DParms cudaMemcpy3DParms
#define hipArrayDefault cudaArrayDefault
#define hipArrayLayered cudaArrayLayered
@@ -578,6 +579,10 @@ inline static hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src,
cudaMemcpy2D(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind)));
}
inline static hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) {
return hipCUResultTohipError(cuMemcpy2D(pCopy));
}
inline static hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p)
{
return hipCUDAErrorTohipError(cudaMemcpy3D(p));
+3 -1
Просмотреть файл
@@ -3,7 +3,9 @@ project(hip_hcc)
install(FILES @PROJECT_BINARY_DIR@/libhip_hcc.so DESTINATION lib)
install(FILES @PROJECT_BINARY_DIR@/libhip_hcc_static.a DESTINATION lib)
install(FILES @PROJECT_BINARY_DIR@/libhiprtc.so DESTINATION lib)
if(NOT @HIP_COMPILER@ STREQUAL "clang")
install(FILES @PROJECT_BINARY_DIR@/libhiprtc.so DESTINATION lib)
endif()
install(FILES @PROJECT_BINARY_DIR@/.hipInfo DESTINATION lib)
install(FILES @PROJECT_BINARY_DIR@/hip-config.cmake @PROJECT_BINARY_DIR@/hip-config-version.cmake DESTINATION lib/cmake/hip)
install(FILES @hip_SOURCE_DIR@/packaging/hip-targets.cmake @hip_SOURCE_DIR@/packaging/hip-targets-release.cmake DESTINATION lib/cmake/hip)
+2 -2
Просмотреть файл
@@ -71,8 +71,8 @@ bool runTest(int argc, char** argv) {
copyParam.srcMemoryType = hipMemoryTypeHost;
copyParam.srcHost = hData;
copyParam.srcPitch = width * sizeof(float);
copyParam.widthInBytes = copyParam.srcPitch;
copyParam.height = height;
copyParam.WidthInBytes = copyParam.srcPitch;
copyParam.Height = height;
hipMemcpyParam2D(&copyParam);
textureReference* texref;
+2 -2
Просмотреть файл
@@ -1715,8 +1715,8 @@ hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) {
if (pCopy == nullptr) {
e = hipErrorInvalidValue;
}
e = ihipMemcpy2D(pCopy->dstArray->data, pCopy->widthInBytes, pCopy->srcHost, pCopy->srcPitch,
pCopy->widthInBytes, pCopy->height, hipMemcpyDefault);
e = ihipMemcpy2D(pCopy->dstArray->data, pCopy->WidthInBytes, pCopy->srcHost, pCopy->srcPitch,
pCopy->WidthInBytes, pCopy->Height, hipMemcpyDefault);
return ihipLogStatus(e);
}
+64 -20
Просмотреть файл
@@ -59,6 +59,18 @@ using namespace std;
static const size_t HIP_IMPLICIT_KERNARG_SIZE = 48;
static const size_t HIP_IMPLICIT_KERNARG_ALIGNMENT = 8;
struct amd_kernel_code_v3_t {
uint32_t group_segment_fixed_size;
uint32_t private_segment_fixed_size;
uint8_t reserved0[8];
int64_t kernel_code_entry_byte_offset;
uint8_t reserved1[24];
uint32_t compute_pgm_rsrc1;
uint32_t compute_pgm_rsrc2;
uint16_t kernel_code_properties;
uint8_t reserved2[6];
};
// calculate MD5 checksum
inline std::string checksum(size_t size, const char *source) {
// FNV-1a hashing, 64-bit version
@@ -191,7 +203,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
lp.dynamic_group_mem_bytes =
sharedMemBytes; // TODO - this should be part of preLaunchKernel.
hStream = ihipPreLaunchKernel(
hStream, dim3(globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ),
hStream, dim3(globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ),
dim3(localWorkSizeX, localWorkSizeY, localWorkSizeZ), &lp, f->_name.c_str());
@@ -208,10 +220,20 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
aql.grid_size_x = globalWorkSizeX;
aql.grid_size_y = globalWorkSizeY;
aql.grid_size_z = globalWorkSizeZ;
aql.group_segment_size =
f->_header->workgroup_group_segment_byte_size + sharedMemBytes;
aql.private_segment_size =
f->_header->workitem_private_segment_byte_size;
bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos;
if (is_code_object_v3) {
const auto* header =
reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
aql.group_segment_size =
header->group_segment_fixed_size + sharedMemBytes;
aql.private_segment_size =
header->private_segment_fixed_size;
} else {
aql.group_segment_size =
f->_header->workgroup_group_segment_byte_size + sharedMemBytes;
aql.private_segment_size =
f->_header->workitem_private_segment_byte_size;
}
aql.kernel_object = f->_object;
aql.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
aql.header =
@@ -464,6 +486,12 @@ hipError_t ihipModuleGetFunction(hipFunction_t* func, hipModule_t hmod, const ch
auto kernel = find_kernel_by_name(hmod->executable, name, agent);
if (kernel.handle == 0u) {
std::string name_str(name);
name_str.append(".kd");
kernel = find_kernel_by_name(hmod->executable, name_str.c_str(), agent);
}
if (kernel.handle == 0u) return hipErrorNotFound;
// TODO: refactor the whole ihipThisThat, which is a mess and yields the
@@ -488,7 +516,11 @@ hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod,
}
namespace {
hipFuncAttributes make_function_attributes(const amd_kernel_code_t& header) {
const amd_kernel_code_v3_t *header_v3(const ihipModuleSymbol_t& kd) {
return reinterpret_cast<const amd_kernel_code_v3_t*>(kd._header);
}
hipFuncAttributes make_function_attributes(const ihipModuleSymbol_t& kd) {
hipFuncAttributes r{};
hipDeviceProp_t prop{};
@@ -497,16 +529,31 @@ hipFuncAttributes make_function_attributes(const amd_kernel_code_t& header) {
// available per CU, therefore we hardcode it to 64 KiRegisters.
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
r.localSizeBytes = header.workitem_private_segment_byte_size;
r.sharedSizeBytes = header.workgroup_group_segment_byte_size;
bool is_code_object_v3 = kd._name.find(".kd") != std::string::npos;
if (is_code_object_v3) {
r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
} else {
r.localSizeBytes = kd._header->workitem_private_segment_byte_size;
r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size;
}
r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;
r.numRegs = header.workitem_vgpr_count;
if (is_code_object_v3) {
r.numRegs = ((header_v3(kd)->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
} else {
r.numRegs = kd._header->workitem_vgpr_count;
}
r.maxThreadsPerBlock = r.numRegs ?
std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
prop.maxThreadsPerBlock;
r.binaryVersion =
header.amd_machine_version_major * 10 +
header.amd_machine_version_minor;
if (is_code_object_v3) {
r.binaryVersion = 0; // FIXME: should it be the ISA version or code
// object format version?
} else {
r.binaryVersion =
kd._header->amd_machine_version_major * 10 +
kd._header->amd_machine_version_minor;
}
r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0.
return r;
@@ -522,11 +569,10 @@ hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func)
auto agent = this_agent();
auto kd = get_program_state().kernel_descriptor(reinterpret_cast<uintptr_t>(func), agent);
const auto header = kd->_header;
if (!header) throw runtime_error{"Ill-formed Kernel_descriptor."};
if (!kd->_header) throw runtime_error{"Ill-formed Kernel_descriptor."};
*attr = make_function_attributes(*header);
*attr = make_function_attributes(*kd);
return hipSuccess;
}
@@ -557,11 +603,9 @@ hipError_t ihipModuleLoadData(hipModule_t* module, const void* image) {
(*module)->executable = get_program_state().load_executable(
content.data(), content.size(), (*module)->executable,
this_agent());
istringstream elf{content};
ELFIO::elfio reader;
if (reader.load(elf)) {
program_state_impl::read_kernarg_metadata(reader, (*module)->kernargs);
}
std::vector<char> blob(content.cbegin(), content.cend());
program_state_impl::read_kernarg_metadata(blob, (*module)->kernargs);
// compute the hash of the code object
(*module)->hash = checksum(content.length(), content.data());
+159 -77
Просмотреть файл
@@ -17,6 +17,7 @@
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <hsa/hsa_ven_amd_loader.h>
#include <amd_comgr.h>
#include <link.h>
@@ -540,9 +541,13 @@ public:
std::call_once(functions[agent].first, [this](hsa_agent_t aa) {
for (auto&& function : get_function_names()) {
const auto it = get_kernels(aa).find(function.second);
auto it = get_kernels(aa).find(function.second);
if (it == get_kernels(aa).cend()) continue;
if (it == get_kernels(aa).cend()) {
it = get_kernels(aa).find(function.second + ".kd");
if (it == get_kernels(aa).cend())
continue;
}
for (auto&& kernel_symbol : it->second) {
functions[aa].second.emplace(
@@ -556,92 +561,172 @@ public:
}
static
std::size_t parse_args(
const std::string& metadata,
std::size_t f,
std::size_t l,
std::string metadata_to_string(const amd_comgr_metadata_node_t& md) {
std::string str;
size_t size;
if (amd_comgr_get_metadata_string(md, &size, NULL)
== AMD_COMGR_STATUS_SUCCESS) {
str.resize(size - 1);
amd_comgr_get_metadata_string(md, &size, &str[0]);
}
return str;
}
static
void parse_args(
const amd_comgr_metadata_node_t& args_md,
bool is_code_object_v3,
std::vector<std::pair<std::size_t, std::size_t>>& size_align) {
if (f == l) return f;
if (!size_align.empty()) return l;
size_t arg_count = 0;
if (amd_comgr_get_metadata_list_size(args_md, &arg_count)
!= AMD_COMGR_STATUS_SUCCESS)
return;
do {
static constexpr size_t size_sz{5};
f = metadata.find("Size:", f) + size_sz;
for (size_t i = 0; i < arg_count; ++i) {
amd_comgr_metadata_node_t arg_md;
if (l <= f) return f;
if (amd_comgr_index_list_metadata(args_md, i, &arg_md)
!= AMD_COMGR_STATUS_SUCCESS)
return;
auto size = std::strtoul(&metadata[f], nullptr, 10);
amd_comgr_metadata_node_t arg_size_md;
if (amd_comgr_metadata_lookup(arg_md,
is_code_object_v3 ? ".size" : "Size",
&arg_size_md)
!= AMD_COMGR_STATUS_SUCCESS)
return;
static constexpr size_t align_sz{6};
f = metadata.find("Align:", f) + align_sz;
size_t arg_size = std::stoul(metadata_to_string(arg_size_md));
char* l{};
auto align = std::strtoul(&metadata[f], &l, 10);
if (amd_comgr_destroy_metadata(arg_size_md)
!= AMD_COMGR_STATUS_SUCCESS)
return;
f += (l - &metadata[f]) + 1;
size_t arg_align;
size_align.emplace_back(size, align);
} while (true);
if (is_code_object_v3) {
amd_comgr_metadata_node_t arg_offset_md;
if (amd_comgr_metadata_lookup(arg_md, ".offset", &arg_offset_md)
!= AMD_COMGR_STATUS_SUCCESS)
return;
size_t arg_offset
= std::stoul(metadata_to_string(arg_offset_md));
if (amd_comgr_destroy_metadata(arg_offset_md)
!= AMD_COMGR_STATUS_SUCCESS)
return;
arg_align = 1;
while (arg_offset && (arg_offset & 1) == 0) {
arg_offset >>= 1;
arg_align <<= 1;
}
} else {
amd_comgr_metadata_node_t arg_align_md;
if (amd_comgr_metadata_lookup(arg_md, "Align", &arg_align_md)
!= AMD_COMGR_STATUS_SUCCESS)
return;
arg_align = std::stoul(metadata_to_string(arg_align_md));
if (amd_comgr_destroy_metadata(arg_align_md)
!= AMD_COMGR_STATUS_SUCCESS)
return;
}
size_align.emplace_back(arg_size, arg_align);
if (amd_comgr_destroy_metadata(arg_md)
!= AMD_COMGR_STATUS_SUCCESS)
return;
}
}
static
void read_kernarg_metadata(
ELFIO::elfio& reader,
const std::vector<char>& blob,
std::unordered_map<
std::string,
std::vector<std::pair<std::size_t, std::size_t>>>& kernargs) {
// TODO: this is inefficient.
auto it = find_section_if(reader, [](const ELFIO::section* x) {
return x->get_type() == SHT_NOTE;
});
amd_comgr_data_t dataIn;
amd_comgr_status_t status;
if (!it) return;
if (amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, &dataIn)
!= AMD_COMGR_STATUS_SUCCESS)
return;
const ELFIO::note_section_accessor acc{reader, it};
for (decltype(acc.get_notes_num()) i = 0; i != acc.get_notes_num(); ++i) {
ELFIO::Elf_Word type{};
std::string name{};
void* desc{};
ELFIO::Elf_Word desc_size{};
if (amd_comgr_set_data(dataIn, blob.size(), blob.data())
!= AMD_COMGR_STATUS_SUCCESS)
return;
acc.get_note(i, type, name, desc, desc_size);
amd_comgr_metadata_node_t metadata;
if (amd_comgr_get_data_metadata(dataIn, &metadata)
!= AMD_COMGR_STATUS_SUCCESS)
return;
if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA.
std::string tmp{
static_cast<char*>(desc), static_cast<char*>(desc) + desc_size};
auto dx = tmp.find("Kernels:");
if (dx == std::string::npos) continue;
static constexpr decltype(tmp.size()) kernels_sz{8};
dx += kernels_sz;
do {
dx = tmp.find("Name:", dx);
if (dx == std::string::npos) break;
static constexpr decltype(tmp.size()) name_sz{5};
dx = tmp.find_first_not_of(" '", dx + name_sz);
auto fn = tmp.substr(dx, tmp.find_first_of("'\n", dx) - dx);
dx += fn.size();
auto dx1 = tmp.find("CodeProps", dx);
dx = tmp.find("Args:", dx);
if (dx1 < dx) {
dx = dx1;
continue;
}
if (dx == std::string::npos) break;
static constexpr decltype(tmp.size()) args_sz{5};
dx = parse_args(tmp, dx + args_sz, dx1, kernargs[fn]);
} while (true);
bool is_code_object_v3 = false;
amd_comgr_metadata_node_t kernels_md;
if (amd_comgr_metadata_lookup(metadata, "Kernels", &kernels_md)
!= AMD_COMGR_STATUS_SUCCESS) {
if (amd_comgr_metadata_lookup(metadata,
"amdhsa.kernels",
&kernels_md)
!= AMD_COMGR_STATUS_SUCCESS)
return;
is_code_object_v3 = true;
}
size_t kernel_count = 0;
if (amd_comgr_get_metadata_list_size(kernels_md, &kernel_count)
!= AMD_COMGR_STATUS_SUCCESS)
return;
for (size_t i = 0; i < kernel_count; i++) {
amd_comgr_metadata_node_t kernel_md;
if (amd_comgr_index_list_metadata(kernels_md, i, &kernel_md)
!= AMD_COMGR_STATUS_SUCCESS)
continue;
amd_comgr_metadata_node_t name_md;
if (amd_comgr_metadata_lookup(kernel_md,
is_code_object_v3 ? ".name" : "Name",
&name_md)
!= AMD_COMGR_STATUS_SUCCESS)
continue;
std::string kernel_name_str = metadata_to_string(name_md);
if (amd_comgr_destroy_metadata(name_md)
!= AMD_COMGR_STATUS_SUCCESS)
continue;
if (is_code_object_v3)
kernel_name_str.append(".kd");
amd_comgr_metadata_node_t args_md;
if (amd_comgr_metadata_lookup(kernel_md,
is_code_object_v3 ? ".args" : "Args",
&args_md)
!= AMD_COMGR_STATUS_SUCCESS)
continue;
parse_args(args_md, is_code_object_v3, kernargs[kernel_name_str]);
if (amd_comgr_destroy_metadata(args_md) != AMD_COMGR_STATUS_SUCCESS
|| amd_comgr_destroy_metadata(kernel_md)
!= AMD_COMGR_STATUS_SUCCESS)
continue;
}
if (amd_comgr_destroy_metadata(kernels_md) != AMD_COMGR_STATUS_SUCCESS
|| amd_comgr_destroy_metadata(metadata) != AMD_COMGR_STATUS_SUCCESS)
return;
amd_comgr_release_data(dataIn);
}
const std::unordered_map<std::string,
@@ -651,13 +736,7 @@ public:
for (auto&& name_and_isa_blobs : get_code_object_blobs()) {
for (auto&& isa_blobs : name_and_isa_blobs.second) {
for (auto&& blob : isa_blobs.second) {
std::stringstream tmp{std::string{blob.cbegin(), blob.cend()}};
ELFIO::elfio reader;
if (!reader.load(tmp)) continue;
read_kernarg_metadata(reader, kernargs.second);
read_kernarg_metadata(blob, kernargs.second);
}
}
}
@@ -711,8 +790,11 @@ public:
auto it1 = get_kernargs().find(it->second);
if (it1 == get_kernargs().end()) {
hip_throw(std::runtime_error{
"Missing metadata for __global__ function: " + it->second});
it1 = get_kernargs().find(it->second + ".kd");
if (it1 == get_kernargs().end()) {
hip_throw(std::runtime_error{
"Missing metadata for __global__ function: " + it->second});
}
}
return it1->second;
+36
Просмотреть файл
@@ -0,0 +1,36 @@
// RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args
/*
Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// CHECK: #include <hip/hip_runtime.h>
#include <cuda.h>
// CHECK-NOT: #include <texture_fetch_functions.h>
#include <texture_fetch_functions.h>
// CHECK: extern texture<float, 2, hipReadModeElementType> tex;
extern texture<float, 2, cudaReadModeElementType> tex;
extern "C" __global__ void tex2dKernel(float* outputData, int width, int height) {
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
outputData[y * width + x] = tex2D(tex, x, y);
}
+1 -1
Просмотреть файл
@@ -21,7 +21,7 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc
* TEST: %t
* TEST: %t EXCLUDE_HIP_PLATFORM hcc
* HIT_END
*/
#include <test_common.h>
+1 -1
Просмотреть файл
@@ -21,7 +21,7 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc
* TEST: %t
* TEST: %t EXCLUDE_HIP_PLATFORM hcc
* HIT_END
*/
+1 -1
Просмотреть файл
@@ -21,7 +21,7 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc
* TEST: %t
* TEST: %t EXCLUDE_HIP_PLATFORM hcc
* HIT_END
*/
Двоичный файл не отображается.
+2
Просмотреть файл
@@ -178,7 +178,9 @@ void runTests(int64_t numElements) {
{
test(0x01, C_d, C_h, numElements, syncNone, true /*expectMismatch*/);
test(0x02, C_d, C_h, numElements, syncNullStream, false /*expectMismatch*/);
#ifndef __HIP_CLANG_ONLY__
test(0x04, C_d, C_h, numElements, syncOtherStream, true /*expectMismatch*/);
#endif
test(0x08, C_d, C_h, numElements, syncDevice, false /*expectMismatch*/);
// Sending a marker to to null stream may synchronize the otherStream