diff --git a/hipamd/CMakeLists.txt b/hipamd/CMakeLists.txt index ba855ad86b..8701fe5635 100644 --- a/hipamd/CMakeLists.txt +++ b/hipamd/CMakeLists.txt @@ -271,6 +271,21 @@ if(HIP_PLATFORM STREQUAL "hcc") target_link_libraries(hiprtc PUBLIC stdc++fs) endif() + + if(HIP_PLATFORM STREQUAL "hcc") + find_package(amd_comgr REQUIRED CONFIG + PATHS + /opt/rocm/ + PATH_SUFFIXES + cmake/amd_comgr + lib/cmake/amd_comgr + ) + MESSAGE(STATUS "Code Object Manager found at ${amd_comgr_DIR}.") + endif() + + target_link_libraries(hip_hcc PRIVATE amd_comgr) + target_link_libraries(hip_hcc_static PRIVATE amd_comgr) + string(REPLACE " " ";" HCC_CXX_FLAGS_LIST ${HCC_CXX_FLAGS}) foreach(TARGET hip_hcc hip_hcc_static) target_include_directories(${TARGET} SYSTEM INTERFACE $/include>;${HSA_PATH}/include) diff --git a/hipamd/bin/hipcc b/hipamd/bin/hipcc index fadc74c137..58c8fe45c9 100755 --- a/hipamd/bin/hipcc +++ b/hipamd/bin/hipcc @@ -107,24 +107,25 @@ $HIP_RUNTIME= $hipConfig{'HIP_RUNTIME'}; # If using VDI runtime, need to find HIP_VDI_HOME if ($HIP_RUNTIME eq "VDI" and !defined $HIP_VDI_HOME) { - $HIP_VDI_HOME = "/opt/rocm/hip" + my $hipcc_dir = dirname($0); + if (-e "$hipcc_dir/../lib/bitcode") { + $HIP_VDI_HOME = abs_path($hipcc_dir . "/.."); + } else { + $HIP_VDI_HOME = "/opt/rocm/hip"; + } } if (defined $HIP_VDI_HOME) { - my $bits = ""; - if (-d "$HIP_VDI_HOME/bin/x86_64") { - $bits = "/x86_64"; + if (!defined $HIP_CLANG_PATH and (-e "$HIP_VDI_HOME/bin/clang" or -e "$HIP_VDI_HOME/bin/clang.exe")) { + $HIP_CLANG_PATH = "$HIP_VDI_HOME/bin"; + $HIP_CLANG_INCLUDE_PATH = "$HIP_VDI_HOME/include/clang"; } - if (!defined $HIP_CLANG_PATH) { - $HIP_CLANG_PATH = "$HIP_VDI_HOME/bin" . $bits; + if (!defined $DEVICE_LIB_PATH and -e "$HIP_VDI_HOME/lib/bitcode") { + $DEVICE_LIB_PATH = "$HIP_VDI_HOME/lib/bitcode"; } - if (!defined $DEVICE_LIB_PATH) { - $DEVICE_LIB_PATH = "$HIP_VDI_HOME/lib" . $bits . "/bitcode"; - } - $HIP_CLANG_INCLUDE_PATH = "$HIP_VDI_HOME/include/clang"; $HIP_INCLUDE_PATH = "$HIP_VDI_HOME/include"; if (!defined $HIP_LIB_PATH) { - $HIP_LIB_PATH = "$HIP_VDI_HOME/lib" . $bits; + $HIP_LIB_PATH = "$HIP_VDI_HOME/lib"; } } @@ -169,7 +170,7 @@ if ($HIP_PLATFORM eq "clang") { $HIP_CLANG_VERSION=$1; if (! defined $HIP_CLANG_INCLUDE_PATH) { - $HIP_CLANG_INCLUDE_PATH = "$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/include"; + $HIP_CLANG_INCLUDE_PATH = abs_path("$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/include"); } if (! defined $HIP_INCLUDE_PATH) { $HIP_INCLUDE_PATH = "$HIP_PATH/include"; @@ -370,7 +371,7 @@ if($HIP_PLATFORM eq "nvcc"){ my $toolArgs = ""; # arguments to pass to the hcc or nvcc tool my $optArg = ""; # -O args -my $rdc = 0; +my $gArg = ""; # -g args foreach $arg (@ARGV) { @@ -466,13 +467,9 @@ foreach $arg (@ARGV) { $optArg = $arg; } - if($arg =~ /-fgpu-rdc/) + if($arg =~ m/^-g/) { - $rdc = 1; - } - if($arg =~ /-fno-gpu-rdc/) - { - $rdc = 0; + $gArg = $arg; } ## process linker response file for hip-clang @@ -799,16 +796,12 @@ if ($needHipHcc) { if ($HIP_PLATFORM eq "clang") { # Set default optimization level to -O3 for hip-clang. - if ($optArg eq "") { + if ($optArg eq "" and $gArg ne "-g") { $HIPCXXFLAGS .= " -O3"; $HIPLDFLAGS .= " -O3"; } $HIP_DEVLIB_FLAGS = " --hip-device-lib-path=$DEVICE_LIB_PATH"; - if ($rdc eq 0) { - $HIPCXXFLAGS .= " $HIP_DEVLIB_FLAGS"; - } else { - $HIPLDFLAGS .= " $HIP_DEVLIB_FLAGS"; - } + $HIPCXXFLAGS .= " $HIP_DEVLIB_FLAGS"; if ($isWindows) { $HIPCXXFLAGS .= " -std=c++14 -fms-extensions -fms-compatibility"; } else { @@ -848,7 +841,21 @@ if ($runCmd) { print ("HIP ($HIP_PATH) was built using hcc $hipConfig{'HCC_VERSION'}, but you are using $HCC_HOME/hcc with version $HCC_VERSION from hipcc. Please rebuild HIP including cmake or update HCC_HOME variable.\n") ; die unless $ENV{'HIP_IGNORE_HCC_VERSION'}; } - system ("$CMD") or delete_temp_dirs () and die (); + system ("$CMD"); + if ($? == -1) { + print "failed to execute: $!\n"; + exit($?); + } + elsif ($? & 127) { + printf "child died with signal %d, %s coredump\n", + ($? & 127), ($? & 128) ? 'with' : 'without'; + exit($?); + } + else { + $CMD_EXIT_CODE = $? >> 8; + } + $? or delete_temp_dirs (); + exit($CMD_EXIT_CODE); } # vim: ts=4:sw=4:expandtab:smartindent diff --git a/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md b/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md index aeb834c2ec..7c2d2c6631 100644 --- a/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md +++ b/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md @@ -861,7 +861,7 @@ | `cuMemAllocManaged` | | | `cuMemAllocPitch` | | | `cuMemcpy` | | -| `cuMemcpy2D` | | +| `cuMemcpy2D` | `hipMemcpyParam2D` | | `cuMemcpy2DAsync` | | | `cuMemcpy2DUnaligned` | | | `cuMemcpy3D` | | diff --git a/hipamd/docs/markdown/CUDNN_API_supported_by_HIP.md b/hipamd/docs/markdown/CUDNN_API_supported_by_HIP.md index 2a3fa1a67a..3b59de4195 100644 --- a/hipamd/docs/markdown/CUDNN_API_supported_by_HIP.md +++ b/hipamd/docs/markdown/CUDNN_API_supported_by_HIP.md @@ -246,6 +246,99 @@ | enum |***`cudnnWgradMode_t`*** | | | 0 |*`CUDNN_WGRAD_MODE_ADD`* | | | 1 |*`CUDNN_WGRAD_MODE_SET`* | | +| enum |***`cudnnReorderType_t`*** | | +| 0 |*`CUDNN_DEFAULT_REORDER`* | | +| 1 |*`CUDNN_NO_REORDER`* | | +| enum |***`cudnnLossNormalizationMode_t`*** | | +| 0 |*`CUDNN_LOSS_NORMALIZATION_NONE`* | | +| 1 |*`CUDNN_LOSS_NORMALIZATION_SOFTMAX`* | | +| struct |`cudnnFusedOpsConstParamStruct` | | +| struct* |`cudnnFusedOpsConstParamPack_t` | | +| struct |`cudnnFusedOpsVariantParamStruct` | | +| struct* |`cudnnFusedOpsVariantParamPack_t` | | +| struct |`cudnnFusedOpsPlanStruct` | | +| struct* |`cudnnFusedOpsPlan_t` | | +| enum |***`cudnnFusedOps_t`*** | | +| 0 |*`CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS`* | | +| 1 |*`CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD`* | | +| 2 |*`CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING`* | | +| 3 |*`CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE`* | | +| 4 |*`CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION`* | | +| 5 |*`CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK`* | | +| 6 |*`CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM`* | | +| enum |***`cudnnFusedOpsConstParamLabel_t`*** | | +| 0 |*`CUDNN_PARAM_XDESC`* | | +| 1 |*`CUDNN_PARAM_XDATA_PLACEHOLDER`* | | +| 2 |*`CUDNN_PARAM_BN_MODE`* | | +| 3 |*`CUDNN_PARAM_BN_EQSCALEBIAS_DESC`* | | +| 4 |*`CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER`* | | +| 5 |*`CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER`* | | +| 6 |*`CUDNN_PARAM_ACTIVATION_DESC`* | | +| 7 |*`CUDNN_PARAM_CONV_DESC`* | | +| 8 |*`CUDNN_PARAM_WDESC`* | | +| 9 |*`CUDNN_PARAM_WDATA_PLACEHOLDER`* | | +| 10 |*`CUDNN_PARAM_DWDESC`* | | +| 11 |*`CUDNN_PARAM_DWDATA_PLACEHOLDER`* | | +| 12 |*`CUDNN_PARAM_YDESC`* | | +| 13 |*`CUDNN_PARAM_YDATA_PLACEHOLDER`* | | +| 14 |*`CUDNN_PARAM_DYDESC`* | | +| 15 |*`CUDNN_PARAM_DYDATA_PLACEHOLDER`* | | +| 16 |*`CUDNN_PARAM_YSTATS_DESC`* | | +| 17 |*`CUDNN_PARAM_YSUM_PLACEHOLDER`* | | +| 18 |*`CUDNN_PARAM_YSQSUM_PLACEHOLDER`* | | +| 19 |*`CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC`* | | +| 20 |*`CUDNN_PARAM_BN_SCALE_PLACEHOLDER`* | | +| 21 |*`CUDNN_PARAM_BN_BIAS_PLACEHOLDER`* | | +| 22 |*`CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER`* | | +| 23 |*`CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER`* | | +| 24 |*`CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER`* | | +| 25 |*`CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER`* | | +| 26 |*`CUDNN_PARAM_ZDESC`* | | +| 27 |*`CUDNN_PARAM_ZDATA_PLACEHOLDER`* | | +| 28 |*`CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC`* | | +| 29 |*`CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER`* | | +| 30 |*`CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER`* | | +| 31 |*`CUDNN_PARAM_ACTIVATION_BITMASK_DESC`* | | +| 32 |*`CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER`* | | +| 33 |*`CUDNN_PARAM_DXDESC`* | | +| 34 |*`CUDNN_PARAM_DXDATA_PLACEHOLDER`* | | +| 35 |*`CUDNN_PARAM_DZDESC`* | | +| 36 |*`CUDNN_PARAM_DZDATA_PLACEHOLDER`* | | +| 37 |*`CUDNN_PARAM_BN_DSCALE_PLACEHOLDER`* | | +| 38 |*`CUDNN_PARAM_BN_DBIAS_PLACEHOLDER`* | | +| enum |***`cudnnFusedOpsPointerPlaceHolder_t`*** | | +| 0 |*`CUDNN_PTR_NULL`* | | +| 1 |*`CUDNN_PTR_ELEM_ALIGNED`* | | +| 2 |*`CUDNN_PTR_16B_ALIGNED`* | | +| enum |***`cudnnFusedOpsVariantParamLabel_t`*** | | +| 0 |*`CUDNN_PTR_XDATA`* | | +| 1 |*`CUDNN_PTR_BN_EQSCALE`* | | +| 2 |*`CUDNN_PTR_BN_EQBIAS`* | | +| 3 |*`CUDNN_PTR_WDATA`* | | +| 4 |*`CUDNN_PTR_DWDATA`* | | +| 5 |*`CUDNN_PTR_YDATA`* | | +| 6 |*`CUDNN_PTR_DYDATA`* | | +| 7 |*`CUDNN_PTR_YSUM`* | | +| 8 |*`CUDNN_PTR_YSQSUM`* | | +| 9 |*`CUDNN_PTR_WORKSPACE`* | | +| 10 |*`CUDNN_PTR_BN_SCALE`* | | +| 11 |*`CUDNN_PTR_BN_BIAS`* | | +| 12 |*`CUDNN_PTR_BN_SAVED_MEAN`* | | +| 13 |*`CUDNN_PTR_BN_SAVED_INVSTD`* | | +| 14 |*`CUDNN_PTR_BN_RUNNING_MEAN`* | | +| 15 |*`CUDNN_PTR_BN_RUNNING_VAR`* | | +| 16 |*`CUDNN_PTR_ZDATA`* | | +| 17 |*`CUDNN_PTR_BN_Z_EQSCALE`* | | +| 18 |*`CUDNN_PTR_BN_Z_EQBIAS`* | | +| 19 |*`CUDNN_PTR_ACTIVATION_BITMASK`* | | +| 20 |*`CUDNN_PTR_DXDATA`* | | +| 21 |*`CUDNN_PTR_DZDATA`* | | +| 22 |*`CUDNN_PTR_BN_DSCALE`* | | +| 23 |*`CUDNN_PTR_BN_DBIAS`* | | +| 100 |*`CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES`* | | +| 101 |*`CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT`* | | +| 102 |*`CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR`* | | +| 103 |*`CUDNN_SCALAR_DOUBLE_BN_EPSILON`* | | ## **2. CUDNN API functions** @@ -282,6 +375,7 @@ |`cudnnGetOpTensorDescriptor` |`hipdnnGetOpTensorDescriptor` | |`cudnnDestroyOpTensorDescriptor` |`hipdnnDestroyOpTensorDescriptor` | |`cudnnOpTensor` |`hipdnnOpTensor` | +|`cudnnGetFoldedConvBackwardDataDescriptors` | | |`cudnnCreateReduceTensorDescriptor` |`hipdnnCreateReduceTensorDescriptor` | |`cudnnSetReduceTensorDescriptor` |`hipdnnSetReduceTensorDescriptor` | |`cudnnGetReduceTensorDescriptor` |`hipdnnGetReduceTensorDescriptor` | @@ -296,12 +390,17 @@ |`cudnnGetFilter4dDescriptor` |`hipdnnGetFilter4dDescriptor` | |`cudnnSetFilterNdDescriptor` |`hipdnnSetFilterNdDescriptor` | |`cudnnGetFilterNdDescriptor` |`hipdnnGetFilterNdDescriptor` | +|`cudnnGetFilterSizeInBytes` | | +|`cudnnTransformFilter` | | |`cudnnDestroyFilterDescriptor` |`hipdnnDestroyFilterDescriptor` | +|`cudnnReorderFilterAndBias` | | |`cudnnCreateConvolutionDescriptor` |`hipdnnCreateConvolutionDescriptor` | |`cudnnSetConvolutionMathType` |`hipdnnSetConvolutionMathType` | |`cudnnGetConvolutionMathType` | | |`cudnnSetConvolutionGroupCount` |`hipdnnSetConvolutionGroupCount` | |`cudnnGetConvolutionGroupCount` | | +|`cudnnSetConvolutionReorderType` | | +|`cudnnGetConvolutionReorderType` | | |`cudnnSetConvolution2dDescriptor` |`hipdnnSetConvolution2dDescriptor` | |`cudnnGetConvolution2dDescriptor` |`hipdnnGetConvolution2dDescriptor` | |`cudnnGetConvolution2dForwardOutputDim` |`hipdnnGetConvolution2dForwardOutputDim` | @@ -424,7 +523,9 @@ |`cudnnGetRNNBiasMode` | | |`cudnnCreateCTCLossDescriptor` | | |`cudnnSetCTCLossDescriptor` | | +|`cudnnSetCTCLossDescriptorEx` | | |`cudnnGetCTCLossDescriptor` | | +|`cudnnGetCTCLossDescriptorEx` | | |`cudnnDestroyCTCLossDescriptor` | | |`cudnnCTCLoss` | | |`cudnnGetCTCLossWorkspaceSize` | | @@ -462,3 +563,15 @@ |`cudnnMultiHeadAttnForward` | | |`cudnnMultiHeadAttnBackwardData` | | |`cudnnMultiHeadAttnBackwardWeights` | | +|`cudnnCreateFusedOpsConstParamPack` | | +|`cudnnDestroyFusedOpsConstParamPack` | | +|`cudnnSetFusedOpsConstParamPackAttribute` | | +|`cudnnGetFusedOpsConstParamPackAttribute` | | +|`cudnnCreateFusedOpsVariantParamPack` | | +|`cudnnDestroyFusedOpsVariantParamPack` | | +|`cudnnSetFusedOpsVariantParamPackAttribute` | | +|`cudnnGetFusedOpsVariantParamPackAttribute` | | +|`cudnnCreateFusedOpsPlan` | | +|`cudnnDestroyFusedOpsPlan` | | +|`cudnnMakeFusedOpsPlan` | | +|`cudnnFusedOpsExecute` | | diff --git a/hipamd/docs/markdown/hip_kernel_language.md b/hipamd/docs/markdown/hip_kernel_language.md index d69f5a04a8..5479813675 100644 --- a/hipamd/docs/markdown/hip_kernel_language.md +++ b/hipamd/docs/markdown/hip_kernel_language.md @@ -35,6 +35,9 @@ - [Warp Cross-Lane Functions](#warp-cross-lane-functions) * [Warp Vote and Ballot Functions](#warp-vote-and-ballot-functions) * [Warp Shuffle Functions](#warp-shuffle-functions) +- [Cooperative Groups Functions](#cooperative-groups-functions) +- [Warp Matrix Functions](#warp-matrix-functions) +- [Independent Thread Scheduling](#independent-thread-scheduling) - [Profiler Counter Function](#profiler-counter-function) - [Assert](#assert) - [Printf](#printf) @@ -599,6 +602,70 @@ float __shfl_xor (float var, int laneMask, int width=warpSize); ``` +## Cooperative Groups Functions + +Cooperative groups is a mechanism for forming and communicating between groups of threads at +a granularity different than the block. This feature was introduced in Cuda 9. + +HIP does not support any of the kernel language cooperative groups +types or functions. + + +| **Function** | **Supported in HIP** | **Supported in CUDA** | +| --- | --- | --- | +| `void thread_group.sync()` | | ✓ | +| `unsigned thread_group.size()` | | ✓ | +| `unsigned thread_group.thread_rank()` | | ✓ | +| `bool thread_group.is_valid()` | | ✓ | +| `thread_group tiled_partition(thread_group, size)` | | ✓ | +| `thread_block_tile tiled_partition(thread_group)` | | ✓ | +| `thread_block this_thread_block()` | | ✓ | +| `T thread_block_tile.shfl()` | | ✓ | +| `T thread_block_tile.shfl_down()` | | ✓ | +| `T thread_block_tile.shfl_up()` | | ✓ | +| `T thread_block_tile.shfl_xor()` | | ✓ | +| `T thread_block_tile.any()` | | ✓ | +| `T thread_block_tile.all()` | | ✓ | +| `T thread_block_tile.ballot()` | | ✓ | +| `T thread_block_tile.match_any()` | | ✓ | +| `T thread_block_tile.match_all()` | | ✓ | +| `coalesced_group coalesced_threads()` | | ✓ | +| `grid_group this_grid()` | | ✓ | +| `void grid_group.sync()` | | ✓ | +| `unsigned grid_group.size()` | | ✓ | +| `unsigned grid_group.thread_rank()` | | ✓ | +| `bool grid_group.is_valid()` | | ✓ | +| `multi_grid_group this_multi_grid()` | | ✓ | +| `void multi_grid_group.sync()` | | ✓ | +| `unsigned multi_grid_group.size()` | | ✓ | +| `unsigned multi_grid_group.thread_rank()` | | ✓ | +| `bool multi_grid_group.is_valid()` | | ✓ | + +## Warp Matrix Functions + +Warp matrix functions allow a warp to cooperatively operate on small matrices +whose elements are spread over the lanes in an unspecified manner. This feature +was introduced in Cuda 9. + +HIP does not support any of the kernel language warp matrix +types or functions. + +| **Function** | **Supported in HIP** | **Supported in CUDA** | +| --- | --- | --- | +| `void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned lda)` | | ✓ | +| `void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned lda, layout_t layout)` | | ✓ | +| `void store_matrix_sync(T* mptr, fragment<...> &a, unsigned lda, layout_t layout)` | | ✓ | +| `void fill_fragment(fragment<...> &a, const T &value)` | | ✓ | +| `void mma_sync(fragment<...> &d, const fragment<...> &a, const fragment<...> &b, const fragment<...> &c , bool sat)` | | ✓ | + +## Independent Thread Scheduling + +The hardware support for independent thread scheduling introduced in certain architectures +supporting Cuda allows threads to progress independently of each other and enables +intra-warp synchronizations that were previously not allowed. + +HIP does not support this type of scheduling. + ## Profiler Counter Function The Cuda `__prof_trigger()` instruction is not supported. diff --git a/hipamd/hipify-clang/README.md b/hipamd/hipify-clang/README.md index 9ec0a7b9e7..7744085f16 100644 --- a/hipamd/hipify-clang/README.md +++ b/hipamd/hipify-clang/README.md @@ -142,9 +142,9 @@ To run it: * Path to cuDNN should be specified by the `CUDA_DNN_ROOT_DIR` option: - - Linux: `-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.0-v7.5.1.10` + - Linux: `-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.0-v7.6.0.64` - - Windows: `-DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-9.0-windows10-x64-v7.5.1.10` + - Windows: `-DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-9.0-windows10-x64-v7.6.0.64` 5. Ensure [`python`](https://www.python.org/downloads) of minimum required version 2.7 is installed. @@ -178,9 +178,9 @@ To run it: On Linux the following configurations are tested: -Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.5.1.10 +Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.0.64 -Ubuntu 16-18: LLVM 8.0.0, CUDA 8.0 - 10.0, cudnn-5.1.10 - cudnn-7.5.1.10 +Ubuntu 16-18: LLVM 8.0.0, CUDA 8.0 - 10.0, cudnn-5.1.10 - cudnn-7.6.0.64 Build system for the above configurations: @@ -195,7 +195,7 @@ cmake -DCMAKE_INSTALL_PREFIX=../dist \ -DCMAKE_PREFIX_PATH=/srv/git/LLVM/8.0.0/dist \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.0 \ - -DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.0-v7.5.1.10 \ + -DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.0-v7.6.0.64 \ -DLLVM_EXTERNAL_LIT=/srv/git/LLVM/8.0.0/build/bin/llvm-lit \ .. ``` @@ -311,9 +311,9 @@ On Windows 10 the following configurations are tested: LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn-5.1.10 - cudnn-7.1.4.18 -LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn-7.0.5.15 - cudnn-7.5.1.10 +LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn-7.0.5.15 - cudnn-7.6.0.64 -LLVM 7.0.0 - 8.0.0 (with patch*), CUDA 7.5 - 10.0, cudnn-7.0.5.15 - cudnn-7.5.1.10 +LLVM 7.0.0 - 8.0.0 (with patch*), CUDA 7.5 - 10.0, cudnn-7.0.5.15 - cudnn-7.6.0.64 Build system for the above configurations: @@ -330,7 +330,7 @@ cmake -DCMAKE_PREFIX_PATH=f:/LLVM/6.0.1/dist \ -DCUDA_TOOLKIT_ROOT_DIR="c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0" \ -DCUDA_SDK_ROOT_DIR="c:/ProgramData/NVIDIA Corporation/CUDA Samples/v9.0" \ - -DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-9.0-windows10-x64-v7.5.1.10 \ + -DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-9.0-windows10-x64-v7.6.0.64 \ -DLLVM_EXTERNAL_LIT=f:/LLVM/6.0.1/build/Release/bin/llvm-lit.py \ -Thost=x64 .. diff --git a/hipamd/hipify-clang/src/CUDA2HIP.cpp b/hipamd/hipify-clang/src/CUDA2HIP.cpp index 9a26a051ca..1e530745e6 100644 --- a/hipamd/hipify-clang/src/CUDA2HIP.cpp +++ b/hipamd/hipify-clang/src/CUDA2HIP.cpp @@ -33,6 +33,7 @@ const std::map CUDA_INCLUDE_MAP{ {"driver_types.h", {"hip/driver_types.h", "", CONV_INCLUDE, API_RUNTIME}}, {"cuda_fp16.h", {"hip/hip_fp16.h", "", CONV_INCLUDE, API_RUNTIME}}, {"cuda_texture_types.h", {"hip/hip_texture_types.h", "", CONV_INCLUDE, API_RUNTIME}}, + {"texture_fetch_functions.h", {"", "", CONV_INCLUDE, API_RUNTIME}}, {"vector_types.h", {"hip/hip_vector_types.h", "", CONV_INCLUDE, API_RUNTIME}}, {"cuda_profiler_api.h", {"hip/hip_profile.h", "", CONV_INCLUDE, API_RUNTIME}}, // cuComplex includes diff --git a/hipamd/hipify-clang/src/CUDA2HIP_DNN_API_functions.cpp b/hipamd/hipify-clang/src/CUDA2HIP_DNN_API_functions.cpp index a52c392b72..765ce78a26 100644 --- a/hipamd/hipify-clang/src/CUDA2HIP_DNN_API_functions.cpp +++ b/hipamd/hipify-clang/src/CUDA2HIP_DNN_API_functions.cpp @@ -61,6 +61,7 @@ const std::map CUDA_DNN_FUNCTION_MAP{ {"cudnnGetOpTensorDescriptor", {"hipdnnGetOpTensorDescriptor", "", CONV_LIB_FUNC, API_DNN}}, {"cudnnDestroyOpTensorDescriptor", {"hipdnnDestroyOpTensorDescriptor", "", CONV_LIB_FUNC, API_DNN}}, {"cudnnOpTensor", {"hipdnnOpTensor", "", CONV_LIB_FUNC, API_DNN}}, + {"cudnnGetFoldedConvBackwardDataDescriptors", {"hipdnnGetFoldedConvBackwardDataDescriptors", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, // cuDNN Reduce Tensor functions {"cudnnCreateReduceTensorDescriptor", {"hipdnnCreateReduceTensorDescriptor", "", CONV_LIB_FUNC, API_DNN}}, @@ -79,7 +80,10 @@ const std::map CUDA_DNN_FUNCTION_MAP{ {"cudnnGetFilter4dDescriptor", {"hipdnnGetFilter4dDescriptor", "", CONV_LIB_FUNC, API_DNN}}, {"cudnnSetFilterNdDescriptor", {"hipdnnSetFilterNdDescriptor", "", CONV_LIB_FUNC, API_DNN}}, {"cudnnGetFilterNdDescriptor", {"hipdnnGetFilterNdDescriptor", "", CONV_LIB_FUNC, API_DNN}}, + {"cudnnGetFilterSizeInBytes", {"hipdnnGetFilterSizeInBytes", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnTransformFilter", {"hipdnnTransformFilter", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, {"cudnnDestroyFilterDescriptor", {"hipdnnDestroyFilterDescriptor", "", CONV_LIB_FUNC, API_DNN}}, + {"cudnnReorderFilterAndBias", {"hipdnnReorderFilterAndBias", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, // cuDNN Convolution functions {"cudnnCreateConvolutionDescriptor", {"hipdnnCreateConvolutionDescriptor", "", CONV_LIB_FUNC, API_DNN}}, @@ -87,6 +91,8 @@ const std::map CUDA_DNN_FUNCTION_MAP{ {"cudnnGetConvolutionMathType", {"hipdnnGetConvolutionMathType", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, {"cudnnSetConvolutionGroupCount", {"hipdnnSetConvolutionGroupCount", "", CONV_LIB_FUNC, API_DNN}}, {"cudnnGetConvolutionGroupCount", {"hipdnnGetConvolutionGroupCount", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSetConvolutionReorderType", {"hipdnnSetConvolutionReorderType", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetConvolutionReorderType", {"hipdnnGetConvolutionReorderType", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, {"cudnnSetConvolution2dDescriptor", {"hipdnnSetConvolution2dDescriptor", "", CONV_LIB_FUNC, API_DNN}}, {"cudnnGetConvolution2dDescriptor", {"hipdnnGetConvolution2dDescriptor", "", CONV_LIB_FUNC, API_DNN}}, {"cudnnGetConvolution2dForwardOutputDim", {"hipdnnGetConvolution2dForwardOutputDim", "", CONV_LIB_FUNC, API_DNN}}, @@ -235,7 +241,9 @@ const std::map CUDA_DNN_FUNCTION_MAP{ // cuDNN Connectionist Temporal Classification loss functions {"cudnnCreateCTCLossDescriptor", {"hipdnnCreateCTCLossDescriptor", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, {"cudnnSetCTCLossDescriptor", {"hipdnnSetCTCLossDescriptor", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSetCTCLossDescriptorEx", {"hipdnnSetCTCLossDescriptorEx", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, {"cudnnGetCTCLossDescriptor", {"hipdnnGetCTCLossDescriptor", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetCTCLossDescriptorEx", {"hipdnnGetCTCLossDescriptorEx", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, {"cudnnDestroyCTCLossDescriptor", {"hipdnnDestroyCTCLossDescriptor", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, {"cudnnCTCLoss", {"hipdnnCTCLoss", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, {"cudnnGetCTCLossWorkspaceSize", {"hipdnnGetCTCLossWorkspaceSize", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, @@ -274,4 +282,18 @@ const std::map CUDA_DNN_FUNCTION_MAP{ {"cudnnMultiHeadAttnForward", {"hipdnnMultiHeadAttnForward", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, {"cudnnMultiHeadAttnBackwardData", {"hipdnnMultiHeadAttnBackwardData", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, {"cudnnMultiHeadAttnBackwardWeights", {"hipdnnMultiHeadAttnBackwardWeights", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, + + // cuDNN Fuse functions + {"cudnnCreateFusedOpsConstParamPack", {"hipdnnCreateFusedOpsConstParamPack", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnDestroyFusedOpsConstParamPack", {"hipdnnDestroyFusedOpsConstParamPack", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSetFusedOpsConstParamPackAttribute", {"hipdnnSetFusedOpsConstParamPackAttribute", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetFusedOpsConstParamPackAttribute", {"hipdnnGetFusedOpsConstParamPackAttribute", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnCreateFusedOpsVariantParamPack", {"hipdnnCreateFusedOpsVariantParamPack", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnDestroyFusedOpsVariantParamPack", {"hipdnnDestroyFusedOpsVariantParamPack", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSetFusedOpsVariantParamPackAttribute", {"hipdnnSetFusedOpsVariantParamPackAttribute", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetFusedOpsVariantParamPackAttribute", {"hipdnnGetFusedOpsVariantParamPackAttribute", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnCreateFusedOpsPlan", {"hipdnnCreateFusedOpsPlan", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnDestroyFusedOpsPlan", {"hipdnnDestroyFusedOpsPlan", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnMakeFusedOpsPlan", {"hipdnnMakeFusedOpsPlan", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnFusedOpsExecute", {"hipdnnFusedOpsExecute", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}}, }; diff --git a/hipamd/hipify-clang/src/CUDA2HIP_DNN_API_types.cpp b/hipamd/hipify-clang/src/CUDA2HIP_DNN_API_types.cpp index 349e243082..a7d277e4c2 100644 --- a/hipamd/hipify-clang/src/CUDA2HIP_DNN_API_types.cpp +++ b/hipamd/hipify-clang/src/CUDA2HIP_DNN_API_types.cpp @@ -240,6 +240,93 @@ const std::map CUDA_DNN_TYPE_NAME_MAP{ {"cudnnWgradMode_t", {"hipdnnWgradMode_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, {"CUDNN_WGRAD_MODE_ADD", {"HIPDNN_WGRAD_MODE_ADD", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0 {"CUDNN_WGRAD_MODE_SET", {"HIPDNN_WGRAD_MODE_SET", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1 + {"cudnnReorderType_t", {"hipdnnReorderType_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"CUDNN_DEFAULT_REORDER", {"HIPDNN_DEFAULT_REORDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0 + {"CUDNN_NO_REORDER", {"HIPDNN_NO_REORDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1 + {"cudnnLossNormalizationMode_t", {"hipdnnLossNormalizationMode_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"CUDNN_LOSS_NORMALIZATION_NONE", {"HIPDNN_LOSS_NORMALIZATION_NONE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0 + {"CUDNN_LOSS_NORMALIZATION_SOFTMAX", {"HIPDNN_LOSS_NORMALIZATION_SOFTMAX", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1 + {"cudnnFusedOps_t", {"hipdnnFusedOps_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS", {"HIPDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0 + {"CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD", {"HIPDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1 + {"CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING", {"HIPDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 2 + {"CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE", {"HIPDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 3 + {"CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION", {"HIPDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 4 + {"CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK", {"HIPDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 5 + {"CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM", {"HIPDNN_FUSED_DACTIVATION_FORK_DBATCHNORM", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 6 + {"cudnnFusedOpsConstParamLabel_t", {"hipdnnFusedOpsConstParamLabel_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"CUDNN_PARAM_XDESC", {"HIPDNN_PARAM_XDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0 + {"CUDNN_PARAM_XDATA_PLACEHOLDER", {"HIPDNN_PARAM_XDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1 + {"CUDNN_PARAM_BN_MODE", {"HIPDNN_PARAM_BN_MODE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 2 + {"CUDNN_PARAM_BN_EQSCALEBIAS_DESC", {"HIPDNN_PARAM_BN_EQSCALEBIAS_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 3 + {"CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER", {"HIPDNN_PARAM_BN_EQSCALE_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 4 + {"CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER", {"HIPDNN_PARAM_BN_EQBIAS_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 5 + {"CUDNN_PARAM_ACTIVATION_DESC", {"HIPDNN_PARAM_ACTIVATION_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 6 + {"CUDNN_PARAM_CONV_DESC", {"HIPDNN_PARAM_CONV_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 7 + {"CUDNN_PARAM_WDESC", {"HIPDNN_PARAM_WDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 8 + {"CUDNN_PARAM_WDATA_PLACEHOLDER", {"HIPDNN_PARAM_WDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 9 + {"CUDNN_PARAM_DWDESC", {"HIPDNN_PARAM_DWDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 10 + {"CUDNN_PARAM_DWDATA_PLACEHOLDER", {"HIPDNN_PARAM_DWDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 11 + {"CUDNN_PARAM_YDESC", {"HIPDNN_PARAM_YDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 12 + {"CUDNN_PARAM_YDATA_PLACEHOLDER", {"HIPDNN_PARAM_YDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 13 + {"CUDNN_PARAM_DYDESC", {"HIPDNN_PARAM_DYDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 14 + {"CUDNN_PARAM_DYDATA_PLACEHOLDER", {"HIPDNN_PARAM_DYDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 15 + {"CUDNN_PARAM_YSTATS_DESC", {"HIPDNN_PARAM_YSTATS_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 16 + {"CUDNN_PARAM_YSUM_PLACEHOLDER", {"HIPDNN_PARAM_YSUM_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 17 + {"CUDNN_PARAM_YSQSUM_PLACEHOLDER", {"HIPDNN_PARAM_YSQSUM_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 18 + {"CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC", {"HIPDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 19 + {"CUDNN_PARAM_BN_SCALE_PLACEHOLDER", {"HIPDNN_PARAM_BN_SCALE_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 20 + {"CUDNN_PARAM_BN_BIAS_PLACEHOLDER", {"HIPDNN_PARAM_BN_BIAS_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 21 + {"CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER", {"HIPDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 22 + {"CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER", {"HIPDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 23 + {"CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER", {"HIPDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 24 + {"CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER", {"HIPDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 25 + {"CUDNN_PARAM_ZDESC", {"HIPDNN_PARAM_ZDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 26 + {"CUDNN_PARAM_ZDATA_PLACEHOLDER", {"HIPDNN_PARAM_ZDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 27 + {"CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC", {"HIPDNN_PARAM_BN_Z_EQSCALEBIAS_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 28 + {"CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER", {"HIPDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 29 + {"CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER", {"HIPDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 30 + {"CUDNN_PARAM_ACTIVATION_BITMASK_DESC", {"HIPDNN_PARAM_ACTIVATION_BITMASK_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 31 + {"CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER", {"HIPDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 32 + {"CUDNN_PARAM_DXDESC", {"HIPDNN_PARAM_DXDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 33 + {"CUDNN_PARAM_DXDATA_PLACEHOLDER", {"HIPDNN_PARAM_DXDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 34 + {"CUDNN_PARAM_DZDESC", {"HIPDNN_PARAM_DZDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 35 + {"CUDNN_PARAM_DZDATA_PLACEHOLDER", {"HIPDNN_PARAM_DZDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 36 + {"CUDNN_PARAM_BN_DSCALE_PLACEHOLDER", {"HIPDNN_PARAM_BN_DSCALE_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 37 + {"CUDNN_PARAM_BN_DBIAS_PLACEHOLDER", {"HIPDNN_PARAM_BN_DBIAS_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 38 + {"cudnnFusedOpsPointerPlaceHolder_t", {"hipdnnActivationMode_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"CUDNN_PTR_NULL", {"HIPDNN_ACTIVATION_SIGMOID", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0 + {"CUDNN_PTR_ELEM_ALIGNED", {"HIPDNN_ACTIVATION_RELU", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1 + {"CUDNN_PTR_16B_ALIGNED", {"HIPDNN_ACTIVATION_TANH", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 2 + {"cudnnFusedOpsVariantParamLabel_t", {"hipdnnFusedOpsVariantParamLabel_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"CUDNN_PTR_XDATA", {"HIPDNN_PTR_XDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0 + {"CUDNN_PTR_BN_EQSCALE", {"HIPDNN_PTR_BN_EQSCALE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1 + {"CUDNN_PTR_BN_EQBIAS", {"HIPDNN_PTR_BN_EQBIAS", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 2 + {"CUDNN_PTR_WDATA", {"HIPDNN_PTR_WDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 3 + {"CUDNN_PTR_DWDATA", {"HIPDNN_PTR_DWDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 4 + {"CUDNN_PTR_YDATA", {"HIPDNN_PTR_YDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 5 + {"CUDNN_PTR_DYDATA", {"HIPDNN_PTR_DYDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 6 + {"CUDNN_PTR_YSUM", {"HIPDNN_PTR_YSUM", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 7 + {"CUDNN_PTR_YSQSUM", {"HIPDNN_PTR_YSQSUM", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 8 + {"CUDNN_PTR_WORKSPACE", {"HIPDNN_PTR_WORKSPACE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 9 + {"CUDNN_PTR_BN_SCALE", {"HIPDNN_PTR_BN_SCALE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 10 + {"CUDNN_PTR_BN_BIAS", {"HIPDNN_PTR_BN_BIAS", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 11 + {"CUDNN_PTR_BN_SAVED_MEAN", {"HIPDNN_PTR_BN_SAVED_MEAN", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 12 + {"CUDNN_PTR_BN_SAVED_INVSTD", {"HIPDNN_PTR_BN_SAVED_INVSTD", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 13 + {"CUDNN_PTR_BN_RUNNING_MEAN", {"HIPDNN_PTR_BN_RUNNING_MEAN", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 14 + {"CUDNN_PTR_BN_RUNNING_VAR", {"HIPDNN_PTR_BN_RUNNING_VAR", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 15 + {"CUDNN_PTR_ZDATA", {"HIPDNN_PTR_ZDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 16 + {"CUDNN_PTR_BN_Z_EQSCALE", {"HIPDNN_PTR_BN_Z_EQSCALE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 17 + {"CUDNN_PTR_BN_Z_EQBIAS", {"HIPDNN_PTR_BN_Z_EQBIAS", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 18 + {"CUDNN_PTR_ACTIVATION_BITMASK", {"HIPDNN_PTR_ACTIVATION_BITMASK", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 19 + {"CUDNN_PTR_DXDATA", {"HIPDNN_PTR_DXDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 20 + {"CUDNN_PTR_DZDATA", {"HIPDNN_PTR_DZDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 21 + {"CUDNN_PTR_BN_DSCALE", {"HIPDNN_PTR_BN_DSCALE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 22 + {"CUDNN_PTR_BN_DBIAS", {"HIPDNN_PTR_BN_DBIAS", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 23 + {"CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES", {"HIPDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 100 + {"CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT", {"HIPDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 101 + {"CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR", {"HIPDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 102 + {"CUDNN_SCALAR_DOUBLE_BN_EPSILON", {"HIPDNN_SCALAR_DOUBLE_BN_EPSILON", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 103 // cuDNN types {"cudnnContext", {"hipdnnContext", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, @@ -288,5 +375,10 @@ const std::map CUDA_DNN_TYPE_NAME_MAP{ {"cudnnSeqDataDescriptor_t", {"hipdnnSeqDataDescriptor_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, {"cudnnAttnStruct", {"hipdnnAttnStruct", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, {"cudnnAttnDescriptor_t", {"hipdnnAttnDescriptor_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, - + {"cudnnFusedOpsConstParamStruct", {"hipdnnFusedOpsConstParamStruct", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnFusedOpsConstParamPack_t", {"hipdnnFusedOpsConstParamPack_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnFusedOpsVariantParamStruct", {"hipdnnFusedOpsVariantParamStruct", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnFusedOpsVariantParamPack_t", {"hipdnnFusedOpsVariantParamPack_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnFusedOpsPlanStruct", {"hipdnnFusedOpsPlanStruct", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnFusedOpsPlan_t", {"hipdnnFusedOpsPlan_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, }; diff --git a/hipamd/hipify-clang/src/CUDA2HIP_Driver_API_functions.cpp b/hipamd/hipify-clang/src/CUDA2HIP_Driver_API_functions.cpp index b71e49710d..8be20774ea 100644 --- a/hipamd/hipify-clang/src/CUDA2HIP_Driver_API_functions.cpp +++ b/hipamd/hipify-clang/src/CUDA2HIP_Driver_API_functions.cpp @@ -175,8 +175,8 @@ const std::map CUDA_DRIVER_FUNCTION_MAP{ {"cuMemcpy", {"hipMemcpy_", "", CONV_MEMORY, API_DRIVER, HIP_UNSUPPORTED}}, // no analogue // NOTE: Not equal to cudaMemcpy2D due to different signatures - {"cuMemcpy2D", {"hipMemcpy2D_", "", CONV_MEMORY, API_DRIVER, HIP_UNSUPPORTED}}, - {"cuMemcpy2D_v2", {"hipMemcpy2D_", "", CONV_MEMORY, API_DRIVER, HIP_UNSUPPORTED}}, + {"cuMemcpy2D", {"hipMemcpyParam2D", "", CONV_MEMORY, API_DRIVER}}, + {"cuMemcpy2D_v2", {"hipMemcpyParam2D", "", CONV_MEMORY, API_DRIVER}}, // no analogue // NOTE: Not equal to cudaMemcpy2DAsync due to different signatures {"cuMemcpy2DAsync", {"hipMemcpy2DAsync_", "", CONV_MEMORY, API_DRIVER, HIP_UNSUPPORTED}}, diff --git a/hipamd/hipify-clang/src/HipifyAction.cpp b/hipamd/hipify-clang/src/HipifyAction.cpp index 5ea83c8376..241ca7ecae 100644 --- a/hipamd/hipify-clang/src/HipifyAction.cpp +++ b/hipamd/hipify-clang/src/HipifyAction.cpp @@ -189,6 +189,9 @@ bool HipifyAction::Exclude(const hipCounter & hipToken) { } return false; case CONV_INCLUDE: + if (hipToken.hipName.empty()) { + return true; + } switch (hipToken.apiType) { case API_RAND: if (hipToken.hipName == "hiprand_kernel.h") { diff --git a/hipamd/hipify-clang/src/LLVMCompat.cpp b/hipamd/hipify-clang/src/LLVMCompat.cpp index 87dedc630e..d2573ecf22 100644 --- a/hipamd/hipify-clang/src/LLVMCompat.cpp +++ b/hipamd/hipify-clang/src/LLVMCompat.cpp @@ -59,7 +59,11 @@ void EnterPreprocessorTokenStream(clang::Preprocessor& _pp, const clang::Token * #if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR == 8) _pp.EnterTokenStream(start, len, false, DisableMacroExpansion); #else - _pp.EnterTokenStream(clang::ArrayRef{start, len}, DisableMacroExpansion); + #if (LLVM_VERSION_MAJOR < 9) + _pp.EnterTokenStream(clang::ArrayRef{start, len}, DisableMacroExpansion); + #else + _pp.EnterTokenStream(clang::ArrayRef{start, len}, DisableMacroExpansion, false); + #endif #endif } diff --git a/hipamd/include/hip/hcc_detail/device_functions.h b/hipamd/include/hip/hcc_detail/device_functions.h index e1f81e4547..808ed216e4 100644 --- a/hipamd/include/hip/hcc_detail/device_functions.h +++ b/hipamd/include/hip/hcc_detail/device_functions.h @@ -1009,14 +1009,20 @@ void __syncthreads() #define GETREG_IMMED(SZ,OFF,REG) (SZ << 11) | (OFF << 6) | REG +/* + __smid returns the wave's assigned Compute Unit and Shader Engine. + The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4. + Note: the results vary over time. + SZ minus 1 since SIZE is 1-based. +*/ __device__ inline unsigned __smid(void) { unsigned cu_id = __builtin_amdgcn_s_getreg( - GETREG_IMMED(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID)); + GETREG_IMMED(HW_ID_CU_ID_SIZE-1, HW_ID_CU_ID_OFFSET, HW_ID)); unsigned se_id = __builtin_amdgcn_s_getreg( - GETREG_IMMED(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID)); + GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID)); /* Each shader engine has 16 CU */ return (se_id << HW_ID_CU_ID_SIZE) + cu_id; diff --git a/hipamd/include/hip/hcc_detail/driver_types.h b/hipamd/include/hip/hcc_detail/driver_types.h index 8e1fec11fa..5b2297114f 100644 --- a/hipamd/include/hip/hcc_detail/driver_types.h +++ b/hipamd/include/hip/hcc_detail/driver_types.h @@ -80,22 +80,22 @@ typedef struct hipArray { }hipArray; typedef struct hip_Memcpy2D { - size_t height; - size_t widthInBytes; - hipArray* dstArray; - hipDeviceptr_t dstDevice; - void* dstHost; - hipMemoryType dstMemoryType; - size_t dstPitch; - size_t dstXInBytes; - size_t dstY; - hipArray* srcArray; - hipDeviceptr_t srcDevice; - const void* srcHost; - hipMemoryType srcMemoryType; - size_t srcPitch; size_t srcXInBytes; size_t srcY; + hipMemoryType srcMemoryType; + const void* srcHost; + hipDeviceptr_t srcDevice; + hipArray* srcArray; + size_t srcPitch; + size_t dstXInBytes; + size_t dstY; + hipMemoryType dstMemoryType; + void* dstHost; + hipDeviceptr_t dstDevice; + hipArray* dstArray; + size_t dstPitch; + size_t WidthInBytes; + size_t Height; } hip_Memcpy2D; diff --git a/hipamd/include/hip/hcc_detail/hip_complex.h b/hipamd/include/hip/hcc_detail/hip_complex.h index 128e2d670b..75930c469e 100644 --- a/hipamd/include/hip/hcc_detail/hip_complex.h +++ b/hipamd/include/hip/hcc_detail/hip_complex.h @@ -120,51 +120,102 @@ THE SOFTWARE. ret.y = lhs.y * rhs; \ return ret; \ } -#define MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ComplexT, T) \ - explicit __device__ __host__ ComplexT(T val) : x(val), y(val) {} \ - __device__ __host__ ComplexT(T val1, T val2) : x(val1), y(val2) {} #endif -struct hipFloatComplex { -#ifdef __cplusplus - public: - typedef float value_type; - __device__ __host__ hipFloatComplex() : x(0.0f), y(0.0f) {} - explicit __device__ __host__ hipFloatComplex(float x) : x(x), y(0.0f) {} - __device__ __host__ hipFloatComplex(float x, float y) : x(x), y(y) {} - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, signed long long) -#endif - float x, y; -} __attribute__((aligned(8))); +typedef float2 hipFloatComplex; + +__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return z.x; } + +__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return z.y; } + +__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) { + hipFloatComplex z; + z.x = a; + z.y = b; + return z; +} + +__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) { + hipFloatComplex ret; + ret.x = z.x; + ret.y = -z.y; + return ret; +} + +__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) { + return z.x * z.x + z.y * z.y; +} + +__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) { + return make_hipFloatComplex(p.x + q.x, p.y + q.y); +} + +__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) { + return make_hipFloatComplex(p.x - q.x, p.y - q.y); +} + +__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) { + return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y); +} + +__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) { + float sqabs = hipCsqabsf(q); + hipFloatComplex ret; + ret.x = (p.x * q.x + p.y * q.y) / sqabs; + ret.y = (p.y * q.x - p.x * q.y) / sqabs; + return ret; +} + +__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); } + + +typedef double2 hipDoubleComplex; + +__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return z.x; } + +__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return z.y; } + +__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) { + hipDoubleComplex z; + z.x = a; + z.y = b; + return z; +} + +__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) { + hipDoubleComplex ret; + ret.x = z.x; + ret.y = z.y; + return ret; +} + +__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) { + return z.x * z.x + z.y * z.y; +} + +__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) { + return make_hipDoubleComplex(p.x + q.x, p.y + q.y); +} + +__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) { + return make_hipDoubleComplex(p.x - q.x, p.y - q.y); +} + +__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) { + return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y); +} + +__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) { + double sqabs = hipCsqabs(q); + hipDoubleComplex ret; + ret.x = (p.x * q.x + p.y * q.y) / sqabs; + ret.y = (p.y * q.x - p.x * q.y) / sqabs; + return ret; +} + +__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return sqrtf(hipCsqabs(z)); } -struct hipDoubleComplex { -#ifdef __cplusplus - public: - typedef double value_type; - __device__ __host__ hipDoubleComplex() : x(0.0f), y(0.0f) {} - explicit __device__ __host__ hipDoubleComplex(double x) : x(x), y(0.0f) {} - __device__ __host__ hipDoubleComplex(double x, double y) : x(x), y(y) {} - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed long long) -#endif - double x, y; -} __attribute__((aligned(16))); #if __cplusplus @@ -214,93 +265,6 @@ COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long) #endif -__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return z.x; } - -__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return z.y; } - -__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) { - hipFloatComplex z; - z.x = a; - z.y = b; - return z; -} - -__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) { - hipFloatComplex ret; - ret.x = z.x; - ret.y = -z.y; - return ret; -} - -__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) { - return z.x * z.x + z.y * z.y; -} - -__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) { - return make_hipFloatComplex(p.x + q.x, p.y + q.y); -} - -__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) { - return make_hipFloatComplex(p.x - q.x, p.y - q.y); -} - -__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) { - return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y); -} - -__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) { - float sqabs = hipCsqabsf(q); - hipFloatComplex ret; - ret.x = (p.x * q.x + p.y * q.y) / sqabs; - ret.y = (p.y * q.x - p.x * q.y) / sqabs; - return ret; -} - -__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); } - -__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return z.x; } - -__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return z.y; } - -__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) { - hipDoubleComplex z; - z.x = a; - z.y = b; - return z; -} - -__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) { - hipDoubleComplex ret; - ret.x = z.x; - ret.y = z.y; - return ret; -} - -__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) { - return z.x * z.x + z.y * z.y; -} - -__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) { - return make_hipDoubleComplex(p.x + q.x, p.y + q.y); -} - -__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) { - return make_hipDoubleComplex(p.x - q.x, p.y - q.y); -} - -__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) { - return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y); -} - -__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) { - double sqabs = hipCsqabs(q); - hipDoubleComplex ret; - ret.x = (p.x * q.x + p.y * q.y) / sqabs; - ret.y = (p.y * q.x - p.x * q.y) / sqabs; - return ret; -} - -__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return sqrtf(hipCsqabs(z)); } typedef hipFloatComplex hipComplex; diff --git a/hipamd/include/hip/hcc_detail/hip_fp16.h b/hipamd/include/hip/hcc_detail/hip_fp16.h index 93ede207c2..74424a9f8b 100644 --- a/hipamd/include/hip/hcc_detail/hip_fp16.h +++ b/hipamd/include/hip/hcc_detail/hip_fp16.h @@ -34,7 +34,7 @@ THE SOFTWARE. #include #endif -#if defined(__clang__) && (__clang_major__ > 5) +#if __HCC_OR_HIP_CLANG__ typedef _Float16 _Float16_2 __attribute__((ext_vector_type(2))); struct __half_raw { diff --git a/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/hipamd/include/hip/hcc_detail/hip_runtime_api.h index 5b598b54a8..1b332cdb85 100644 --- a/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -275,6 +275,15 @@ typedef struct dim3 { #endif } dim3; +typedef struct hipLaunchParams_t { + void* func; ///< Device function symbol + dim3 gridDim; ///< Grid dimentions + dim3 blockDim; ///< Block dimentions + void **args; ///< Arguments + size_t sharedMem; ///< Shared memory + hipStream_t stream; ///< Stream identifier +} hipLaunchParams; + // Doxygen end group GlobalDefs /** @} */ @@ -1858,6 +1867,16 @@ hipError_t hipMalloc3DArray(hipArray** array, const struct hipChannelFormatDesc* */ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind); + +/** +* @brief Copies memory for 2D arrays. +* @param[in] pCopy Parameters for the memory copy + * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection + * + * @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, + * hipMemcpyToSymbol, hipMemcpyAsync +*/ hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy); /** @@ -2832,6 +2851,77 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigne unsigned int sharedMemBytes, hipStream_t stream, void** kernelParams, void** extra); +/** + * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed + * to kernelparams or extra, where thread blocks can cooperate and synchronize as they execute + * + * @param [in] f Kernel to launch. + * @param [in] gridDim Grid dimensions specified as multiple of blockDim. + * @param [in] blockDim Block dimensions specified in work-items + * @param [in] kernelParams A list of kernel arguments + * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The + * kernel can access this with HIP_DYNAMIC_SHARED. + * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th + * default stream is used with associated synchronization rules. + * + * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue + */ +hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX, + void** kernelParams, unsigned int sharedMemBytes, + hipStream_t stream); + +/** + * @brief Launches kernels on multiple devices where thread blocks can cooperate and + * synchronize as they execute. + * + * @param [in] hipLaunchParams List of launch parameters, one per device. + * @param [in] numDevices Size of the launchParamsList array. + * @param [in] flags Flags to control launch behavior. + * + * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue + */ +hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, + int numDevices, unsigned int flags); + +/** + * @brief Returns occupancy for a device function. + * + * @param [out] numBlocks Returned occupancy + * @param [in] func Kernel function for which occupancy is calulated + * @param [in] blockSize Block size the kernel is intended to be launched with + * @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes + */ +hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( + int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize); + +/** + * @brief Returns occupancy for a device function. + * + * @param [out] numBlocks Returned occupancy + * @param [in] func Kernel function for which occupancy is calulated + * @param [in] blockSize Block size the kernel is intended to be launched with + * @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes + * @param [in] flags Extra flags for occupancy calculation (currently ignored) + */ +hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize, unsigned int flags); + +/** + * @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched + * on respective streams before enqueuing any other work on the specified streams from any other threads + * + * + * @param [in] hipLaunchParams List of launch parameters, one per device. + * @param [in] numDevices Size of the launchParamsList array. + * @param [in] flags Flags to control launch behavior. + * + * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue + */ +hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, + int numDevices, unsigned int flags); + + + // doxygen end Version Management /** * @} @@ -3160,6 +3250,40 @@ hipError_t hipBindTextureToMipmappedArray(const texture& tex, return hipSuccess; } +template +inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( + int* numBlocks, T f, int blockSize, size_t dynamicSMemSize) { + return hipOccupancyMaxActiveBlocksPerMultiprocessor( + numBlocks, reinterpret_cast(f), blockSize, dynamicSMemSize); +} + +template +inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + int* numBlocks, T f, int blockSize, size_t dynamicSMemSize, unsigned int flags) { + return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + numBlocks, reinterpret_cast(f), blockSize, dynamicSMemSize, flags); +} + +template +inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim, + void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) { + return hipLaunchCooperativeKernel(reinterpret_cast(f), gridDim, + blockDim, kernelParams, sharedMemBytes, stream); +} + +template +inline hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, + unsigned int numDevices, unsigned int flags = 0) { + return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags); +} + +template +inline hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, + unsigned int numDevices, unsigned int flags = 0) { + return hipExtLaunchMultiKernelMultiDevice(launchParamsList, numDevices, flags); +} + + /* * @brief Unbinds the textuer bound to @p tex * diff --git a/hipamd/include/hip/hcc_detail/hip_texture_types.h b/hipamd/include/hip/hcc_detail/hip_texture_types.h index 0a68b507e8..b229f4e696 100644 --- a/hipamd/include/hip/hcc_detail/hip_texture_types.h +++ b/hipamd/include/hip/hcc_detail/hip_texture_types.h @@ -45,10 +45,15 @@ THE SOFTWARE. * * * * *******************************************************************************/ +#if __HIP__ +#define __HIP_TEXTURE_ATTRIB __attribute__((device_builtin_texture_type)) +#else +#define __HIP_TEXTURE_ATTRIB +#endif template -struct texture : public textureReference { +struct __HIP_TEXTURE_ATTRIB texture : public textureReference { texture(int norm = 0, enum hipTextureFilterMode fMode = hipFilterModePoint, enum hipTextureAddressMode aMode = hipAddressModeClamp) { normalized = norm; diff --git a/hipamd/include/hip/hip_runtime_api.h b/hipamd/include/hip/hip_runtime_api.h index e7ecede8c1..e3c10766e9 100644 --- a/hipamd/include/hip/hip_runtime_api.h +++ b/hipamd/include/hip/hip_runtime_api.h @@ -115,6 +115,8 @@ typedef struct hipDeviceProp_t { int canMapHostMemory; ///< Check whether HIP can map host memory int gcnArch; ///< AMD GCN Arch Value. Eg: 803, 701 int integrated; ///< APU vs dGPU + int cooperativeLaunch; ///< HIP device supports cooperative launch + int cooperativeMultiDeviceLaunch; ///< HIP device supports cooperative launch on multiple devices } hipDeviceProp_t; @@ -291,6 +293,8 @@ typedef enum hipDeviceAttribute_t { ///< Multiprocessor. hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices. hipDeviceAttributeIntegrated, ///< iGPU + hipDeviceAttributeCooperativeLaunch, ///< Support cooperative launch + hipDeviceAttributeCooperativeMultiDeviceLaunch, ///< Support cooperative launch on multiple devices } hipDeviceAttribute_t; enum hipComputeMode { diff --git a/hipamd/include/hip/nvcc_detail/hip_runtime_api.h b/hipamd/include/hip/nvcc_detail/hip_runtime_api.h index 2ebd4f8b7d..c1846c1b1e 100644 --- a/hipamd/include/hip/nvcc_detail/hip_runtime_api.h +++ b/hipamd/include/hip/nvcc_detail/hip_runtime_api.h @@ -162,6 +162,7 @@ typedef CUdeviceptr hipDeviceptr_t; typedef struct cudaArray hipArray; typedef struct cudaArray* hipArray_const_t; typedef cudaFuncAttributes hipFuncAttributes; +#define hip_Memcpy2D CUDA_MEMCPY2D #define hipMemcpy3DParms cudaMemcpy3DParms #define hipArrayDefault cudaArrayDefault #define hipArrayLayered cudaArrayLayered @@ -578,6 +579,10 @@ inline static hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, cudaMemcpy2D(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind))); } +inline static hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) { + return hipCUResultTohipError(cuMemcpy2D(pCopy)); +} + inline static hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p) { return hipCUDAErrorTohipError(cudaMemcpy3D(p)); diff --git a/hipamd/packaging/hip_hcc.txt b/hipamd/packaging/hip_hcc.txt index 5aebc6c36d..365af8ef1e 100644 --- a/hipamd/packaging/hip_hcc.txt +++ b/hipamd/packaging/hip_hcc.txt @@ -3,7 +3,9 @@ project(hip_hcc) install(FILES @PROJECT_BINARY_DIR@/libhip_hcc.so DESTINATION lib) install(FILES @PROJECT_BINARY_DIR@/libhip_hcc_static.a DESTINATION lib) -install(FILES @PROJECT_BINARY_DIR@/libhiprtc.so DESTINATION lib) +if(NOT @HIP_COMPILER@ STREQUAL "clang") + install(FILES @PROJECT_BINARY_DIR@/libhiprtc.so DESTINATION lib) +endif() install(FILES @PROJECT_BINARY_DIR@/.hipInfo DESTINATION lib) install(FILES @PROJECT_BINARY_DIR@/hip-config.cmake @PROJECT_BINARY_DIR@/hip-config-version.cmake DESTINATION lib/cmake/hip) install(FILES @hip_SOURCE_DIR@/packaging/hip-targets.cmake @hip_SOURCE_DIR@/packaging/hip-targets-release.cmake DESTINATION lib/cmake/hip) diff --git a/hipamd/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp b/hipamd/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp index a6ba44696a..36d37a4fad 100644 --- a/hipamd/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp +++ b/hipamd/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp @@ -71,8 +71,8 @@ bool runTest(int argc, char** argv) { copyParam.srcMemoryType = hipMemoryTypeHost; copyParam.srcHost = hData; copyParam.srcPitch = width * sizeof(float); - copyParam.widthInBytes = copyParam.srcPitch; - copyParam.height = height; + copyParam.WidthInBytes = copyParam.srcPitch; + copyParam.Height = height; hipMemcpyParam2D(©Param); textureReference* texref; diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index 0fad8ab890..36edcdb338 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -1715,8 +1715,8 @@ hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) { if (pCopy == nullptr) { e = hipErrorInvalidValue; } - e = ihipMemcpy2D(pCopy->dstArray->data, pCopy->widthInBytes, pCopy->srcHost, pCopy->srcPitch, - pCopy->widthInBytes, pCopy->height, hipMemcpyDefault); + e = ihipMemcpy2D(pCopy->dstArray->data, pCopy->WidthInBytes, pCopy->srcHost, pCopy->srcPitch, + pCopy->WidthInBytes, pCopy->Height, hipMemcpyDefault); return ihipLogStatus(e); } diff --git a/hipamd/src/hip_module.cpp b/hipamd/src/hip_module.cpp index 91544b82a8..a602aa971a 100644 --- a/hipamd/src/hip_module.cpp +++ b/hipamd/src/hip_module.cpp @@ -59,6 +59,18 @@ using namespace std; static const size_t HIP_IMPLICIT_KERNARG_SIZE = 48; static const size_t HIP_IMPLICIT_KERNARG_ALIGNMENT = 8; +struct amd_kernel_code_v3_t { + uint32_t group_segment_fixed_size; + uint32_t private_segment_fixed_size; + uint8_t reserved0[8]; + int64_t kernel_code_entry_byte_offset; + uint8_t reserved1[24]; + uint32_t compute_pgm_rsrc1; + uint32_t compute_pgm_rsrc2; + uint16_t kernel_code_properties; + uint8_t reserved2[6]; +}; + // calculate MD5 checksum inline std::string checksum(size_t size, const char *source) { // FNV-1a hashing, 64-bit version @@ -191,7 +203,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, lp.dynamic_group_mem_bytes = sharedMemBytes; // TODO - this should be part of preLaunchKernel. hStream = ihipPreLaunchKernel( - hStream, dim3(globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ), + hStream, dim3(globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ), dim3(localWorkSizeX, localWorkSizeY, localWorkSizeZ), &lp, f->_name.c_str()); @@ -208,10 +220,20 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, aql.grid_size_x = globalWorkSizeX; aql.grid_size_y = globalWorkSizeY; aql.grid_size_z = globalWorkSizeZ; - aql.group_segment_size = - f->_header->workgroup_group_segment_byte_size + sharedMemBytes; - aql.private_segment_size = - f->_header->workitem_private_segment_byte_size; + bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos; + if (is_code_object_v3) { + const auto* header = + reinterpret_cast(f->_header); + aql.group_segment_size = + header->group_segment_fixed_size + sharedMemBytes; + aql.private_segment_size = + header->private_segment_fixed_size; + } else { + aql.group_segment_size = + f->_header->workgroup_group_segment_byte_size + sharedMemBytes; + aql.private_segment_size = + f->_header->workitem_private_segment_byte_size; + } aql.kernel_object = f->_object; aql.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; aql.header = @@ -464,6 +486,12 @@ hipError_t ihipModuleGetFunction(hipFunction_t* func, hipModule_t hmod, const ch auto kernel = find_kernel_by_name(hmod->executable, name, agent); + if (kernel.handle == 0u) { + std::string name_str(name); + name_str.append(".kd"); + kernel = find_kernel_by_name(hmod->executable, name_str.c_str(), agent); + } + if (kernel.handle == 0u) return hipErrorNotFound; // TODO: refactor the whole ihipThisThat, which is a mess and yields the @@ -488,7 +516,11 @@ hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod, } namespace { -hipFuncAttributes make_function_attributes(const amd_kernel_code_t& header) { +const amd_kernel_code_v3_t *header_v3(const ihipModuleSymbol_t& kd) { + return reinterpret_cast(kd._header); +} + +hipFuncAttributes make_function_attributes(const ihipModuleSymbol_t& kd) { hipFuncAttributes r{}; hipDeviceProp_t prop{}; @@ -497,16 +529,31 @@ hipFuncAttributes make_function_attributes(const amd_kernel_code_t& header) { // available per CU, therefore we hardcode it to 64 KiRegisters. prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024; - r.localSizeBytes = header.workitem_private_segment_byte_size; - r.sharedSizeBytes = header.workgroup_group_segment_byte_size; + bool is_code_object_v3 = kd._name.find(".kd") != std::string::npos; + if (is_code_object_v3) { + r.localSizeBytes = header_v3(kd)->private_segment_fixed_size; + r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size; + } else { + r.localSizeBytes = kd._header->workitem_private_segment_byte_size; + r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size; + } r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes; - r.numRegs = header.workitem_vgpr_count; + if (is_code_object_v3) { + r.numRegs = ((header_v3(kd)->compute_pgm_rsrc1 & 0x3F) + 1) << 2; + } else { + r.numRegs = kd._header->workitem_vgpr_count; + } r.maxThreadsPerBlock = r.numRegs ? std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) : prop.maxThreadsPerBlock; - r.binaryVersion = - header.amd_machine_version_major * 10 + - header.amd_machine_version_minor; + if (is_code_object_v3) { + r.binaryVersion = 0; // FIXME: should it be the ISA version or code + // object format version? + } else { + r.binaryVersion = + kd._header->amd_machine_version_major * 10 + + kd._header->amd_machine_version_minor; + } r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0. return r; @@ -522,11 +569,10 @@ hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func) auto agent = this_agent(); auto kd = get_program_state().kernel_descriptor(reinterpret_cast(func), agent); - const auto header = kd->_header; - if (!header) throw runtime_error{"Ill-formed Kernel_descriptor."}; + if (!kd->_header) throw runtime_error{"Ill-formed Kernel_descriptor."}; - *attr = make_function_attributes(*header); + *attr = make_function_attributes(*kd); return hipSuccess; } @@ -557,11 +603,9 @@ hipError_t ihipModuleLoadData(hipModule_t* module, const void* image) { (*module)->executable = get_program_state().load_executable( content.data(), content.size(), (*module)->executable, this_agent()); - istringstream elf{content}; - ELFIO::elfio reader; - if (reader.load(elf)) { - program_state_impl::read_kernarg_metadata(reader, (*module)->kernargs); - } + + std::vector blob(content.cbegin(), content.cend()); + program_state_impl::read_kernarg_metadata(blob, (*module)->kernargs); // compute the hash of the code object (*module)->hash = checksum(content.length(), content.data()); diff --git a/hipamd/src/program_state.inl b/hipamd/src/program_state.inl index f1397b3fe9..639eac9228 100644 --- a/hipamd/src/program_state.inl +++ b/hipamd/src/program_state.inl @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -540,9 +541,13 @@ public: std::call_once(functions[agent].first, [this](hsa_agent_t aa) { for (auto&& function : get_function_names()) { - const auto it = get_kernels(aa).find(function.second); + auto it = get_kernels(aa).find(function.second); - if (it == get_kernels(aa).cend()) continue; + if (it == get_kernels(aa).cend()) { + it = get_kernels(aa).find(function.second + ".kd"); + if (it == get_kernels(aa).cend()) + continue; + } for (auto&& kernel_symbol : it->second) { functions[aa].second.emplace( @@ -556,92 +561,172 @@ public: } static - std::size_t parse_args( - const std::string& metadata, - std::size_t f, - std::size_t l, + std::string metadata_to_string(const amd_comgr_metadata_node_t& md) { + std::string str; + size_t size; + + if (amd_comgr_get_metadata_string(md, &size, NULL) + == AMD_COMGR_STATUS_SUCCESS) { + str.resize(size - 1); + amd_comgr_get_metadata_string(md, &size, &str[0]); + } + return str; + } + + static + void parse_args( + const amd_comgr_metadata_node_t& args_md, + bool is_code_object_v3, std::vector>& size_align) { - if (f == l) return f; - if (!size_align.empty()) return l; + size_t arg_count = 0; + if (amd_comgr_get_metadata_list_size(args_md, &arg_count) + != AMD_COMGR_STATUS_SUCCESS) + return; - do { - static constexpr size_t size_sz{5}; - f = metadata.find("Size:", f) + size_sz; + for (size_t i = 0; i < arg_count; ++i) { + amd_comgr_metadata_node_t arg_md; - if (l <= f) return f; + if (amd_comgr_index_list_metadata(args_md, i, &arg_md) + != AMD_COMGR_STATUS_SUCCESS) + return; - auto size = std::strtoul(&metadata[f], nullptr, 10); + amd_comgr_metadata_node_t arg_size_md; + if (amd_comgr_metadata_lookup(arg_md, + is_code_object_v3 ? ".size" : "Size", + &arg_size_md) + != AMD_COMGR_STATUS_SUCCESS) + return; - static constexpr size_t align_sz{6}; - f = metadata.find("Align:", f) + align_sz; + size_t arg_size = std::stoul(metadata_to_string(arg_size_md)); - char* l{}; - auto align = std::strtoul(&metadata[f], &l, 10); + if (amd_comgr_destroy_metadata(arg_size_md) + != AMD_COMGR_STATUS_SUCCESS) + return; - f += (l - &metadata[f]) + 1; + size_t arg_align; - size_align.emplace_back(size, align); - } while (true); + if (is_code_object_v3) { + amd_comgr_metadata_node_t arg_offset_md; + if (amd_comgr_metadata_lookup(arg_md, ".offset", &arg_offset_md) + != AMD_COMGR_STATUS_SUCCESS) + return; + + size_t arg_offset + = std::stoul(metadata_to_string(arg_offset_md)); + + if (amd_comgr_destroy_metadata(arg_offset_md) + != AMD_COMGR_STATUS_SUCCESS) + return; + + arg_align = 1; + while (arg_offset && (arg_offset & 1) == 0) { + arg_offset >>= 1; + arg_align <<= 1; + } + } else { + amd_comgr_metadata_node_t arg_align_md; + if (amd_comgr_metadata_lookup(arg_md, "Align", &arg_align_md) + != AMD_COMGR_STATUS_SUCCESS) + return; + + arg_align = std::stoul(metadata_to_string(arg_align_md)); + + if (amd_comgr_destroy_metadata(arg_align_md) + != AMD_COMGR_STATUS_SUCCESS) + return; + } + + size_align.emplace_back(arg_size, arg_align); + + if (amd_comgr_destroy_metadata(arg_md) + != AMD_COMGR_STATUS_SUCCESS) + return; + } } static void read_kernarg_metadata( - ELFIO::elfio& reader, + const std::vector& blob, std::unordered_map< std::string, std::vector>>& kernargs) { - // TODO: this is inefficient. - auto it = find_section_if(reader, [](const ELFIO::section* x) { - return x->get_type() == SHT_NOTE; - }); + amd_comgr_data_t dataIn; + amd_comgr_status_t status; - if (!it) return; + if (amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, &dataIn) + != AMD_COMGR_STATUS_SUCCESS) + return; - const ELFIO::note_section_accessor acc{reader, it}; - for (decltype(acc.get_notes_num()) i = 0; i != acc.get_notes_num(); ++i) { - ELFIO::Elf_Word type{}; - std::string name{}; - void* desc{}; - ELFIO::Elf_Word desc_size{}; + if (amd_comgr_set_data(dataIn, blob.size(), blob.data()) + != AMD_COMGR_STATUS_SUCCESS) + return; - acc.get_note(i, type, name, desc, desc_size); + amd_comgr_metadata_node_t metadata; + if (amd_comgr_get_data_metadata(dataIn, &metadata) + != AMD_COMGR_STATUS_SUCCESS) + return; - if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA. - - std::string tmp{ - static_cast(desc), static_cast(desc) + desc_size}; - - auto dx = tmp.find("Kernels:"); - - if (dx == std::string::npos) continue; - - static constexpr decltype(tmp.size()) kernels_sz{8}; - dx += kernels_sz; - - do { - dx = tmp.find("Name:", dx); - - if (dx == std::string::npos) break; - - static constexpr decltype(tmp.size()) name_sz{5}; - dx = tmp.find_first_not_of(" '", dx + name_sz); - - auto fn = tmp.substr(dx, tmp.find_first_of("'\n", dx) - dx); - dx += fn.size(); - - auto dx1 = tmp.find("CodeProps", dx); - dx = tmp.find("Args:", dx); - - if (dx1 < dx) { - dx = dx1; - continue; - } - if (dx == std::string::npos) break; - - static constexpr decltype(tmp.size()) args_sz{5}; - dx = parse_args(tmp, dx + args_sz, dx1, kernargs[fn]); - } while (true); + bool is_code_object_v3 = false; + amd_comgr_metadata_node_t kernels_md; + if (amd_comgr_metadata_lookup(metadata, "Kernels", &kernels_md) + != AMD_COMGR_STATUS_SUCCESS) { + if (amd_comgr_metadata_lookup(metadata, + "amdhsa.kernels", + &kernels_md) + != AMD_COMGR_STATUS_SUCCESS) + return; + is_code_object_v3 = true; } + + size_t kernel_count = 0; + if (amd_comgr_get_metadata_list_size(kernels_md, &kernel_count) + != AMD_COMGR_STATUS_SUCCESS) + return; + + for (size_t i = 0; i < kernel_count; i++) { + amd_comgr_metadata_node_t kernel_md; + + if (amd_comgr_index_list_metadata(kernels_md, i, &kernel_md) + != AMD_COMGR_STATUS_SUCCESS) + continue; + + amd_comgr_metadata_node_t name_md; + if (amd_comgr_metadata_lookup(kernel_md, + is_code_object_v3 ? ".name" : "Name", + &name_md) + != AMD_COMGR_STATUS_SUCCESS) + continue; + + std::string kernel_name_str = metadata_to_string(name_md); + + if (amd_comgr_destroy_metadata(name_md) + != AMD_COMGR_STATUS_SUCCESS) + continue; + + if (is_code_object_v3) + kernel_name_str.append(".kd"); + + + amd_comgr_metadata_node_t args_md; + if (amd_comgr_metadata_lookup(kernel_md, + is_code_object_v3 ? ".args" : "Args", + &args_md) + != AMD_COMGR_STATUS_SUCCESS) + continue; + + parse_args(args_md, is_code_object_v3, kernargs[kernel_name_str]); + + if (amd_comgr_destroy_metadata(args_md) != AMD_COMGR_STATUS_SUCCESS + || amd_comgr_destroy_metadata(kernel_md) + != AMD_COMGR_STATUS_SUCCESS) + continue; + } + + if (amd_comgr_destroy_metadata(kernels_md) != AMD_COMGR_STATUS_SUCCESS + || amd_comgr_destroy_metadata(metadata) != AMD_COMGR_STATUS_SUCCESS) + return; + + amd_comgr_release_data(dataIn); } const std::unordered_mapsecond); if (it1 == get_kernargs().end()) { - hip_throw(std::runtime_error{ - "Missing metadata for __global__ function: " + it->second}); + it1 = get_kernargs().find(it->second + ".kd"); + if (it1 == get_kernargs().end()) { + hip_throw(std::runtime_error{ + "Missing metadata for __global__ function: " + it->second}); + } } return it1->second; diff --git a/hipamd/tests/hipify-clang/unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp b/hipamd/tests/hipify-clang/unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp new file mode 100644 index 0000000000..d5dffd0b09 --- /dev/null +++ b/hipamd/tests/hipify-clang/unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp @@ -0,0 +1,36 @@ +// RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args +/* +Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +// CHECK: #include +#include +// CHECK-NOT: #include +#include + +// CHECK: extern texture tex; +extern texture tex; + +extern "C" __global__ void tex2dKernel(float* outputData, int width, int height) { + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; + outputData[y * width + x] = tex2D(tex, x, y); +} diff --git a/hipamd/tests/src/hiprtc/hiprtcGetLoweredName.cpp b/hipamd/tests/src/hiprtc/hiprtcGetLoweredName.cpp index e3fa057a81..d9e6d71f93 100644 --- a/hipamd/tests/src/hiprtc/hiprtcGetLoweredName.cpp +++ b/hipamd/tests/src/hiprtc/hiprtcGetLoweredName.cpp @@ -21,7 +21,7 @@ THE SOFTWARE. */ /* HIT_START * BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc - * TEST: %t + * TEST: %t EXCLUDE_HIP_PLATFORM hcc * HIT_END */ #include diff --git a/hipamd/tests/src/hiprtc/hiprtcGetTypeName.cpp b/hipamd/tests/src/hiprtc/hiprtcGetTypeName.cpp index b0348408f3..7f49ea984b 100644 --- a/hipamd/tests/src/hiprtc/hiprtcGetTypeName.cpp +++ b/hipamd/tests/src/hiprtc/hiprtcGetTypeName.cpp @@ -21,7 +21,7 @@ THE SOFTWARE. */ /* HIT_START * BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc - * TEST: %t + * TEST: %t EXCLUDE_HIP_PLATFORM hcc * HIT_END */ diff --git a/hipamd/tests/src/hiprtc/saxpy.cpp b/hipamd/tests/src/hiprtc/saxpy.cpp index 5f9dc7a125..437420266d 100644 --- a/hipamd/tests/src/hiprtc/saxpy.cpp +++ b/hipamd/tests/src/hiprtc/saxpy.cpp @@ -21,7 +21,7 @@ THE SOFTWARE. */ /* HIT_START * BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc - * TEST: %t + * TEST: %t EXCLUDE_HIP_PLATFORM hcc * HIT_END */ diff --git a/hipamd/tests/src/runtimeApi/module/vcpy_kernel.code b/hipamd/tests/src/runtimeApi/module/vcpy_kernel.code index 737ee1ce9b..4246151be1 100755 Binary files a/hipamd/tests/src/runtimeApi/module/vcpy_kernel.code and b/hipamd/tests/src/runtimeApi/module/vcpy_kernel.code differ diff --git a/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp b/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp index c365dce70a..8160ac4bc3 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp @@ -178,7 +178,9 @@ void runTests(int64_t numElements) { { test(0x01, C_d, C_h, numElements, syncNone, true /*expectMismatch*/); test(0x02, C_d, C_h, numElements, syncNullStream, false /*expectMismatch*/); +#ifndef __HIP_CLANG_ONLY__ test(0x04, C_d, C_h, numElements, syncOtherStream, true /*expectMismatch*/); +#endif test(0x08, C_d, C_h, numElements, syncDevice, false /*expectMismatch*/); // Sending a marker to to null stream may synchronize the otherStream