Merge branch 'master' into implicit-kernarg
Этот коммит содержится в:
@@ -271,6 +271,21 @@ if(HIP_PLATFORM STREQUAL "hcc")
|
||||
target_link_libraries(hiprtc PUBLIC stdc++fs)
|
||||
endif()
|
||||
|
||||
|
||||
if(HIP_PLATFORM STREQUAL "hcc")
|
||||
find_package(amd_comgr REQUIRED CONFIG
|
||||
PATHS
|
||||
/opt/rocm/
|
||||
PATH_SUFFIXES
|
||||
cmake/amd_comgr
|
||||
lib/cmake/amd_comgr
|
||||
)
|
||||
MESSAGE(STATUS "Code Object Manager found at ${amd_comgr_DIR}.")
|
||||
endif()
|
||||
|
||||
target_link_libraries(hip_hcc PRIVATE amd_comgr)
|
||||
target_link_libraries(hip_hcc_static PRIVATE amd_comgr)
|
||||
|
||||
string(REPLACE " " ";" HCC_CXX_FLAGS_LIST ${HCC_CXX_FLAGS})
|
||||
foreach(TARGET hip_hcc hip_hcc_static)
|
||||
target_include_directories(${TARGET} SYSTEM INTERFACE $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>;${HSA_PATH}/include)
|
||||
|
||||
+33
-26
@@ -107,24 +107,25 @@ $HIP_RUNTIME= $hipConfig{'HIP_RUNTIME'};
|
||||
|
||||
# If using VDI runtime, need to find HIP_VDI_HOME
|
||||
if ($HIP_RUNTIME eq "VDI" and !defined $HIP_VDI_HOME) {
|
||||
$HIP_VDI_HOME = "/opt/rocm/hip"
|
||||
my $hipcc_dir = dirname($0);
|
||||
if (-e "$hipcc_dir/../lib/bitcode") {
|
||||
$HIP_VDI_HOME = abs_path($hipcc_dir . "/..");
|
||||
} else {
|
||||
$HIP_VDI_HOME = "/opt/rocm/hip";
|
||||
}
|
||||
}
|
||||
|
||||
if (defined $HIP_VDI_HOME) {
|
||||
my $bits = "";
|
||||
if (-d "$HIP_VDI_HOME/bin/x86_64") {
|
||||
$bits = "/x86_64";
|
||||
if (!defined $HIP_CLANG_PATH and (-e "$HIP_VDI_HOME/bin/clang" or -e "$HIP_VDI_HOME/bin/clang.exe")) {
|
||||
$HIP_CLANG_PATH = "$HIP_VDI_HOME/bin";
|
||||
$HIP_CLANG_INCLUDE_PATH = "$HIP_VDI_HOME/include/clang";
|
||||
}
|
||||
if (!defined $HIP_CLANG_PATH) {
|
||||
$HIP_CLANG_PATH = "$HIP_VDI_HOME/bin" . $bits;
|
||||
if (!defined $DEVICE_LIB_PATH and -e "$HIP_VDI_HOME/lib/bitcode") {
|
||||
$DEVICE_LIB_PATH = "$HIP_VDI_HOME/lib/bitcode";
|
||||
}
|
||||
if (!defined $DEVICE_LIB_PATH) {
|
||||
$DEVICE_LIB_PATH = "$HIP_VDI_HOME/lib" . $bits . "/bitcode";
|
||||
}
|
||||
$HIP_CLANG_INCLUDE_PATH = "$HIP_VDI_HOME/include/clang";
|
||||
$HIP_INCLUDE_PATH = "$HIP_VDI_HOME/include";
|
||||
if (!defined $HIP_LIB_PATH) {
|
||||
$HIP_LIB_PATH = "$HIP_VDI_HOME/lib" . $bits;
|
||||
$HIP_LIB_PATH = "$HIP_VDI_HOME/lib";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -169,7 +170,7 @@ if ($HIP_PLATFORM eq "clang") {
|
||||
$HIP_CLANG_VERSION=$1;
|
||||
|
||||
if (! defined $HIP_CLANG_INCLUDE_PATH) {
|
||||
$HIP_CLANG_INCLUDE_PATH = "$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/include";
|
||||
$HIP_CLANG_INCLUDE_PATH = abs_path("$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/include");
|
||||
}
|
||||
if (! defined $HIP_INCLUDE_PATH) {
|
||||
$HIP_INCLUDE_PATH = "$HIP_PATH/include";
|
||||
@@ -370,7 +371,7 @@ if($HIP_PLATFORM eq "nvcc"){
|
||||
|
||||
my $toolArgs = ""; # arguments to pass to the hcc or nvcc tool
|
||||
my $optArg = ""; # -O args
|
||||
my $rdc = 0;
|
||||
my $gArg = ""; # -g args
|
||||
|
||||
foreach $arg (@ARGV)
|
||||
{
|
||||
@@ -466,13 +467,9 @@ foreach $arg (@ARGV)
|
||||
{
|
||||
$optArg = $arg;
|
||||
}
|
||||
if($arg =~ /-fgpu-rdc/)
|
||||
if($arg =~ m/^-g/)
|
||||
{
|
||||
$rdc = 1;
|
||||
}
|
||||
if($arg =~ /-fno-gpu-rdc/)
|
||||
{
|
||||
$rdc = 0;
|
||||
$gArg = $arg;
|
||||
}
|
||||
|
||||
## process linker response file for hip-clang
|
||||
@@ -799,16 +796,12 @@ if ($needHipHcc) {
|
||||
|
||||
if ($HIP_PLATFORM eq "clang") {
|
||||
# Set default optimization level to -O3 for hip-clang.
|
||||
if ($optArg eq "") {
|
||||
if ($optArg eq "" and $gArg ne "-g") {
|
||||
$HIPCXXFLAGS .= " -O3";
|
||||
$HIPLDFLAGS .= " -O3";
|
||||
}
|
||||
$HIP_DEVLIB_FLAGS = " --hip-device-lib-path=$DEVICE_LIB_PATH";
|
||||
if ($rdc eq 0) {
|
||||
$HIPCXXFLAGS .= " $HIP_DEVLIB_FLAGS";
|
||||
} else {
|
||||
$HIPLDFLAGS .= " $HIP_DEVLIB_FLAGS";
|
||||
}
|
||||
$HIPCXXFLAGS .= " $HIP_DEVLIB_FLAGS";
|
||||
if ($isWindows) {
|
||||
$HIPCXXFLAGS .= " -std=c++14 -fms-extensions -fms-compatibility";
|
||||
} else {
|
||||
@@ -848,7 +841,21 @@ if ($runCmd) {
|
||||
print ("HIP ($HIP_PATH) was built using hcc $hipConfig{'HCC_VERSION'}, but you are using $HCC_HOME/hcc with version $HCC_VERSION from hipcc. Please rebuild HIP including cmake or update HCC_HOME variable.\n") ;
|
||||
die unless $ENV{'HIP_IGNORE_HCC_VERSION'};
|
||||
}
|
||||
system ("$CMD") or delete_temp_dirs () and die ();
|
||||
system ("$CMD");
|
||||
if ($? == -1) {
|
||||
print "failed to execute: $!\n";
|
||||
exit($?);
|
||||
}
|
||||
elsif ($? & 127) {
|
||||
printf "child died with signal %d, %s coredump\n",
|
||||
($? & 127), ($? & 128) ? 'with' : 'without';
|
||||
exit($?);
|
||||
}
|
||||
else {
|
||||
$CMD_EXIT_CODE = $? >> 8;
|
||||
}
|
||||
$? or delete_temp_dirs ();
|
||||
exit($CMD_EXIT_CODE);
|
||||
}
|
||||
|
||||
# vim: ts=4:sw=4:expandtab:smartindent
|
||||
|
||||
@@ -861,7 +861,7 @@
|
||||
| `cuMemAllocManaged` | |
|
||||
| `cuMemAllocPitch` | |
|
||||
| `cuMemcpy` | |
|
||||
| `cuMemcpy2D` | |
|
||||
| `cuMemcpy2D` | `hipMemcpyParam2D` |
|
||||
| `cuMemcpy2DAsync` | |
|
||||
| `cuMemcpy2DUnaligned` | |
|
||||
| `cuMemcpy3D` | |
|
||||
|
||||
@@ -246,6 +246,99 @@
|
||||
| enum |***`cudnnWgradMode_t`*** | |
|
||||
| 0 |*`CUDNN_WGRAD_MODE_ADD`* | |
|
||||
| 1 |*`CUDNN_WGRAD_MODE_SET`* | |
|
||||
| enum |***`cudnnReorderType_t`*** | |
|
||||
| 0 |*`CUDNN_DEFAULT_REORDER`* | |
|
||||
| 1 |*`CUDNN_NO_REORDER`* | |
|
||||
| enum |***`cudnnLossNormalizationMode_t`*** | |
|
||||
| 0 |*`CUDNN_LOSS_NORMALIZATION_NONE`* | |
|
||||
| 1 |*`CUDNN_LOSS_NORMALIZATION_SOFTMAX`* | |
|
||||
| struct |`cudnnFusedOpsConstParamStruct` | |
|
||||
| struct* |`cudnnFusedOpsConstParamPack_t` | |
|
||||
| struct |`cudnnFusedOpsVariantParamStruct` | |
|
||||
| struct* |`cudnnFusedOpsVariantParamPack_t` | |
|
||||
| struct |`cudnnFusedOpsPlanStruct` | |
|
||||
| struct* |`cudnnFusedOpsPlan_t` | |
|
||||
| enum |***`cudnnFusedOps_t`*** | |
|
||||
| 0 |*`CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS`* | |
|
||||
| 1 |*`CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD`* | |
|
||||
| 2 |*`CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING`* | |
|
||||
| 3 |*`CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE`* | |
|
||||
| 4 |*`CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION`* | |
|
||||
| 5 |*`CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK`* | |
|
||||
| 6 |*`CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM`* | |
|
||||
| enum |***`cudnnFusedOpsConstParamLabel_t`*** | |
|
||||
| 0 |*`CUDNN_PARAM_XDESC`* | |
|
||||
| 1 |*`CUDNN_PARAM_XDATA_PLACEHOLDER`* | |
|
||||
| 2 |*`CUDNN_PARAM_BN_MODE`* | |
|
||||
| 3 |*`CUDNN_PARAM_BN_EQSCALEBIAS_DESC`* | |
|
||||
| 4 |*`CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER`* | |
|
||||
| 5 |*`CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER`* | |
|
||||
| 6 |*`CUDNN_PARAM_ACTIVATION_DESC`* | |
|
||||
| 7 |*`CUDNN_PARAM_CONV_DESC`* | |
|
||||
| 8 |*`CUDNN_PARAM_WDESC`* | |
|
||||
| 9 |*`CUDNN_PARAM_WDATA_PLACEHOLDER`* | |
|
||||
| 10 |*`CUDNN_PARAM_DWDESC`* | |
|
||||
| 11 |*`CUDNN_PARAM_DWDATA_PLACEHOLDER`* | |
|
||||
| 12 |*`CUDNN_PARAM_YDESC`* | |
|
||||
| 13 |*`CUDNN_PARAM_YDATA_PLACEHOLDER`* | |
|
||||
| 14 |*`CUDNN_PARAM_DYDESC`* | |
|
||||
| 15 |*`CUDNN_PARAM_DYDATA_PLACEHOLDER`* | |
|
||||
| 16 |*`CUDNN_PARAM_YSTATS_DESC`* | |
|
||||
| 17 |*`CUDNN_PARAM_YSUM_PLACEHOLDER`* | |
|
||||
| 18 |*`CUDNN_PARAM_YSQSUM_PLACEHOLDER`* | |
|
||||
| 19 |*`CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC`* | |
|
||||
| 20 |*`CUDNN_PARAM_BN_SCALE_PLACEHOLDER`* | |
|
||||
| 21 |*`CUDNN_PARAM_BN_BIAS_PLACEHOLDER`* | |
|
||||
| 22 |*`CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER`* | |
|
||||
| 23 |*`CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER`* | |
|
||||
| 24 |*`CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER`* | |
|
||||
| 25 |*`CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER`* | |
|
||||
| 26 |*`CUDNN_PARAM_ZDESC`* | |
|
||||
| 27 |*`CUDNN_PARAM_ZDATA_PLACEHOLDER`* | |
|
||||
| 28 |*`CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC`* | |
|
||||
| 29 |*`CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER`* | |
|
||||
| 30 |*`CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER`* | |
|
||||
| 31 |*`CUDNN_PARAM_ACTIVATION_BITMASK_DESC`* | |
|
||||
| 32 |*`CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER`* | |
|
||||
| 33 |*`CUDNN_PARAM_DXDESC`* | |
|
||||
| 34 |*`CUDNN_PARAM_DXDATA_PLACEHOLDER`* | |
|
||||
| 35 |*`CUDNN_PARAM_DZDESC`* | |
|
||||
| 36 |*`CUDNN_PARAM_DZDATA_PLACEHOLDER`* | |
|
||||
| 37 |*`CUDNN_PARAM_BN_DSCALE_PLACEHOLDER`* | |
|
||||
| 38 |*`CUDNN_PARAM_BN_DBIAS_PLACEHOLDER`* | |
|
||||
| enum |***`cudnnFusedOpsPointerPlaceHolder_t`*** | |
|
||||
| 0 |*`CUDNN_PTR_NULL`* | |
|
||||
| 1 |*`CUDNN_PTR_ELEM_ALIGNED`* | |
|
||||
| 2 |*`CUDNN_PTR_16B_ALIGNED`* | |
|
||||
| enum |***`cudnnFusedOpsVariantParamLabel_t`*** | |
|
||||
| 0 |*`CUDNN_PTR_XDATA`* | |
|
||||
| 1 |*`CUDNN_PTR_BN_EQSCALE`* | |
|
||||
| 2 |*`CUDNN_PTR_BN_EQBIAS`* | |
|
||||
| 3 |*`CUDNN_PTR_WDATA`* | |
|
||||
| 4 |*`CUDNN_PTR_DWDATA`* | |
|
||||
| 5 |*`CUDNN_PTR_YDATA`* | |
|
||||
| 6 |*`CUDNN_PTR_DYDATA`* | |
|
||||
| 7 |*`CUDNN_PTR_YSUM`* | |
|
||||
| 8 |*`CUDNN_PTR_YSQSUM`* | |
|
||||
| 9 |*`CUDNN_PTR_WORKSPACE`* | |
|
||||
| 10 |*`CUDNN_PTR_BN_SCALE`* | |
|
||||
| 11 |*`CUDNN_PTR_BN_BIAS`* | |
|
||||
| 12 |*`CUDNN_PTR_BN_SAVED_MEAN`* | |
|
||||
| 13 |*`CUDNN_PTR_BN_SAVED_INVSTD`* | |
|
||||
| 14 |*`CUDNN_PTR_BN_RUNNING_MEAN`* | |
|
||||
| 15 |*`CUDNN_PTR_BN_RUNNING_VAR`* | |
|
||||
| 16 |*`CUDNN_PTR_ZDATA`* | |
|
||||
| 17 |*`CUDNN_PTR_BN_Z_EQSCALE`* | |
|
||||
| 18 |*`CUDNN_PTR_BN_Z_EQBIAS`* | |
|
||||
| 19 |*`CUDNN_PTR_ACTIVATION_BITMASK`* | |
|
||||
| 20 |*`CUDNN_PTR_DXDATA`* | |
|
||||
| 21 |*`CUDNN_PTR_DZDATA`* | |
|
||||
| 22 |*`CUDNN_PTR_BN_DSCALE`* | |
|
||||
| 23 |*`CUDNN_PTR_BN_DBIAS`* | |
|
||||
| 100 |*`CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES`* | |
|
||||
| 101 |*`CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT`* | |
|
||||
| 102 |*`CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR`* | |
|
||||
| 103 |*`CUDNN_SCALAR_DOUBLE_BN_EPSILON`* | |
|
||||
|
||||
## **2. CUDNN API functions**
|
||||
|
||||
@@ -282,6 +375,7 @@
|
||||
|`cudnnGetOpTensorDescriptor` |`hipdnnGetOpTensorDescriptor` |
|
||||
|`cudnnDestroyOpTensorDescriptor` |`hipdnnDestroyOpTensorDescriptor` |
|
||||
|`cudnnOpTensor` |`hipdnnOpTensor` |
|
||||
|`cudnnGetFoldedConvBackwardDataDescriptors` | |
|
||||
|`cudnnCreateReduceTensorDescriptor` |`hipdnnCreateReduceTensorDescriptor` |
|
||||
|`cudnnSetReduceTensorDescriptor` |`hipdnnSetReduceTensorDescriptor` |
|
||||
|`cudnnGetReduceTensorDescriptor` |`hipdnnGetReduceTensorDescriptor` |
|
||||
@@ -296,12 +390,17 @@
|
||||
|`cudnnGetFilter4dDescriptor` |`hipdnnGetFilter4dDescriptor` |
|
||||
|`cudnnSetFilterNdDescriptor` |`hipdnnSetFilterNdDescriptor` |
|
||||
|`cudnnGetFilterNdDescriptor` |`hipdnnGetFilterNdDescriptor` |
|
||||
|`cudnnGetFilterSizeInBytes` | |
|
||||
|`cudnnTransformFilter` | |
|
||||
|`cudnnDestroyFilterDescriptor` |`hipdnnDestroyFilterDescriptor` |
|
||||
|`cudnnReorderFilterAndBias` | |
|
||||
|`cudnnCreateConvolutionDescriptor` |`hipdnnCreateConvolutionDescriptor` |
|
||||
|`cudnnSetConvolutionMathType` |`hipdnnSetConvolutionMathType` |
|
||||
|`cudnnGetConvolutionMathType` | |
|
||||
|`cudnnSetConvolutionGroupCount` |`hipdnnSetConvolutionGroupCount` |
|
||||
|`cudnnGetConvolutionGroupCount` | |
|
||||
|`cudnnSetConvolutionReorderType` | |
|
||||
|`cudnnGetConvolutionReorderType` | |
|
||||
|`cudnnSetConvolution2dDescriptor` |`hipdnnSetConvolution2dDescriptor` |
|
||||
|`cudnnGetConvolution2dDescriptor` |`hipdnnGetConvolution2dDescriptor` |
|
||||
|`cudnnGetConvolution2dForwardOutputDim` |`hipdnnGetConvolution2dForwardOutputDim` |
|
||||
@@ -424,7 +523,9 @@
|
||||
|`cudnnGetRNNBiasMode` | |
|
||||
|`cudnnCreateCTCLossDescriptor` | |
|
||||
|`cudnnSetCTCLossDescriptor` | |
|
||||
|`cudnnSetCTCLossDescriptorEx` | |
|
||||
|`cudnnGetCTCLossDescriptor` | |
|
||||
|`cudnnGetCTCLossDescriptorEx` | |
|
||||
|`cudnnDestroyCTCLossDescriptor` | |
|
||||
|`cudnnCTCLoss` | |
|
||||
|`cudnnGetCTCLossWorkspaceSize` | |
|
||||
@@ -462,3 +563,15 @@
|
||||
|`cudnnMultiHeadAttnForward` | |
|
||||
|`cudnnMultiHeadAttnBackwardData` | |
|
||||
|`cudnnMultiHeadAttnBackwardWeights` | |
|
||||
|`cudnnCreateFusedOpsConstParamPack` | |
|
||||
|`cudnnDestroyFusedOpsConstParamPack` | |
|
||||
|`cudnnSetFusedOpsConstParamPackAttribute` | |
|
||||
|`cudnnGetFusedOpsConstParamPackAttribute` | |
|
||||
|`cudnnCreateFusedOpsVariantParamPack` | |
|
||||
|`cudnnDestroyFusedOpsVariantParamPack` | |
|
||||
|`cudnnSetFusedOpsVariantParamPackAttribute` | |
|
||||
|`cudnnGetFusedOpsVariantParamPackAttribute` | |
|
||||
|`cudnnCreateFusedOpsPlan` | |
|
||||
|`cudnnDestroyFusedOpsPlan` | |
|
||||
|`cudnnMakeFusedOpsPlan` | |
|
||||
|`cudnnFusedOpsExecute` | |
|
||||
|
||||
@@ -35,6 +35,9 @@
|
||||
- [Warp Cross-Lane Functions](#warp-cross-lane-functions)
|
||||
* [Warp Vote and Ballot Functions](#warp-vote-and-ballot-functions)
|
||||
* [Warp Shuffle Functions](#warp-shuffle-functions)
|
||||
- [Cooperative Groups Functions](#cooperative-groups-functions)
|
||||
- [Warp Matrix Functions](#warp-matrix-functions)
|
||||
- [Independent Thread Scheduling](#independent-thread-scheduling)
|
||||
- [Profiler Counter Function](#profiler-counter-function)
|
||||
- [Assert](#assert)
|
||||
- [Printf](#printf)
|
||||
@@ -599,6 +602,70 @@ float __shfl_xor (float var, int laneMask, int width=warpSize);
|
||||
|
||||
```
|
||||
|
||||
## Cooperative Groups Functions
|
||||
|
||||
Cooperative groups is a mechanism for forming and communicating between groups of threads at
|
||||
a granularity different than the block. This feature was introduced in Cuda 9.
|
||||
|
||||
HIP does not support any of the kernel language cooperative groups
|
||||
types or functions.
|
||||
|
||||
|
||||
| **Function** | **Supported in HIP** | **Supported in CUDA** |
|
||||
| --- | --- | --- |
|
||||
| `void thread_group.sync()` | | ✓ |
|
||||
| `unsigned thread_group.size()` | | ✓ |
|
||||
| `unsigned thread_group.thread_rank()` | | ✓ |
|
||||
| `bool thread_group.is_valid()` | | ✓ |
|
||||
| `thread_group tiled_partition(thread_group, size)` | | ✓ |
|
||||
| `thread_block_tile<N> tiled_partition<N>(thread_group)` | | ✓ |
|
||||
| `thread_block this_thread_block()` | | ✓ |
|
||||
| `T thread_block_tile.shfl()` | | ✓ |
|
||||
| `T thread_block_tile.shfl_down()` | | ✓ |
|
||||
| `T thread_block_tile.shfl_up()` | | ✓ |
|
||||
| `T thread_block_tile.shfl_xor()` | | ✓ |
|
||||
| `T thread_block_tile.any()` | | ✓ |
|
||||
| `T thread_block_tile.all()` | | ✓ |
|
||||
| `T thread_block_tile.ballot()` | | ✓ |
|
||||
| `T thread_block_tile.match_any()` | | ✓ |
|
||||
| `T thread_block_tile.match_all()` | | ✓ |
|
||||
| `coalesced_group coalesced_threads()` | | ✓ |
|
||||
| `grid_group this_grid()` | | ✓ |
|
||||
| `void grid_group.sync()` | | ✓ |
|
||||
| `unsigned grid_group.size()` | | ✓ |
|
||||
| `unsigned grid_group.thread_rank()` | | ✓ |
|
||||
| `bool grid_group.is_valid()` | | ✓ |
|
||||
| `multi_grid_group this_multi_grid()` | | ✓ |
|
||||
| `void multi_grid_group.sync()` | | ✓ |
|
||||
| `unsigned multi_grid_group.size()` | | ✓ |
|
||||
| `unsigned multi_grid_group.thread_rank()` | | ✓ |
|
||||
| `bool multi_grid_group.is_valid()` | | ✓ |
|
||||
|
||||
## Warp Matrix Functions
|
||||
|
||||
Warp matrix functions allow a warp to cooperatively operate on small matrices
|
||||
whose elements are spread over the lanes in an unspecified manner. This feature
|
||||
was introduced in Cuda 9.
|
||||
|
||||
HIP does not support any of the kernel language warp matrix
|
||||
types or functions.
|
||||
|
||||
| **Function** | **Supported in HIP** | **Supported in CUDA** |
|
||||
| --- | --- | --- |
|
||||
| `void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned lda)` | | ✓ |
|
||||
| `void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned lda, layout_t layout)` | | ✓ |
|
||||
| `void store_matrix_sync(T* mptr, fragment<...> &a, unsigned lda, layout_t layout)` | | ✓ |
|
||||
| `void fill_fragment(fragment<...> &a, const T &value)` | | ✓ |
|
||||
| `void mma_sync(fragment<...> &d, const fragment<...> &a, const fragment<...> &b, const fragment<...> &c , bool sat)` | | ✓ |
|
||||
|
||||
## Independent Thread Scheduling
|
||||
|
||||
The hardware support for independent thread scheduling introduced in certain architectures
|
||||
supporting Cuda allows threads to progress independently of each other and enables
|
||||
intra-warp synchronizations that were previously not allowed.
|
||||
|
||||
HIP does not support this type of scheduling.
|
||||
|
||||
## Profiler Counter Function
|
||||
|
||||
The Cuda `__prof_trigger()` instruction is not supported.
|
||||
|
||||
@@ -142,9 +142,9 @@ To run it:
|
||||
|
||||
* Path to cuDNN should be specified by the `CUDA_DNN_ROOT_DIR` option:
|
||||
|
||||
- Linux: `-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.0-v7.5.1.10`
|
||||
- Linux: `-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.0-v7.6.0.64`
|
||||
|
||||
- Windows: `-DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-9.0-windows10-x64-v7.5.1.10`
|
||||
- Windows: `-DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-9.0-windows10-x64-v7.6.0.64`
|
||||
|
||||
5. Ensure [`python`](https://www.python.org/downloads) of minimum required version 2.7 is installed.
|
||||
|
||||
@@ -178,9 +178,9 @@ To run it:
|
||||
|
||||
On Linux the following configurations are tested:
|
||||
|
||||
Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.5.1.10
|
||||
Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.0.64
|
||||
|
||||
Ubuntu 16-18: LLVM 8.0.0, CUDA 8.0 - 10.0, cudnn-5.1.10 - cudnn-7.5.1.10
|
||||
Ubuntu 16-18: LLVM 8.0.0, CUDA 8.0 - 10.0, cudnn-5.1.10 - cudnn-7.6.0.64
|
||||
|
||||
Build system for the above configurations:
|
||||
|
||||
@@ -195,7 +195,7 @@ cmake
|
||||
-DCMAKE_INSTALL_PREFIX=../dist \
|
||||
-DCMAKE_PREFIX_PATH=/srv/git/LLVM/8.0.0/dist \
|
||||
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.0 \
|
||||
-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.0-v7.5.1.10 \
|
||||
-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.0-v7.6.0.64 \
|
||||
-DLLVM_EXTERNAL_LIT=/srv/git/LLVM/8.0.0/build/bin/llvm-lit \
|
||||
..
|
||||
```
|
||||
@@ -311,9 +311,9 @@ On Windows 10 the following configurations are tested:
|
||||
|
||||
LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn-5.1.10 - cudnn-7.1.4.18
|
||||
|
||||
LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn-7.0.5.15 - cudnn-7.5.1.10
|
||||
LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn-7.0.5.15 - cudnn-7.6.0.64
|
||||
|
||||
LLVM 7.0.0 - 8.0.0 (with patch*), CUDA 7.5 - 10.0, cudnn-7.0.5.15 - cudnn-7.5.1.10
|
||||
LLVM 7.0.0 - 8.0.0 (with patch*), CUDA 7.5 - 10.0, cudnn-7.0.5.15 - cudnn-7.6.0.64
|
||||
|
||||
Build system for the above configurations:
|
||||
|
||||
@@ -330,7 +330,7 @@ cmake
|
||||
-DCMAKE_PREFIX_PATH=f:/LLVM/6.0.1/dist \
|
||||
-DCUDA_TOOLKIT_ROOT_DIR="c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0" \
|
||||
-DCUDA_SDK_ROOT_DIR="c:/ProgramData/NVIDIA Corporation/CUDA Samples/v9.0" \
|
||||
-DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-9.0-windows10-x64-v7.5.1.10 \
|
||||
-DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-9.0-windows10-x64-v7.6.0.64 \
|
||||
-DLLVM_EXTERNAL_LIT=f:/LLVM/6.0.1/build/Release/bin/llvm-lit.py \
|
||||
-Thost=x64
|
||||
..
|
||||
|
||||
@@ -33,6 +33,7 @@ const std::map <llvm::StringRef, hipCounter> CUDA_INCLUDE_MAP{
|
||||
{"driver_types.h", {"hip/driver_types.h", "", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"cuda_fp16.h", {"hip/hip_fp16.h", "", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"cuda_texture_types.h", {"hip/hip_texture_types.h", "", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"texture_fetch_functions.h", {"", "", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"vector_types.h", {"hip/hip_vector_types.h", "", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"cuda_profiler_api.h", {"hip/hip_profile.h", "", CONV_INCLUDE, API_RUNTIME}},
|
||||
// cuComplex includes
|
||||
|
||||
@@ -61,6 +61,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DNN_FUNCTION_MAP{
|
||||
{"cudnnGetOpTensorDescriptor", {"hipdnnGetOpTensorDescriptor", "", CONV_LIB_FUNC, API_DNN}},
|
||||
{"cudnnDestroyOpTensorDescriptor", {"hipdnnDestroyOpTensorDescriptor", "", CONV_LIB_FUNC, API_DNN}},
|
||||
{"cudnnOpTensor", {"hipdnnOpTensor", "", CONV_LIB_FUNC, API_DNN}},
|
||||
{"cudnnGetFoldedConvBackwardDataDescriptors", {"hipdnnGetFoldedConvBackwardDataDescriptors", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
|
||||
// cuDNN Reduce Tensor functions
|
||||
{"cudnnCreateReduceTensorDescriptor", {"hipdnnCreateReduceTensorDescriptor", "", CONV_LIB_FUNC, API_DNN}},
|
||||
@@ -79,7 +80,10 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DNN_FUNCTION_MAP{
|
||||
{"cudnnGetFilter4dDescriptor", {"hipdnnGetFilter4dDescriptor", "", CONV_LIB_FUNC, API_DNN}},
|
||||
{"cudnnSetFilterNdDescriptor", {"hipdnnSetFilterNdDescriptor", "", CONV_LIB_FUNC, API_DNN}},
|
||||
{"cudnnGetFilterNdDescriptor", {"hipdnnGetFilterNdDescriptor", "", CONV_LIB_FUNC, API_DNN}},
|
||||
{"cudnnGetFilterSizeInBytes", {"hipdnnGetFilterSizeInBytes", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnTransformFilter", {"hipdnnTransformFilter", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnDestroyFilterDescriptor", {"hipdnnDestroyFilterDescriptor", "", CONV_LIB_FUNC, API_DNN}},
|
||||
{"cudnnReorderFilterAndBias", {"hipdnnReorderFilterAndBias", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
|
||||
// cuDNN Convolution functions
|
||||
{"cudnnCreateConvolutionDescriptor", {"hipdnnCreateConvolutionDescriptor", "", CONV_LIB_FUNC, API_DNN}},
|
||||
@@ -87,6 +91,8 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DNN_FUNCTION_MAP{
|
||||
{"cudnnGetConvolutionMathType", {"hipdnnGetConvolutionMathType", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnSetConvolutionGroupCount", {"hipdnnSetConvolutionGroupCount", "", CONV_LIB_FUNC, API_DNN}},
|
||||
{"cudnnGetConvolutionGroupCount", {"hipdnnGetConvolutionGroupCount", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnSetConvolutionReorderType", {"hipdnnSetConvolutionReorderType", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnGetConvolutionReorderType", {"hipdnnGetConvolutionReorderType", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnSetConvolution2dDescriptor", {"hipdnnSetConvolution2dDescriptor", "", CONV_LIB_FUNC, API_DNN}},
|
||||
{"cudnnGetConvolution2dDescriptor", {"hipdnnGetConvolution2dDescriptor", "", CONV_LIB_FUNC, API_DNN}},
|
||||
{"cudnnGetConvolution2dForwardOutputDim", {"hipdnnGetConvolution2dForwardOutputDim", "", CONV_LIB_FUNC, API_DNN}},
|
||||
@@ -235,7 +241,9 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DNN_FUNCTION_MAP{
|
||||
// cuDNN Connectionist Temporal Classification loss functions
|
||||
{"cudnnCreateCTCLossDescriptor", {"hipdnnCreateCTCLossDescriptor", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnSetCTCLossDescriptor", {"hipdnnSetCTCLossDescriptor", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnSetCTCLossDescriptorEx", {"hipdnnSetCTCLossDescriptorEx", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnGetCTCLossDescriptor", {"hipdnnGetCTCLossDescriptor", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnGetCTCLossDescriptorEx", {"hipdnnGetCTCLossDescriptorEx", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnDestroyCTCLossDescriptor", {"hipdnnDestroyCTCLossDescriptor", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnCTCLoss", {"hipdnnCTCLoss", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnGetCTCLossWorkspaceSize", {"hipdnnGetCTCLossWorkspaceSize", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
@@ -274,4 +282,18 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DNN_FUNCTION_MAP{
|
||||
{"cudnnMultiHeadAttnForward", {"hipdnnMultiHeadAttnForward", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnMultiHeadAttnBackwardData", {"hipdnnMultiHeadAttnBackwardData", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnMultiHeadAttnBackwardWeights", {"hipdnnMultiHeadAttnBackwardWeights", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
|
||||
// cuDNN Fuse functions
|
||||
{"cudnnCreateFusedOpsConstParamPack", {"hipdnnCreateFusedOpsConstParamPack", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnDestroyFusedOpsConstParamPack", {"hipdnnDestroyFusedOpsConstParamPack", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnSetFusedOpsConstParamPackAttribute", {"hipdnnSetFusedOpsConstParamPackAttribute", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnGetFusedOpsConstParamPackAttribute", {"hipdnnGetFusedOpsConstParamPackAttribute", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnCreateFusedOpsVariantParamPack", {"hipdnnCreateFusedOpsVariantParamPack", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnDestroyFusedOpsVariantParamPack", {"hipdnnDestroyFusedOpsVariantParamPack", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnSetFusedOpsVariantParamPackAttribute", {"hipdnnSetFusedOpsVariantParamPackAttribute", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnGetFusedOpsVariantParamPackAttribute", {"hipdnnGetFusedOpsVariantParamPackAttribute", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnCreateFusedOpsPlan", {"hipdnnCreateFusedOpsPlan", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnDestroyFusedOpsPlan", {"hipdnnDestroyFusedOpsPlan", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnMakeFusedOpsPlan", {"hipdnnMakeFusedOpsPlan", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnFusedOpsExecute", {"hipdnnFusedOpsExecute", "", CONV_LIB_FUNC, API_DNN, HIP_UNSUPPORTED}},
|
||||
};
|
||||
|
||||
@@ -240,6 +240,93 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DNN_TYPE_NAME_MAP{
|
||||
{"cudnnWgradMode_t", {"hipdnnWgradMode_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"CUDNN_WGRAD_MODE_ADD", {"HIPDNN_WGRAD_MODE_ADD", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0
|
||||
{"CUDNN_WGRAD_MODE_SET", {"HIPDNN_WGRAD_MODE_SET", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1
|
||||
{"cudnnReorderType_t", {"hipdnnReorderType_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"CUDNN_DEFAULT_REORDER", {"HIPDNN_DEFAULT_REORDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0
|
||||
{"CUDNN_NO_REORDER", {"HIPDNN_NO_REORDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1
|
||||
{"cudnnLossNormalizationMode_t", {"hipdnnLossNormalizationMode_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"CUDNN_LOSS_NORMALIZATION_NONE", {"HIPDNN_LOSS_NORMALIZATION_NONE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0
|
||||
{"CUDNN_LOSS_NORMALIZATION_SOFTMAX", {"HIPDNN_LOSS_NORMALIZATION_SOFTMAX", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1
|
||||
{"cudnnFusedOps_t", {"hipdnnFusedOps_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS", {"HIPDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0
|
||||
{"CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD", {"HIPDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1
|
||||
{"CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING", {"HIPDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 2
|
||||
{"CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE", {"HIPDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 3
|
||||
{"CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION", {"HIPDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 4
|
||||
{"CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK", {"HIPDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 5
|
||||
{"CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM", {"HIPDNN_FUSED_DACTIVATION_FORK_DBATCHNORM", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 6
|
||||
{"cudnnFusedOpsConstParamLabel_t", {"hipdnnFusedOpsConstParamLabel_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"CUDNN_PARAM_XDESC", {"HIPDNN_PARAM_XDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0
|
||||
{"CUDNN_PARAM_XDATA_PLACEHOLDER", {"HIPDNN_PARAM_XDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1
|
||||
{"CUDNN_PARAM_BN_MODE", {"HIPDNN_PARAM_BN_MODE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 2
|
||||
{"CUDNN_PARAM_BN_EQSCALEBIAS_DESC", {"HIPDNN_PARAM_BN_EQSCALEBIAS_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 3
|
||||
{"CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER", {"HIPDNN_PARAM_BN_EQSCALE_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 4
|
||||
{"CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER", {"HIPDNN_PARAM_BN_EQBIAS_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 5
|
||||
{"CUDNN_PARAM_ACTIVATION_DESC", {"HIPDNN_PARAM_ACTIVATION_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 6
|
||||
{"CUDNN_PARAM_CONV_DESC", {"HIPDNN_PARAM_CONV_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 7
|
||||
{"CUDNN_PARAM_WDESC", {"HIPDNN_PARAM_WDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 8
|
||||
{"CUDNN_PARAM_WDATA_PLACEHOLDER", {"HIPDNN_PARAM_WDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 9
|
||||
{"CUDNN_PARAM_DWDESC", {"HIPDNN_PARAM_DWDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 10
|
||||
{"CUDNN_PARAM_DWDATA_PLACEHOLDER", {"HIPDNN_PARAM_DWDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 11
|
||||
{"CUDNN_PARAM_YDESC", {"HIPDNN_PARAM_YDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 12
|
||||
{"CUDNN_PARAM_YDATA_PLACEHOLDER", {"HIPDNN_PARAM_YDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 13
|
||||
{"CUDNN_PARAM_DYDESC", {"HIPDNN_PARAM_DYDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 14
|
||||
{"CUDNN_PARAM_DYDATA_PLACEHOLDER", {"HIPDNN_PARAM_DYDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 15
|
||||
{"CUDNN_PARAM_YSTATS_DESC", {"HIPDNN_PARAM_YSTATS_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 16
|
||||
{"CUDNN_PARAM_YSUM_PLACEHOLDER", {"HIPDNN_PARAM_YSUM_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 17
|
||||
{"CUDNN_PARAM_YSQSUM_PLACEHOLDER", {"HIPDNN_PARAM_YSQSUM_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 18
|
||||
{"CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC", {"HIPDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 19
|
||||
{"CUDNN_PARAM_BN_SCALE_PLACEHOLDER", {"HIPDNN_PARAM_BN_SCALE_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 20
|
||||
{"CUDNN_PARAM_BN_BIAS_PLACEHOLDER", {"HIPDNN_PARAM_BN_BIAS_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 21
|
||||
{"CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER", {"HIPDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 22
|
||||
{"CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER", {"HIPDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 23
|
||||
{"CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER", {"HIPDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 24
|
||||
{"CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER", {"HIPDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 25
|
||||
{"CUDNN_PARAM_ZDESC", {"HIPDNN_PARAM_ZDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 26
|
||||
{"CUDNN_PARAM_ZDATA_PLACEHOLDER", {"HIPDNN_PARAM_ZDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 27
|
||||
{"CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC", {"HIPDNN_PARAM_BN_Z_EQSCALEBIAS_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 28
|
||||
{"CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER", {"HIPDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 29
|
||||
{"CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER", {"HIPDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 30
|
||||
{"CUDNN_PARAM_ACTIVATION_BITMASK_DESC", {"HIPDNN_PARAM_ACTIVATION_BITMASK_DESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 31
|
||||
{"CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER", {"HIPDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 32
|
||||
{"CUDNN_PARAM_DXDESC", {"HIPDNN_PARAM_DXDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 33
|
||||
{"CUDNN_PARAM_DXDATA_PLACEHOLDER", {"HIPDNN_PARAM_DXDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 34
|
||||
{"CUDNN_PARAM_DZDESC", {"HIPDNN_PARAM_DZDESC", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 35
|
||||
{"CUDNN_PARAM_DZDATA_PLACEHOLDER", {"HIPDNN_PARAM_DZDATA_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 36
|
||||
{"CUDNN_PARAM_BN_DSCALE_PLACEHOLDER", {"HIPDNN_PARAM_BN_DSCALE_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 37
|
||||
{"CUDNN_PARAM_BN_DBIAS_PLACEHOLDER", {"HIPDNN_PARAM_BN_DBIAS_PLACEHOLDER", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 38
|
||||
{"cudnnFusedOpsPointerPlaceHolder_t", {"hipdnnActivationMode_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"CUDNN_PTR_NULL", {"HIPDNN_ACTIVATION_SIGMOID", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0
|
||||
{"CUDNN_PTR_ELEM_ALIGNED", {"HIPDNN_ACTIVATION_RELU", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1
|
||||
{"CUDNN_PTR_16B_ALIGNED", {"HIPDNN_ACTIVATION_TANH", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 2
|
||||
{"cudnnFusedOpsVariantParamLabel_t", {"hipdnnFusedOpsVariantParamLabel_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"CUDNN_PTR_XDATA", {"HIPDNN_PTR_XDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0
|
||||
{"CUDNN_PTR_BN_EQSCALE", {"HIPDNN_PTR_BN_EQSCALE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1
|
||||
{"CUDNN_PTR_BN_EQBIAS", {"HIPDNN_PTR_BN_EQBIAS", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 2
|
||||
{"CUDNN_PTR_WDATA", {"HIPDNN_PTR_WDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 3
|
||||
{"CUDNN_PTR_DWDATA", {"HIPDNN_PTR_DWDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 4
|
||||
{"CUDNN_PTR_YDATA", {"HIPDNN_PTR_YDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 5
|
||||
{"CUDNN_PTR_DYDATA", {"HIPDNN_PTR_DYDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 6
|
||||
{"CUDNN_PTR_YSUM", {"HIPDNN_PTR_YSUM", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 7
|
||||
{"CUDNN_PTR_YSQSUM", {"HIPDNN_PTR_YSQSUM", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 8
|
||||
{"CUDNN_PTR_WORKSPACE", {"HIPDNN_PTR_WORKSPACE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 9
|
||||
{"CUDNN_PTR_BN_SCALE", {"HIPDNN_PTR_BN_SCALE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 10
|
||||
{"CUDNN_PTR_BN_BIAS", {"HIPDNN_PTR_BN_BIAS", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 11
|
||||
{"CUDNN_PTR_BN_SAVED_MEAN", {"HIPDNN_PTR_BN_SAVED_MEAN", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 12
|
||||
{"CUDNN_PTR_BN_SAVED_INVSTD", {"HIPDNN_PTR_BN_SAVED_INVSTD", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 13
|
||||
{"CUDNN_PTR_BN_RUNNING_MEAN", {"HIPDNN_PTR_BN_RUNNING_MEAN", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 14
|
||||
{"CUDNN_PTR_BN_RUNNING_VAR", {"HIPDNN_PTR_BN_RUNNING_VAR", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 15
|
||||
{"CUDNN_PTR_ZDATA", {"HIPDNN_PTR_ZDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 16
|
||||
{"CUDNN_PTR_BN_Z_EQSCALE", {"HIPDNN_PTR_BN_Z_EQSCALE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 17
|
||||
{"CUDNN_PTR_BN_Z_EQBIAS", {"HIPDNN_PTR_BN_Z_EQBIAS", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 18
|
||||
{"CUDNN_PTR_ACTIVATION_BITMASK", {"HIPDNN_PTR_ACTIVATION_BITMASK", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 19
|
||||
{"CUDNN_PTR_DXDATA", {"HIPDNN_PTR_DXDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 20
|
||||
{"CUDNN_PTR_DZDATA", {"HIPDNN_PTR_DZDATA", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 21
|
||||
{"CUDNN_PTR_BN_DSCALE", {"HIPDNN_PTR_BN_DSCALE", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 22
|
||||
{"CUDNN_PTR_BN_DBIAS", {"HIPDNN_PTR_BN_DBIAS", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 23
|
||||
{"CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES", {"HIPDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 100
|
||||
{"CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT", {"HIPDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 101
|
||||
{"CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR", {"HIPDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 102
|
||||
{"CUDNN_SCALAR_DOUBLE_BN_EPSILON", {"HIPDNN_SCALAR_DOUBLE_BN_EPSILON", "", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 103
|
||||
|
||||
// cuDNN types
|
||||
{"cudnnContext", {"hipdnnContext", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
|
||||
@@ -288,5 +375,10 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DNN_TYPE_NAME_MAP{
|
||||
{"cudnnSeqDataDescriptor_t", {"hipdnnSeqDataDescriptor_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnAttnStruct", {"hipdnnAttnStruct", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnAttnDescriptor_t", {"hipdnnAttnDescriptor_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
|
||||
|
||||
{"cudnnFusedOpsConstParamStruct", {"hipdnnFusedOpsConstParamStruct", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnFusedOpsConstParamPack_t", {"hipdnnFusedOpsConstParamPack_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnFusedOpsVariantParamStruct", {"hipdnnFusedOpsVariantParamStruct", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnFusedOpsVariantParamPack_t", {"hipdnnFusedOpsVariantParamPack_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnFusedOpsPlanStruct", {"hipdnnFusedOpsPlanStruct", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
|
||||
{"cudnnFusedOpsPlan_t", {"hipdnnFusedOpsPlan_t", "", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}},
|
||||
};
|
||||
|
||||
@@ -175,8 +175,8 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DRIVER_FUNCTION_MAP{
|
||||
{"cuMemcpy", {"hipMemcpy_", "", CONV_MEMORY, API_DRIVER, HIP_UNSUPPORTED}},
|
||||
// no analogue
|
||||
// NOTE: Not equal to cudaMemcpy2D due to different signatures
|
||||
{"cuMemcpy2D", {"hipMemcpy2D_", "", CONV_MEMORY, API_DRIVER, HIP_UNSUPPORTED}},
|
||||
{"cuMemcpy2D_v2", {"hipMemcpy2D_", "", CONV_MEMORY, API_DRIVER, HIP_UNSUPPORTED}},
|
||||
{"cuMemcpy2D", {"hipMemcpyParam2D", "", CONV_MEMORY, API_DRIVER}},
|
||||
{"cuMemcpy2D_v2", {"hipMemcpyParam2D", "", CONV_MEMORY, API_DRIVER}},
|
||||
// no analogue
|
||||
// NOTE: Not equal to cudaMemcpy2DAsync due to different signatures
|
||||
{"cuMemcpy2DAsync", {"hipMemcpy2DAsync_", "", CONV_MEMORY, API_DRIVER, HIP_UNSUPPORTED}},
|
||||
|
||||
@@ -189,6 +189,9 @@ bool HipifyAction::Exclude(const hipCounter & hipToken) {
|
||||
}
|
||||
return false;
|
||||
case CONV_INCLUDE:
|
||||
if (hipToken.hipName.empty()) {
|
||||
return true;
|
||||
}
|
||||
switch (hipToken.apiType) {
|
||||
case API_RAND:
|
||||
if (hipToken.hipName == "hiprand_kernel.h") {
|
||||
|
||||
@@ -59,7 +59,11 @@ void EnterPreprocessorTokenStream(clang::Preprocessor& _pp, const clang::Token *
|
||||
#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR == 8)
|
||||
_pp.EnterTokenStream(start, len, false, DisableMacroExpansion);
|
||||
#else
|
||||
_pp.EnterTokenStream(clang::ArrayRef<clang::Token>{start, len}, DisableMacroExpansion);
|
||||
#if (LLVM_VERSION_MAJOR < 9)
|
||||
_pp.EnterTokenStream(clang::ArrayRef<clang::Token>{start, len}, DisableMacroExpansion);
|
||||
#else
|
||||
_pp.EnterTokenStream(clang::ArrayRef<clang::Token>{start, len}, DisableMacroExpansion, false);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -1009,14 +1009,20 @@ void __syncthreads()
|
||||
|
||||
#define GETREG_IMMED(SZ,OFF,REG) (SZ << 11) | (OFF << 6) | REG
|
||||
|
||||
/*
|
||||
__smid returns the wave's assigned Compute Unit and Shader Engine.
|
||||
The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
|
||||
Note: the results vary over time.
|
||||
SZ minus 1 since SIZE is 1-based.
|
||||
*/
|
||||
__device__
|
||||
inline
|
||||
unsigned __smid(void)
|
||||
{
|
||||
unsigned cu_id = __builtin_amdgcn_s_getreg(
|
||||
GETREG_IMMED(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID));
|
||||
GETREG_IMMED(HW_ID_CU_ID_SIZE-1, HW_ID_CU_ID_OFFSET, HW_ID));
|
||||
unsigned se_id = __builtin_amdgcn_s_getreg(
|
||||
GETREG_IMMED(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID));
|
||||
GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
|
||||
|
||||
/* Each shader engine has 16 CU */
|
||||
return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
|
||||
|
||||
@@ -80,22 +80,22 @@ typedef struct hipArray {
|
||||
}hipArray;
|
||||
|
||||
typedef struct hip_Memcpy2D {
|
||||
size_t height;
|
||||
size_t widthInBytes;
|
||||
hipArray* dstArray;
|
||||
hipDeviceptr_t dstDevice;
|
||||
void* dstHost;
|
||||
hipMemoryType dstMemoryType;
|
||||
size_t dstPitch;
|
||||
size_t dstXInBytes;
|
||||
size_t dstY;
|
||||
hipArray* srcArray;
|
||||
hipDeviceptr_t srcDevice;
|
||||
const void* srcHost;
|
||||
hipMemoryType srcMemoryType;
|
||||
size_t srcPitch;
|
||||
size_t srcXInBytes;
|
||||
size_t srcY;
|
||||
hipMemoryType srcMemoryType;
|
||||
const void* srcHost;
|
||||
hipDeviceptr_t srcDevice;
|
||||
hipArray* srcArray;
|
||||
size_t srcPitch;
|
||||
size_t dstXInBytes;
|
||||
size_t dstY;
|
||||
hipMemoryType dstMemoryType;
|
||||
void* dstHost;
|
||||
hipDeviceptr_t dstDevice;
|
||||
hipArray* dstArray;
|
||||
size_t dstPitch;
|
||||
size_t WidthInBytes;
|
||||
size_t Height;
|
||||
} hip_Memcpy2D;
|
||||
|
||||
|
||||
|
||||
@@ -120,51 +120,102 @@ THE SOFTWARE.
|
||||
ret.y = lhs.y * rhs; \
|
||||
return ret; \
|
||||
}
|
||||
#define MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ComplexT, T) \
|
||||
explicit __device__ __host__ ComplexT(T val) : x(val), y(val) {} \
|
||||
__device__ __host__ ComplexT(T val1, T val2) : x(val1), y(val2) {}
|
||||
|
||||
#endif
|
||||
|
||||
struct hipFloatComplex {
|
||||
#ifdef __cplusplus
|
||||
public:
|
||||
typedef float value_type;
|
||||
__device__ __host__ hipFloatComplex() : x(0.0f), y(0.0f) {}
|
||||
explicit __device__ __host__ hipFloatComplex(float x) : x(x), y(0.0f) {}
|
||||
__device__ __host__ hipFloatComplex(float x, float y) : x(x), y(y) {}
|
||||
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, unsigned short)
|
||||
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, signed short)
|
||||
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, unsigned int)
|
||||
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, signed int)
|
||||
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, double)
|
||||
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, unsigned long)
|
||||
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, signed long)
|
||||
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, unsigned long long)
|
||||
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, signed long long)
|
||||
#endif
|
||||
float x, y;
|
||||
} __attribute__((aligned(8)));
|
||||
typedef float2 hipFloatComplex;
|
||||
|
||||
__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return z.x; }
|
||||
|
||||
__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return z.y; }
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
|
||||
hipFloatComplex z;
|
||||
z.x = a;
|
||||
z.y = b;
|
||||
return z;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
|
||||
hipFloatComplex ret;
|
||||
ret.x = z.x;
|
||||
ret.y = -z.y;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) {
|
||||
return z.x * z.x + z.y * z.y;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
|
||||
return make_hipFloatComplex(p.x + q.x, p.y + q.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
|
||||
return make_hipFloatComplex(p.x - q.x, p.y - q.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
|
||||
return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
|
||||
float sqabs = hipCsqabsf(q);
|
||||
hipFloatComplex ret;
|
||||
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
|
||||
ret.y = (p.y * q.x - p.x * q.y) / sqabs;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }
|
||||
|
||||
|
||||
typedef double2 hipDoubleComplex;
|
||||
|
||||
__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return z.x; }
|
||||
|
||||
__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return z.y; }
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
|
||||
hipDoubleComplex z;
|
||||
z.x = a;
|
||||
z.y = b;
|
||||
return z;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
|
||||
hipDoubleComplex ret;
|
||||
ret.x = z.x;
|
||||
ret.y = z.y;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) {
|
||||
return z.x * z.x + z.y * z.y;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
double sqabs = hipCsqabs(q);
|
||||
hipDoubleComplex ret;
|
||||
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
|
||||
ret.y = (p.y * q.x - p.x * q.y) / sqabs;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return sqrtf(hipCsqabs(z)); }
|
||||
|
||||
struct hipDoubleComplex {
|
||||
#ifdef __cplusplus
|
||||
public:
|
||||
typedef double value_type;
|
||||
__device__ __host__ hipDoubleComplex() : x(0.0f), y(0.0f) {}
|
||||
explicit __device__ __host__ hipDoubleComplex(double x) : x(x), y(0.0f) {}
|
||||
__device__ __host__ hipDoubleComplex(double x, double y) : x(x), y(y) {}
|
||||
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned short)
|
||||
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed short)
|
||||
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned int)
|
||||
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed int)
|
||||
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, float)
|
||||
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned long)
|
||||
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed long)
|
||||
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned long long)
|
||||
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed long long)
|
||||
#endif
|
||||
double x, y;
|
||||
} __attribute__((aligned(16)));
|
||||
|
||||
#if __cplusplus
|
||||
|
||||
@@ -214,93 +265,6 @@ COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long)
|
||||
|
||||
#endif
|
||||
|
||||
__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return z.x; }
|
||||
|
||||
__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return z.y; }
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
|
||||
hipFloatComplex z;
|
||||
z.x = a;
|
||||
z.y = b;
|
||||
return z;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
|
||||
hipFloatComplex ret;
|
||||
ret.x = z.x;
|
||||
ret.y = -z.y;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) {
|
||||
return z.x * z.x + z.y * z.y;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
|
||||
return make_hipFloatComplex(p.x + q.x, p.y + q.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
|
||||
return make_hipFloatComplex(p.x - q.x, p.y - q.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
|
||||
return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
|
||||
float sqabs = hipCsqabsf(q);
|
||||
hipFloatComplex ret;
|
||||
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
|
||||
ret.y = (p.y * q.x - p.x * q.y) / sqabs;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }
|
||||
|
||||
__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return z.x; }
|
||||
|
||||
__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return z.y; }
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
|
||||
hipDoubleComplex z;
|
||||
z.x = a;
|
||||
z.y = b;
|
||||
return z;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
|
||||
hipDoubleComplex ret;
|
||||
ret.x = z.x;
|
||||
ret.y = z.y;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) {
|
||||
return z.x * z.x + z.y * z.y;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
double sqabs = hipCsqabs(q);
|
||||
hipDoubleComplex ret;
|
||||
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
|
||||
ret.y = (p.y * q.x - p.x * q.y) / sqabs;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return sqrtf(hipCsqabs(z)); }
|
||||
|
||||
typedef hipFloatComplex hipComplex;
|
||||
|
||||
|
||||
@@ -34,7 +34,7 @@ THE SOFTWARE.
|
||||
#include <utility>
|
||||
#endif
|
||||
|
||||
#if defined(__clang__) && (__clang_major__ > 5)
|
||||
#if __HCC_OR_HIP_CLANG__
|
||||
typedef _Float16 _Float16_2 __attribute__((ext_vector_type(2)));
|
||||
|
||||
struct __half_raw {
|
||||
|
||||
@@ -275,6 +275,15 @@ typedef struct dim3 {
|
||||
#endif
|
||||
} dim3;
|
||||
|
||||
typedef struct hipLaunchParams_t {
|
||||
void* func; ///< Device function symbol
|
||||
dim3 gridDim; ///< Grid dimentions
|
||||
dim3 blockDim; ///< Block dimentions
|
||||
void **args; ///< Arguments
|
||||
size_t sharedMem; ///< Shared memory
|
||||
hipStream_t stream; ///< Stream identifier
|
||||
} hipLaunchParams;
|
||||
|
||||
|
||||
// Doxygen end group GlobalDefs
|
||||
/** @} */
|
||||
@@ -1858,6 +1867,16 @@ hipError_t hipMalloc3DArray(hipArray** array, const struct hipChannelFormatDesc*
|
||||
*/
|
||||
hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
|
||||
size_t height, hipMemcpyKind kind);
|
||||
|
||||
/**
|
||||
* @brief Copies memory for 2D arrays.
|
||||
* @param[in] pCopy Parameters for the memory copy
|
||||
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
|
||||
* #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
|
||||
*
|
||||
* @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray,
|
||||
* hipMemcpyToSymbol, hipMemcpyAsync
|
||||
*/
|
||||
hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy);
|
||||
|
||||
/**
|
||||
@@ -2832,6 +2851,77 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigne
|
||||
unsigned int sharedMemBytes, hipStream_t stream,
|
||||
void** kernelParams, void** extra);
|
||||
|
||||
/**
|
||||
* @brief launches kernel f with launch parameters and shared memory on stream with arguments passed
|
||||
* to kernelparams or extra, where thread blocks can cooperate and synchronize as they execute
|
||||
*
|
||||
* @param [in] f Kernel to launch.
|
||||
* @param [in] gridDim Grid dimensions specified as multiple of blockDim.
|
||||
* @param [in] blockDim Block dimensions specified in work-items
|
||||
* @param [in] kernelParams A list of kernel arguments
|
||||
* @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The
|
||||
* kernel can access this with HIP_DYNAMIC_SHARED.
|
||||
* @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th
|
||||
* default stream is used with associated synchronization rules.
|
||||
*
|
||||
* @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
|
||||
*/
|
||||
hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX,
|
||||
void** kernelParams, unsigned int sharedMemBytes,
|
||||
hipStream_t stream);
|
||||
|
||||
/**
|
||||
* @brief Launches kernels on multiple devices where thread blocks can cooperate and
|
||||
* synchronize as they execute.
|
||||
*
|
||||
* @param [in] hipLaunchParams List of launch parameters, one per device.
|
||||
* @param [in] numDevices Size of the launchParamsList array.
|
||||
* @param [in] flags Flags to control launch behavior.
|
||||
*
|
||||
* @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
|
||||
*/
|
||||
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
int numDevices, unsigned int flags);
|
||||
|
||||
/**
|
||||
* @brief Returns occupancy for a device function.
|
||||
*
|
||||
* @param [out] numBlocks Returned occupancy
|
||||
* @param [in] func Kernel function for which occupancy is calulated
|
||||
* @param [in] blockSize Block size the kernel is intended to be launched with
|
||||
* @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes
|
||||
*/
|
||||
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize);
|
||||
|
||||
/**
|
||||
* @brief Returns occupancy for a device function.
|
||||
*
|
||||
* @param [out] numBlocks Returned occupancy
|
||||
* @param [in] func Kernel function for which occupancy is calulated
|
||||
* @param [in] blockSize Block size the kernel is intended to be launched with
|
||||
* @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes
|
||||
* @param [in] flags Extra flags for occupancy calculation (currently ignored)
|
||||
*/
|
||||
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
||||
int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize, unsigned int flags);
|
||||
|
||||
/**
|
||||
* @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched
|
||||
* on respective streams before enqueuing any other work on the specified streams from any other threads
|
||||
*
|
||||
*
|
||||
* @param [in] hipLaunchParams List of launch parameters, one per device.
|
||||
* @param [in] numDevices Size of the launchParamsList array.
|
||||
* @param [in] flags Flags to control launch behavior.
|
||||
*
|
||||
* @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
|
||||
*/
|
||||
hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
int numDevices, unsigned int flags);
|
||||
|
||||
|
||||
|
||||
// doxygen end Version Management
|
||||
/**
|
||||
* @}
|
||||
@@ -3160,6 +3250,40 @@ hipError_t hipBindTextureToMipmappedArray(const texture<T, dim, readMode>& tex,
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
int* numBlocks, T f, int blockSize, size_t dynamicSMemSize) {
|
||||
return hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
numBlocks, reinterpret_cast<const void*>(f), blockSize, dynamicSMemSize);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
||||
int* numBlocks, T f, int blockSize, size_t dynamicSMemSize, unsigned int flags) {
|
||||
return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
||||
numBlocks, reinterpret_cast<const void*>(f), blockSize, dynamicSMemSize, flags);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
|
||||
void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) {
|
||||
return hipLaunchCooperativeKernel(reinterpret_cast<const void*>(f), gridDim,
|
||||
blockDim, kernelParams, sharedMemBytes, stream);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
unsigned int numDevices, unsigned int flags = 0) {
|
||||
return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
unsigned int numDevices, unsigned int flags = 0) {
|
||||
return hipExtLaunchMultiKernelMultiDevice(launchParamsList, numDevices, flags);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* @brief Unbinds the textuer bound to @p tex
|
||||
*
|
||||
|
||||
@@ -45,10 +45,15 @@ THE SOFTWARE.
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
#if __HIP__
|
||||
#define __HIP_TEXTURE_ATTRIB __attribute__((device_builtin_texture_type))
|
||||
#else
|
||||
#define __HIP_TEXTURE_ATTRIB
|
||||
#endif
|
||||
|
||||
template <class T, int texType = hipTextureType1D,
|
||||
enum hipTextureReadMode mode = hipReadModeElementType>
|
||||
struct texture : public textureReference {
|
||||
struct __HIP_TEXTURE_ATTRIB texture : public textureReference {
|
||||
texture(int norm = 0, enum hipTextureFilterMode fMode = hipFilterModePoint,
|
||||
enum hipTextureAddressMode aMode = hipAddressModeClamp) {
|
||||
normalized = norm;
|
||||
|
||||
@@ -115,6 +115,8 @@ typedef struct hipDeviceProp_t {
|
||||
int canMapHostMemory; ///< Check whether HIP can map host memory
|
||||
int gcnArch; ///< AMD GCN Arch Value. Eg: 803, 701
|
||||
int integrated; ///< APU vs dGPU
|
||||
int cooperativeLaunch; ///< HIP device supports cooperative launch
|
||||
int cooperativeMultiDeviceLaunch; ///< HIP device supports cooperative launch on multiple devices
|
||||
} hipDeviceProp_t;
|
||||
|
||||
|
||||
@@ -291,6 +293,8 @@ typedef enum hipDeviceAttribute_t {
|
||||
///< Multiprocessor.
|
||||
hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices.
|
||||
hipDeviceAttributeIntegrated, ///< iGPU
|
||||
hipDeviceAttributeCooperativeLaunch, ///< Support cooperative launch
|
||||
hipDeviceAttributeCooperativeMultiDeviceLaunch, ///< Support cooperative launch on multiple devices
|
||||
} hipDeviceAttribute_t;
|
||||
|
||||
enum hipComputeMode {
|
||||
|
||||
@@ -162,6 +162,7 @@ typedef CUdeviceptr hipDeviceptr_t;
|
||||
typedef struct cudaArray hipArray;
|
||||
typedef struct cudaArray* hipArray_const_t;
|
||||
typedef cudaFuncAttributes hipFuncAttributes;
|
||||
#define hip_Memcpy2D CUDA_MEMCPY2D
|
||||
#define hipMemcpy3DParms cudaMemcpy3DParms
|
||||
#define hipArrayDefault cudaArrayDefault
|
||||
#define hipArrayLayered cudaArrayLayered
|
||||
@@ -578,6 +579,10 @@ inline static hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src,
|
||||
cudaMemcpy2D(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind)));
|
||||
}
|
||||
|
||||
inline static hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) {
|
||||
return hipCUResultTohipError(cuMemcpy2D(pCopy));
|
||||
}
|
||||
|
||||
inline static hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p)
|
||||
{
|
||||
return hipCUDAErrorTohipError(cudaMemcpy3D(p));
|
||||
|
||||
@@ -3,7 +3,9 @@ project(hip_hcc)
|
||||
|
||||
install(FILES @PROJECT_BINARY_DIR@/libhip_hcc.so DESTINATION lib)
|
||||
install(FILES @PROJECT_BINARY_DIR@/libhip_hcc_static.a DESTINATION lib)
|
||||
install(FILES @PROJECT_BINARY_DIR@/libhiprtc.so DESTINATION lib)
|
||||
if(NOT @HIP_COMPILER@ STREQUAL "clang")
|
||||
install(FILES @PROJECT_BINARY_DIR@/libhiprtc.so DESTINATION lib)
|
||||
endif()
|
||||
install(FILES @PROJECT_BINARY_DIR@/.hipInfo DESTINATION lib)
|
||||
install(FILES @PROJECT_BINARY_DIR@/hip-config.cmake @PROJECT_BINARY_DIR@/hip-config-version.cmake DESTINATION lib/cmake/hip)
|
||||
install(FILES @hip_SOURCE_DIR@/packaging/hip-targets.cmake @hip_SOURCE_DIR@/packaging/hip-targets-release.cmake DESTINATION lib/cmake/hip)
|
||||
|
||||
@@ -71,8 +71,8 @@ bool runTest(int argc, char** argv) {
|
||||
copyParam.srcMemoryType = hipMemoryTypeHost;
|
||||
copyParam.srcHost = hData;
|
||||
copyParam.srcPitch = width * sizeof(float);
|
||||
copyParam.widthInBytes = copyParam.srcPitch;
|
||||
copyParam.height = height;
|
||||
copyParam.WidthInBytes = copyParam.srcPitch;
|
||||
copyParam.Height = height;
|
||||
hipMemcpyParam2D(©Param);
|
||||
|
||||
textureReference* texref;
|
||||
|
||||
@@ -1715,8 +1715,8 @@ hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) {
|
||||
if (pCopy == nullptr) {
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
e = ihipMemcpy2D(pCopy->dstArray->data, pCopy->widthInBytes, pCopy->srcHost, pCopy->srcPitch,
|
||||
pCopy->widthInBytes, pCopy->height, hipMemcpyDefault);
|
||||
e = ihipMemcpy2D(pCopy->dstArray->data, pCopy->WidthInBytes, pCopy->srcHost, pCopy->srcPitch,
|
||||
pCopy->WidthInBytes, pCopy->Height, hipMemcpyDefault);
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
|
||||
@@ -59,6 +59,18 @@ using namespace std;
|
||||
static const size_t HIP_IMPLICIT_KERNARG_SIZE = 48;
|
||||
static const size_t HIP_IMPLICIT_KERNARG_ALIGNMENT = 8;
|
||||
|
||||
struct amd_kernel_code_v3_t {
|
||||
uint32_t group_segment_fixed_size;
|
||||
uint32_t private_segment_fixed_size;
|
||||
uint8_t reserved0[8];
|
||||
int64_t kernel_code_entry_byte_offset;
|
||||
uint8_t reserved1[24];
|
||||
uint32_t compute_pgm_rsrc1;
|
||||
uint32_t compute_pgm_rsrc2;
|
||||
uint16_t kernel_code_properties;
|
||||
uint8_t reserved2[6];
|
||||
};
|
||||
|
||||
// calculate MD5 checksum
|
||||
inline std::string checksum(size_t size, const char *source) {
|
||||
// FNV-1a hashing, 64-bit version
|
||||
@@ -191,7 +203,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
lp.dynamic_group_mem_bytes =
|
||||
sharedMemBytes; // TODO - this should be part of preLaunchKernel.
|
||||
hStream = ihipPreLaunchKernel(
|
||||
hStream, dim3(globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ),
|
||||
hStream, dim3(globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ),
|
||||
dim3(localWorkSizeX, localWorkSizeY, localWorkSizeZ), &lp, f->_name.c_str());
|
||||
|
||||
|
||||
@@ -208,10 +220,20 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
aql.grid_size_x = globalWorkSizeX;
|
||||
aql.grid_size_y = globalWorkSizeY;
|
||||
aql.grid_size_z = globalWorkSizeZ;
|
||||
aql.group_segment_size =
|
||||
f->_header->workgroup_group_segment_byte_size + sharedMemBytes;
|
||||
aql.private_segment_size =
|
||||
f->_header->workitem_private_segment_byte_size;
|
||||
bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos;
|
||||
if (is_code_object_v3) {
|
||||
const auto* header =
|
||||
reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
|
||||
aql.group_segment_size =
|
||||
header->group_segment_fixed_size + sharedMemBytes;
|
||||
aql.private_segment_size =
|
||||
header->private_segment_fixed_size;
|
||||
} else {
|
||||
aql.group_segment_size =
|
||||
f->_header->workgroup_group_segment_byte_size + sharedMemBytes;
|
||||
aql.private_segment_size =
|
||||
f->_header->workitem_private_segment_byte_size;
|
||||
}
|
||||
aql.kernel_object = f->_object;
|
||||
aql.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
|
||||
aql.header =
|
||||
@@ -464,6 +486,12 @@ hipError_t ihipModuleGetFunction(hipFunction_t* func, hipModule_t hmod, const ch
|
||||
|
||||
auto kernel = find_kernel_by_name(hmod->executable, name, agent);
|
||||
|
||||
if (kernel.handle == 0u) {
|
||||
std::string name_str(name);
|
||||
name_str.append(".kd");
|
||||
kernel = find_kernel_by_name(hmod->executable, name_str.c_str(), agent);
|
||||
}
|
||||
|
||||
if (kernel.handle == 0u) return hipErrorNotFound;
|
||||
|
||||
// TODO: refactor the whole ihipThisThat, which is a mess and yields the
|
||||
@@ -488,7 +516,11 @@ hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod,
|
||||
}
|
||||
|
||||
namespace {
|
||||
hipFuncAttributes make_function_attributes(const amd_kernel_code_t& header) {
|
||||
const amd_kernel_code_v3_t *header_v3(const ihipModuleSymbol_t& kd) {
|
||||
return reinterpret_cast<const amd_kernel_code_v3_t*>(kd._header);
|
||||
}
|
||||
|
||||
hipFuncAttributes make_function_attributes(const ihipModuleSymbol_t& kd) {
|
||||
hipFuncAttributes r{};
|
||||
|
||||
hipDeviceProp_t prop{};
|
||||
@@ -497,16 +529,31 @@ hipFuncAttributes make_function_attributes(const amd_kernel_code_t& header) {
|
||||
// available per CU, therefore we hardcode it to 64 KiRegisters.
|
||||
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
|
||||
|
||||
r.localSizeBytes = header.workitem_private_segment_byte_size;
|
||||
r.sharedSizeBytes = header.workgroup_group_segment_byte_size;
|
||||
bool is_code_object_v3 = kd._name.find(".kd") != std::string::npos;
|
||||
if (is_code_object_v3) {
|
||||
r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
|
||||
r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
|
||||
} else {
|
||||
r.localSizeBytes = kd._header->workitem_private_segment_byte_size;
|
||||
r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size;
|
||||
}
|
||||
r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;
|
||||
r.numRegs = header.workitem_vgpr_count;
|
||||
if (is_code_object_v3) {
|
||||
r.numRegs = ((header_v3(kd)->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
|
||||
} else {
|
||||
r.numRegs = kd._header->workitem_vgpr_count;
|
||||
}
|
||||
r.maxThreadsPerBlock = r.numRegs ?
|
||||
std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
|
||||
prop.maxThreadsPerBlock;
|
||||
r.binaryVersion =
|
||||
header.amd_machine_version_major * 10 +
|
||||
header.amd_machine_version_minor;
|
||||
if (is_code_object_v3) {
|
||||
r.binaryVersion = 0; // FIXME: should it be the ISA version or code
|
||||
// object format version?
|
||||
} else {
|
||||
r.binaryVersion =
|
||||
kd._header->amd_machine_version_major * 10 +
|
||||
kd._header->amd_machine_version_minor;
|
||||
}
|
||||
r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0.
|
||||
|
||||
return r;
|
||||
@@ -522,11 +569,10 @@ hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func)
|
||||
|
||||
auto agent = this_agent();
|
||||
auto kd = get_program_state().kernel_descriptor(reinterpret_cast<uintptr_t>(func), agent);
|
||||
const auto header = kd->_header;
|
||||
|
||||
if (!header) throw runtime_error{"Ill-formed Kernel_descriptor."};
|
||||
if (!kd->_header) throw runtime_error{"Ill-formed Kernel_descriptor."};
|
||||
|
||||
*attr = make_function_attributes(*header);
|
||||
*attr = make_function_attributes(*kd);
|
||||
|
||||
return hipSuccess;
|
||||
}
|
||||
@@ -557,11 +603,9 @@ hipError_t ihipModuleLoadData(hipModule_t* module, const void* image) {
|
||||
(*module)->executable = get_program_state().load_executable(
|
||||
content.data(), content.size(), (*module)->executable,
|
||||
this_agent());
|
||||
istringstream elf{content};
|
||||
ELFIO::elfio reader;
|
||||
if (reader.load(elf)) {
|
||||
program_state_impl::read_kernarg_metadata(reader, (*module)->kernargs);
|
||||
}
|
||||
|
||||
std::vector<char> blob(content.cbegin(), content.cend());
|
||||
program_state_impl::read_kernarg_metadata(blob, (*module)->kernargs);
|
||||
|
||||
// compute the hash of the code object
|
||||
(*module)->hash = checksum(content.length(), content.data());
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
#include <hsa/hsa_ven_amd_loader.h>
|
||||
#include <amd_comgr.h>
|
||||
|
||||
#include <link.h>
|
||||
|
||||
@@ -540,9 +541,13 @@ public:
|
||||
|
||||
std::call_once(functions[agent].first, [this](hsa_agent_t aa) {
|
||||
for (auto&& function : get_function_names()) {
|
||||
const auto it = get_kernels(aa).find(function.second);
|
||||
auto it = get_kernels(aa).find(function.second);
|
||||
|
||||
if (it == get_kernels(aa).cend()) continue;
|
||||
if (it == get_kernels(aa).cend()) {
|
||||
it = get_kernels(aa).find(function.second + ".kd");
|
||||
if (it == get_kernels(aa).cend())
|
||||
continue;
|
||||
}
|
||||
|
||||
for (auto&& kernel_symbol : it->second) {
|
||||
functions[aa].second.emplace(
|
||||
@@ -556,92 +561,172 @@ public:
|
||||
}
|
||||
|
||||
static
|
||||
std::size_t parse_args(
|
||||
const std::string& metadata,
|
||||
std::size_t f,
|
||||
std::size_t l,
|
||||
std::string metadata_to_string(const amd_comgr_metadata_node_t& md) {
|
||||
std::string str;
|
||||
size_t size;
|
||||
|
||||
if (amd_comgr_get_metadata_string(md, &size, NULL)
|
||||
== AMD_COMGR_STATUS_SUCCESS) {
|
||||
str.resize(size - 1);
|
||||
amd_comgr_get_metadata_string(md, &size, &str[0]);
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
static
|
||||
void parse_args(
|
||||
const amd_comgr_metadata_node_t& args_md,
|
||||
bool is_code_object_v3,
|
||||
std::vector<std::pair<std::size_t, std::size_t>>& size_align) {
|
||||
if (f == l) return f;
|
||||
if (!size_align.empty()) return l;
|
||||
size_t arg_count = 0;
|
||||
if (amd_comgr_get_metadata_list_size(args_md, &arg_count)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
do {
|
||||
static constexpr size_t size_sz{5};
|
||||
f = metadata.find("Size:", f) + size_sz;
|
||||
for (size_t i = 0; i < arg_count; ++i) {
|
||||
amd_comgr_metadata_node_t arg_md;
|
||||
|
||||
if (l <= f) return f;
|
||||
if (amd_comgr_index_list_metadata(args_md, i, &arg_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
auto size = std::strtoul(&metadata[f], nullptr, 10);
|
||||
amd_comgr_metadata_node_t arg_size_md;
|
||||
if (amd_comgr_metadata_lookup(arg_md,
|
||||
is_code_object_v3 ? ".size" : "Size",
|
||||
&arg_size_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
static constexpr size_t align_sz{6};
|
||||
f = metadata.find("Align:", f) + align_sz;
|
||||
size_t arg_size = std::stoul(metadata_to_string(arg_size_md));
|
||||
|
||||
char* l{};
|
||||
auto align = std::strtoul(&metadata[f], &l, 10);
|
||||
if (amd_comgr_destroy_metadata(arg_size_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
f += (l - &metadata[f]) + 1;
|
||||
size_t arg_align;
|
||||
|
||||
size_align.emplace_back(size, align);
|
||||
} while (true);
|
||||
if (is_code_object_v3) {
|
||||
amd_comgr_metadata_node_t arg_offset_md;
|
||||
if (amd_comgr_metadata_lookup(arg_md, ".offset", &arg_offset_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
size_t arg_offset
|
||||
= std::stoul(metadata_to_string(arg_offset_md));
|
||||
|
||||
if (amd_comgr_destroy_metadata(arg_offset_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
arg_align = 1;
|
||||
while (arg_offset && (arg_offset & 1) == 0) {
|
||||
arg_offset >>= 1;
|
||||
arg_align <<= 1;
|
||||
}
|
||||
} else {
|
||||
amd_comgr_metadata_node_t arg_align_md;
|
||||
if (amd_comgr_metadata_lookup(arg_md, "Align", &arg_align_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
arg_align = std::stoul(metadata_to_string(arg_align_md));
|
||||
|
||||
if (amd_comgr_destroy_metadata(arg_align_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
}
|
||||
|
||||
size_align.emplace_back(arg_size, arg_align);
|
||||
|
||||
if (amd_comgr_destroy_metadata(arg_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void read_kernarg_metadata(
|
||||
ELFIO::elfio& reader,
|
||||
const std::vector<char>& blob,
|
||||
std::unordered_map<
|
||||
std::string,
|
||||
std::vector<std::pair<std::size_t, std::size_t>>>& kernargs) {
|
||||
// TODO: this is inefficient.
|
||||
auto it = find_section_if(reader, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_NOTE;
|
||||
});
|
||||
amd_comgr_data_t dataIn;
|
||||
amd_comgr_status_t status;
|
||||
|
||||
if (!it) return;
|
||||
if (amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, &dataIn)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
const ELFIO::note_section_accessor acc{reader, it};
|
||||
for (decltype(acc.get_notes_num()) i = 0; i != acc.get_notes_num(); ++i) {
|
||||
ELFIO::Elf_Word type{};
|
||||
std::string name{};
|
||||
void* desc{};
|
||||
ELFIO::Elf_Word desc_size{};
|
||||
if (amd_comgr_set_data(dataIn, blob.size(), blob.data())
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
acc.get_note(i, type, name, desc, desc_size);
|
||||
amd_comgr_metadata_node_t metadata;
|
||||
if (amd_comgr_get_data_metadata(dataIn, &metadata)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA.
|
||||
|
||||
std::string tmp{
|
||||
static_cast<char*>(desc), static_cast<char*>(desc) + desc_size};
|
||||
|
||||
auto dx = tmp.find("Kernels:");
|
||||
|
||||
if (dx == std::string::npos) continue;
|
||||
|
||||
static constexpr decltype(tmp.size()) kernels_sz{8};
|
||||
dx += kernels_sz;
|
||||
|
||||
do {
|
||||
dx = tmp.find("Name:", dx);
|
||||
|
||||
if (dx == std::string::npos) break;
|
||||
|
||||
static constexpr decltype(tmp.size()) name_sz{5};
|
||||
dx = tmp.find_first_not_of(" '", dx + name_sz);
|
||||
|
||||
auto fn = tmp.substr(dx, tmp.find_first_of("'\n", dx) - dx);
|
||||
dx += fn.size();
|
||||
|
||||
auto dx1 = tmp.find("CodeProps", dx);
|
||||
dx = tmp.find("Args:", dx);
|
||||
|
||||
if (dx1 < dx) {
|
||||
dx = dx1;
|
||||
continue;
|
||||
}
|
||||
if (dx == std::string::npos) break;
|
||||
|
||||
static constexpr decltype(tmp.size()) args_sz{5};
|
||||
dx = parse_args(tmp, dx + args_sz, dx1, kernargs[fn]);
|
||||
} while (true);
|
||||
bool is_code_object_v3 = false;
|
||||
amd_comgr_metadata_node_t kernels_md;
|
||||
if (amd_comgr_metadata_lookup(metadata, "Kernels", &kernels_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS) {
|
||||
if (amd_comgr_metadata_lookup(metadata,
|
||||
"amdhsa.kernels",
|
||||
&kernels_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
is_code_object_v3 = true;
|
||||
}
|
||||
|
||||
size_t kernel_count = 0;
|
||||
if (amd_comgr_get_metadata_list_size(kernels_md, &kernel_count)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
for (size_t i = 0; i < kernel_count; i++) {
|
||||
amd_comgr_metadata_node_t kernel_md;
|
||||
|
||||
if (amd_comgr_index_list_metadata(kernels_md, i, &kernel_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
continue;
|
||||
|
||||
amd_comgr_metadata_node_t name_md;
|
||||
if (amd_comgr_metadata_lookup(kernel_md,
|
||||
is_code_object_v3 ? ".name" : "Name",
|
||||
&name_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
continue;
|
||||
|
||||
std::string kernel_name_str = metadata_to_string(name_md);
|
||||
|
||||
if (amd_comgr_destroy_metadata(name_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
continue;
|
||||
|
||||
if (is_code_object_v3)
|
||||
kernel_name_str.append(".kd");
|
||||
|
||||
|
||||
amd_comgr_metadata_node_t args_md;
|
||||
if (amd_comgr_metadata_lookup(kernel_md,
|
||||
is_code_object_v3 ? ".args" : "Args",
|
||||
&args_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
continue;
|
||||
|
||||
parse_args(args_md, is_code_object_v3, kernargs[kernel_name_str]);
|
||||
|
||||
if (amd_comgr_destroy_metadata(args_md) != AMD_COMGR_STATUS_SUCCESS
|
||||
|| amd_comgr_destroy_metadata(kernel_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
continue;
|
||||
}
|
||||
|
||||
if (amd_comgr_destroy_metadata(kernels_md) != AMD_COMGR_STATUS_SUCCESS
|
||||
|| amd_comgr_destroy_metadata(metadata) != AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
amd_comgr_release_data(dataIn);
|
||||
}
|
||||
|
||||
const std::unordered_map<std::string,
|
||||
@@ -651,13 +736,7 @@ public:
|
||||
for (auto&& name_and_isa_blobs : get_code_object_blobs()) {
|
||||
for (auto&& isa_blobs : name_and_isa_blobs.second) {
|
||||
for (auto&& blob : isa_blobs.second) {
|
||||
std::stringstream tmp{std::string{blob.cbegin(), blob.cend()}};
|
||||
|
||||
ELFIO::elfio reader;
|
||||
|
||||
if (!reader.load(tmp)) continue;
|
||||
|
||||
read_kernarg_metadata(reader, kernargs.second);
|
||||
read_kernarg_metadata(blob, kernargs.second);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -711,8 +790,11 @@ public:
|
||||
|
||||
auto it1 = get_kernargs().find(it->second);
|
||||
if (it1 == get_kernargs().end()) {
|
||||
hip_throw(std::runtime_error{
|
||||
"Missing metadata for __global__ function: " + it->second});
|
||||
it1 = get_kernargs().find(it->second + ".kd");
|
||||
if (it1 == get_kernargs().end()) {
|
||||
hip_throw(std::runtime_error{
|
||||
"Missing metadata for __global__ function: " + it->second});
|
||||
}
|
||||
}
|
||||
|
||||
return it1->second;
|
||||
|
||||
+36
@@ -0,0 +1,36 @@
|
||||
// RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args
|
||||
/*
|
||||
Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
// CHECK: #include <hip/hip_runtime.h>
|
||||
#include <cuda.h>
|
||||
// CHECK-NOT: #include <texture_fetch_functions.h>
|
||||
#include <texture_fetch_functions.h>
|
||||
|
||||
// CHECK: extern texture<float, 2, hipReadModeElementType> tex;
|
||||
extern texture<float, 2, cudaReadModeElementType> tex;
|
||||
|
||||
extern "C" __global__ void tex2dKernel(float* outputData, int width, int height) {
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
outputData[y * width + x] = tex2D(tex, x, y);
|
||||
}
|
||||
@@ -21,7 +21,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc
|
||||
* TEST: %t
|
||||
* TEST: %t EXCLUDE_HIP_PLATFORM hcc
|
||||
* HIT_END
|
||||
*/
|
||||
#include <test_common.h>
|
||||
|
||||
@@ -21,7 +21,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc
|
||||
* TEST: %t
|
||||
* TEST: %t EXCLUDE_HIP_PLATFORM hcc
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc
|
||||
* TEST: %t
|
||||
* TEST: %t EXCLUDE_HIP_PLATFORM hcc
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
|
||||
Двоичные данные
Двоичный файл не отображается.
@@ -178,7 +178,9 @@ void runTests(int64_t numElements) {
|
||||
{
|
||||
test(0x01, C_d, C_h, numElements, syncNone, true /*expectMismatch*/);
|
||||
test(0x02, C_d, C_h, numElements, syncNullStream, false /*expectMismatch*/);
|
||||
#ifndef __HIP_CLANG_ONLY__
|
||||
test(0x04, C_d, C_h, numElements, syncOtherStream, true /*expectMismatch*/);
|
||||
#endif
|
||||
test(0x08, C_d, C_h, numElements, syncDevice, false /*expectMismatch*/);
|
||||
|
||||
// Sending a marker to to null stream may synchronize the otherStream
|
||||
|
||||
Ссылка в новой задаче
Block a user