diff --git a/CMakeLists.txt b/CMakeLists.txt index 46355b5a85..0a8c22ff54 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -200,7 +200,7 @@ if(HIP_PLATFORM STREQUAL "hcc") execute_process(COMMAND ${HCC_HOME}/bin/hcc-config --ldflags OUTPUT_VARIABLE HCC_LD_FLAGS) set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${HCC_LD_FLAGS} -Wl,-Bsymbolic") - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --amdgpu-target=gfx701 --amdgpu-target=gfx803 --amdgpu-target=gfx900") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --amdgpu-target=gfx701 --amdgpu-target=gfx803 --amdgpu-target=gfx900 --amdgpu-target=gfx906") if(COMPILE_HIP_ATP_MARKER) set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L/opt/rocm/profiler/CXLActivityLogger/bin/x86_64 -lCXLActivityLogger") endif() @@ -214,7 +214,10 @@ if(HIP_PLATFORM STREQUAL "hcc") foreach(TARGET hip_hcc hip_hcc_static hip_device) target_include_directories(${TARGET} SYSTEM INTERFACE $/include>;${HSA_PATH}/include) endforeach() - target_link_libraries(hip_hcc INTERFACE hcc::hccrt;hcc::hc_am) + add_library(host INTERFACE) + target_link_libraries(host INTERFACE hip_hcc) + add_library(device INTERFACE) + target_link_libraries(device INTERFACE host hip_device hcc::hccrt hcc::hc_am) # Generate .hipInfo file(WRITE "${PROJECT_BINARY_DIR}/.hipInfo" ${_buildInfo}) @@ -264,7 +267,7 @@ set(BIN_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/bin) set(CONFIG_PACKAGE_INSTALL_DIR ${LIB_INSTALL_DIR}/cmake/hip) if(HIP_PLATFORM STREQUAL "hcc") - install(TARGETS hip_hcc_static hip_hcc hip_device EXPORT hip-targets DESTINATION ${LIB_INSTALL_DIR}) + install(TARGETS hip_hcc_static hip_hcc hip_device host device EXPORT hip-targets DESTINATION ${LIB_INSTALL_DIR}) install(EXPORT hip-targets DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} NAMESPACE hip::) include(CMakePackageConfigHelpers) diff --git a/bin/hipcc b/bin/hipcc index f796b0bf95..7fafc30053 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -50,6 +50,8 @@ $verbose = $ENV{'HIPCC_VERBOSE'} // 0; # Verbose: 0x1=commands, 0x2=paths, 0x4=hipcc args $HIP_PATH=$ENV{'HIP_PATH'} // dirname (dirname $0); # use parent directory of hipcc +$HIP_CLANG_PATH=$ENV{'HIP_CLANG_PATH'}; +$DEVICE_LIB_PATH=$ENV{'DEVICE_LIB_PATH'}; #--- # Read .hipInfo @@ -62,6 +64,10 @@ $HIP_PLATFORM= `$HIP_PATH/bin/hipconfig --platform` // "hcc"; $HIP_VERSION= `$HIP_PATH/bin/hipconfig --version`; ($HIP_VERSION_MAJOR, $HIP_VERSION_MINOR, $HIP_VERSION_PATCH) = split(/\./, $HIP_VERSION); +if (defined $HIP_CLANG_PATH) { + $HIP_PLATFORM = "clang" +} + if ($verbose & 0x2) { print ("HIP_PATH=$HIP_PATH\n"); print ("HIP_PLATFORM=$HIP_PLATFORM\n"); @@ -75,9 +81,19 @@ $target_gfx801 = 0; $target_gfx802 = 0; $target_gfx803 = 0; $target_gfx900 = 0; +$target_gfx906 = 0; $default_amdgpu_target = 1; -if ($HIP_PLATFORM eq "hcc") { +if ($HIP_PLATFORM eq "clang") { + if ($verbose & 0x2) { + print ("HIP_CLANG_PATH=$HIP_CLANG_PATH\n"); + } + $ROCM_PATH=$ENV{'ROCM_PATH'} // "/opt/rocm"; + $HIPCC="$HIP_CLANG_PATH/clang++"; + $HIPCXXFLAGS .= "-std=c++11 -I$HIP_PATH/include"; + + $HIPLDFLAGS = "--hip-link --hip-device-lib-path=$DEVICE_LIB_PATH -L$HIP_PATH/lib -lhip_hcc"; +} elsif ($HIP_PLATFORM eq "hcc") { $HSA_PATH=$ENV{'HSA_PATH'} // "/opt/rocm/hsa"; $HCC_HOME=$ENV{'HCC_HOME'} // $hipConfig{'HCC_HOME'} // "/opt/rocm/hcc"; @@ -281,6 +297,16 @@ foreach $arg (@ARGV) $target_gfx900 = 1; $default_amdgpu_target = 0; } + if($arg eq '--amdgpu-target=gfx906') + { + $target_gfx906 = 1; + $default_amdgpu_target = 0; + } + + # hip-clang does not accept --amdgpu-target= options. + if (($arg =~ /--amdgpu-target=/) and $HIP_PLATFORM eq 'clang' ) { + $swallowArg = 1; + } if(($trimarg eq '-stdlib=libstdc++') and ($setStdLib eq 0)) { @@ -330,10 +356,16 @@ foreach $arg (@ARGV) if (($arg =~ /\.cpp$/) or ($arg =~ /\.c$/) or ($arg =~ /\.cc$/) ) { $hasC = 1; $needCXXFLAGS = 1; + if ($HIP_PLATFORM eq 'clang') { + $toolArgs .= " -x hip" + } } if (($arg =~ /\.cu$/) or ($arg =~ /\.cuh$/)) { $hasCU = 1; $needCXXFLAGS = 1; + if ($HIP_PLATFORM eq 'clang') { + $toolArgs .= " -x hip" + } } push (@inputs, $arg); @@ -342,7 +374,7 @@ foreach $arg (@ARGV) $toolArgs .= " $arg" unless $swallowArg; } -if($HIP_PLATFORM eq "hcc"){ +if($HIP_PLATFORM eq "hcc" or $HIP_PLATFORM eq "clang"){ # No AMDGPU target specified at commandline. So look for HCC_AMDGPU_TARGET if($default_amdgpu_target eq 1 and defined $ENV{HCC_AMDGPU_TARGET}) { @@ -373,6 +405,11 @@ if($HIP_PLATFORM eq "hcc"){ $target_gfx900 = 1; $default_amdgpu_target = 0; } + if($target eq 'gfx906') + { + $target_gfx906 = 1; + $default_amdgpu_target = 0; + } } } # Else try using rocm_agent_enumerator @@ -404,6 +441,10 @@ if($HIP_PLATFORM eq "hcc"){ $target_gfx900 = 1; $default_amdgpu_target = 0; } + if($val eq "gfx906") { + $target_gfx906 = 1; + $default_amdgpu_target = 0; + } } } # rocm_agent_enumerator failed! Throw an error and die if linking is required @@ -414,29 +455,59 @@ if($HIP_PLATFORM eq "hcc"){ $ENV{HCC_EXTRA_LIBRARIES}="$HIP_PATH/lib/hip_hc.ll\n"; + if($HIP_PLATFORM eq "hcc") { + $GPU_ARCH_OPT = " --amdgpu-target="; + } else { + $GPU_ARCH_OPT = " --cuda-gpu-arch="; + } # Handle ROCm target platform if ($target_gfx701 eq 1) { - $HIPLDFLAGS .= " --amdgpu-target=gfx701"; + $GPU_ARCH_ARG = $GPU_ARCH_OPT . "gfx701"; + $HIPLDFLAGS .= $GPU_ARCH_ARG; + if ($HIP_PLATFORM eq 'clang') { + $HIPCXXFLAGS .= $GPU_ARCH_ARG;; + } $HIPCXXFLAGS .= " -D__HIP_ARCH_GFX701__=1 "; } if ($target_gfx801 eq 1) { - $HIPLDFLAGS .= " --amdgpu-target=gfx801"; + $GPU_ARCH_ARG = $GPU_ARCH_OPT . "gfx801"; + $HIPLDFLAGS .= $GPU_ARCH_ARG; + if ($HIP_PLATFORM eq 'clang') { + $HIPCXXFLAGS .= $GPU_ARCH_ARG;; + } $HIPCXXFLAGS .= " -D__HIP_ARCH_GFX801__=1 "; } if ($target_gfx802 eq 1) { - $HIPLDFLAGS .= " --amdgpu-target=gfx802"; + $GPU_ARCH_ARG = $GPU_ARCH_OPT . "gfx802"; + $HIPLDFLAGS .= $GPU_ARCH_ARG; + if ($HIP_PLATFORM eq 'clang') { + $HIPCXXFLAGS .= $GPU_ARCH_ARG;; + } $HIPCXXFLAGS .= " -D__HIP_ARCH_GFX802__=1 "; } if ($target_gfx803 eq 1) { - $HIPLDFLAGS .= " --amdgpu-target=gfx803"; + $GPU_ARCH_ARG = $GPU_ARCH_OPT . "gfx803"; + $HIPLDFLAGS .= $GPU_ARCH_ARG; + if ($HIP_PLATFORM eq 'clang') { + $HIPCXXFLAGS .= $GPU_ARCH_ARG;; + } $HIPCXXFLAGS .= " -D__HIP_ARCH_GFX803__=1 "; $ENV{HCC_EXTRA_LIBRARIES_GFX803}="$HIP_PATH/lib/hip_hc_gfx803.ll\n"; } if ($target_gfx900 eq 1) { - $HIPLDFLAGS .= " --amdgpu-target=gfx900"; + $GPU_ARCH_ARG = $GPU_ARCH_OPT . "gfx900"; + $HIPLDFLAGS .= $GPU_ARCH_ARG; + if ($HIP_PLATFORM eq 'clang') { + $HIPCXXFLAGS .= $GPU_ARCH_ARG;; + } $HIPCXXFLAGS .= " -D__HIP_ARCH_GFX900__=1 "; $ENV{HCC_EXTRA_LIBRARIES_GFX900}="$HIP_PATH/lib/hip_hc_gfx803.ll\n"; } + if ($target_gfx906 eq 1) { + $HIPLDFLAGS .= " --amdgpu-target=gfx906"; + $HIPCXXFLAGS .= " -D__HIP_ARCH_GFX906__=1 "; + $ENV{HCC_EXTRA_LIBRARIES_GFX906}="$HIP_PATH/lib/hip_hc_gfx803.ll\n"; + } } if ($hasC and $HIP_PLATFORM eq 'nvcc') { @@ -445,6 +516,7 @@ if ($hasC and $HIP_PLATFORM eq 'nvcc') { if ($hasCU and $HIP_PLATFORM eq 'hcc') { $HIPCXXFLAGS .= " -x c++"; } + if ($buildDeps and $HIP_PLATFORM eq 'nvcc') { $HIPCXXFLAGS .= " -M -D__CUDACC__"; } diff --git a/hip-config.cmake.in b/hip-config.cmake.in index 7e4468b94a..efcdf708bb 100644 --- a/hip-config.cmake.in +++ b/hip-config.cmake.in @@ -51,7 +51,7 @@ set_and_check(hip_HIPCONFIG_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipconfig") find_dependency(hcc) include( "${CMAKE_CURRENT_LIST_DIR}/hip-targets.cmake" ) -set( hip_LIBRARIES hip::hip_hcc) +set( hip_LIBRARIES hip::host hip::device) set( hip_LIBRARY ${hip_LIBRARIES}) set(HIP_INCLUDE_DIR ${hip_INCLUDE_DIR}) diff --git a/hipify-clang/src/CUDA2HipMap.cpp b/hipify-clang/src/CUDA2HipMap.cpp index 6aa9e733f2..7d10b35e48 100644 --- a/hipify-clang/src/CUDA2HipMap.cpp +++ b/hipify-clang/src/CUDA2HipMap.cpp @@ -2920,6 +2920,18 @@ const std::map CUDA_IDENTIFIER_MAP{ // unchanged function names: skipahead, skipahead_sequence, skipahead_subsequence ///////////////////////////// cuDNN ///////////////////////////// + // defines + {"CUDNN_VERSION", {"HIPDNN_VERSION", CONV_NUMERIC_LITERAL, API_DNN}}, // 7000 + {"CUDNN_DIM_MAX", {"HIPDNN_DIM_MAX", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 8 + {"CUDNN_LRN_MIN_N", {"HIPDNN_LRN_MIN_N", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1 + {"CUDNN_LRN_MAX_N", {"HIPDNN_LRN_MAX_N", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 16 + {"CUDNN_LRN_MIN_K", {"HIPDNN_LRN_MIN_K", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1e-5 + {"CUDNN_LRN_MIN_BETA", {"HIPDNN_LRN_MIN_BETA", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0.01 + {"CUDNN_BN_MIN_EPSILON", {"HIPDNN_BN_MIN_EPSILON", CONV_NUMERIC_LITERAL, API_DNN}}, // 1e-5 + {"CUDNN_SEV_ERROR_EN", {"HIPDNN_SEV_ERROR_EN", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, + {"CUDNN_SEV_WARNING_EN", {"HIPDNN_SEV_WARNING_EN", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, + {"CUDNN_SEV_INFO_EN", {"HIPDNN_SEV_INFO_EN", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnContext", {"hipdnnContext", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, {"cudnnHandle_t", {"hipdnnHandle_t", CONV_TYPE, API_DNN}}, {"cudnnStatus_t", {"hipdnnStatus_t", CONV_TYPE, API_DNN}}, @@ -2956,6 +2968,15 @@ const std::map CUDA_IDENTIFIER_MAP{ {"CUDNN_DATA_INT8x4", {"HIPDNN_DATA_INT8x4", CONV_NUMERIC_LITERAL, API_DNN}}, // 5 {"CUDNN_DATA_UINT8", {"HIPDNN_DATA_UINT8", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 6 {"CUDNN_DATA_UINT8x4", {"HIPDNN_DATA_UINT8x4", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 7 + {"cudnnErrQueryMode_t", {"hipdnnErrQueryMode_t", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"CUDNN_ERRQUERY_RAWCODE", {"HIPDNN_ERRQUERY_RAWCODE", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0 + {"CUDNN_ERRQUERY_NONBLOCKING", {"HIPDNN_ERRQUERY_NONBLOCKING", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1 + {"CUDNN_ERRQUERY_BLOCKING", {"HIPDNN_ERRQUERY_BLOCKING", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 2 + {"cudnnSeverity_t", {"hipdnnSeverity_t", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"CUDNN_SEV_FATAL", {"HIPDNN_SEV_FATAL", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0 + {"CUDNN_SEV_ERROR", {"HIPDNN_SEV_ERROR", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1 + {"CUDNN_SEV_WARNING", {"HIPDNN_SEV_WARNING", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 2 + {"CUDNN_SEV_INFO", {"HIPDNN_SEV_INFO", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 3 {"cudnnConvolutionFwdAlgo_t", {"hipdnnConvolutionFwdAlgo_t", CONV_TYPE, API_DNN}}, {"CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM", {"HIPDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 @@ -2967,34 +2988,328 @@ const std::map CUDA_IDENTIFIER_MAP{ {"CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD", {"HIPDNN_CONVOLUTION_FWD_ALGO_WINOGRAD", CONV_NUMERIC_LITERAL, API_DNN}}, // 6 {"CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED", {"HIPDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED", CONV_NUMERIC_LITERAL, API_DNN}}, // 7 {"CUDNN_CONVOLUTION_FWD_ALGO_COUNT", {"HIPDNN_CONVOLUTION_FWD_ALGO_COUNT", CONV_NUMERIC_LITERAL, API_DNN}}, // 8 - {"cudnnConvolutionFwdPreference_t", {"hipdnnConvolutionFwdPreference_t", CONV_TYPE, API_DNN}}, {"CUDNN_CONVOLUTION_FWD_NO_WORKSPACE", {"HIPDNN_CONVOLUTION_FWD_NO_WORKSPACE", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 {"CUDNN_CONVOLUTION_FWD_PREFER_FASTEST", {"HIPDNN_CONVOLUTION_FWD_PREFER_FASTEST", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 {"CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT", {"HIPDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT", CONV_NUMERIC_LITERAL, API_DNN}}, // 2 + {"cudnnDeterminism_t", {"hipdnnDeterminism_t", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"CUDNN_NON_DETERMINISTIC", {"HIPDNN_NON_DETERMINISTIC", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0 + {"CUDNN_DETERMINISTIC", {"HIPDNN_DETERMINISTIC", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1 + {"cudnnDivNormMode_t", {"hipdnnDivNormMode_t", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"CUDNN_DIVNORM_PRECOMPUTED_MEANS", {"HIPDNN_DIVNORM_PRECOMPUTED_MEANS", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0 + {"cudnnCTCLossAlgo_t", {"hipdnnCTCLossAlgo_t", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"CUDNN_CTC_LOSS_ALGO_DETERMINISTIC", {"HIPDNN_CTC_LOSS_ALGO_DETERMINISTIC", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0 + {"CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC", {"HIPDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 1 {"cudnnFilterDescriptor_t", {"hipdnnFilterDescriptor_t", CONV_TYPE, API_DNN}}, + {"cudnnDropoutDescriptor_t", {"hipdnnDropoutDescriptor_t", CONV_TYPE, API_DNN}}, + {"cudnnConvolutionFwdAlgoPerf_t", {"hipdnnConvolutionFwdAlgoPerf_t", CONV_TYPE, API_DNN}}, + {"cudnnConvolutionBwdFilterAlgoPerf_t", {"hipdnnConvolutionBwdFilterAlgoPerf_t", CONV_TYPE, API_DNN}}, + {"cudnnRNNDescriptor_t", {"hipdnnRNNDescriptor_t", CONV_TYPE, API_DNN}}, + {"cudnnPersistentRNNPlan", {"hipdnnPersistentRNNPlan", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnPersistentRNNPlan_t", {"hipdnnPersistentRNNPlan_t", CONV_TYPE, API_DNN}}, + {"cudnnTensorStruct", {"hipdnnTensorStruct", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnConvolutionStruct", {"hipdnnConvolutionStruct", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnPoolingStruct", {"hipdnnPoolingStruct", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnPoolingDescriptor_t", {"hipdnnPoolingDescriptor_t", CONV_TYPE, API_DNN}}, + {"cudnnFilterStruct", {"hipdnnFilterStruct", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnLRNDescriptor_t", {"hipdnnLRNDescriptor_t", CONV_TYPE, API_DNN}}, + {"cudnnLRNStruct", {"hipdnnLRNStruct", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnActivationDescriptor_t", {"hipdnnActivationDescriptor_t", CONV_TYPE, API_DNN}}, + {"cudnnActivationStruct", {"hipdnnActivationStruct", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSpatialTransformerDescriptor_t", {"hipdnnSpatialTransformerDescriptor_t", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSpatialTransformerStruct", {"hipdnnSpatialTransformerStruct", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnOpTensorDescriptor_t", {"hipdnnOpTensorDescriptor_t", CONV_TYPE, API_DNN}}, + {"cudnnOpTensorStruct", {"hipdnnOpTensorStruct", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnReduceTensorDescriptor_t", {"hipdnnReduceTensorDescriptor_t", CONV_TYPE, API_DNN}}, + {"cudnnReduceTensorStruct", {"hipdnnReduceTensorStruct", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnCTCLossDescriptor_t", {"hipdnnCTCLossDescriptor_t", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnCTCLossStruct", {"hipdnnCTCLossStruct", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnConvolutionBwdDataAlgoPerf_t", {"hipdnnConvolutionBwdDataAlgoPerf_t", CONV_TYPE, API_DNN}}, + {"cudnnAlgorithmDescriptor_t", {"hipdnnAlgorithmDescriptor_t", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnAlgorithmStruct", {"hipdnnAlgorithmStruct", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnAlgorithmPerformance_t", {"hipdnnAlgorithmPerformance_t", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnAlgorithmPerformanceStruct", {"hipdnnAlgorithmPerformanceStruct", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnRNNStruct", {"hipdnnRNNStruct", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnRNNDescriptor_t", {"hipdnnRNNDescriptor_t", CONV_TYPE, API_DNN}}, + {"cudnnAlgorithm_t", {"hipdnnAlgorithm_t", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnCallback_t", {"hipdnnCallback_t", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnDebug_t", {"hipdnnDebug_t", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + + {"cudnnLRNMode_t", {"hipdnnLRNMode_t", CONV_TYPE, API_DNN}}, + {"CUDNN_LRN_CROSS_CHANNEL_DIM1", {"HIPDNN_LRN_CROSS_CHANNEL", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 vs 1 + {"cudnnRNNInputMode_t", {"hipdnnRNNInputMode_t", CONV_TYPE, API_DNN}}, + {"CUDNN_LINEAR_INPUT", {"HIPDNN_LINEAR_INPUT", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_SKIP_INPUT", {"HIPDNN_SKIP_INPUT", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"cudnnDirectionMode_t", {"hipdnnDirectionMode_t", CONV_TYPE, API_DNN}}, + {"CUDNN_UNIDIRECTIONAL", {"HIPDNN_UNIDIRECTIONAL", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_BIDIRECTIONAL", {"HIPDNN_BIDIRECTIONAL", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"cudnnMathType_t", {"hipdnnMathType_t", CONV_TYPE, API_DNN}}, + {"CUDNN_DEFAULT_MATH", {"HIPDNN_DEFAULT_MATH", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_TENSOR_OP_MATH", {"HIPDNN_TENSOR_OP_MATH", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"cudnnNanPropagation_t", {"hipdnnNanPropagation_t", CONV_TYPE, API_DNN}}, + {"CUDNN_NOT_PROPAGATE_NAN", {"HIPDNN_NOT_PROPAGATE_NAN", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_PROPAGATE_NAN", {"HIPDNN_PROPAGATE_NAN", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"cudnnConvolutionBwdDataAlgo_t", {"hipdnnConvolutionBwdDataAlgo_t", CONV_TYPE, API_DNN}}, + {"CUDNN_CONVOLUTION_BWD_DATA_ALGO_0", {"HIPDNN_CONVOLUTION_BWD_DATA_ALGO_0", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_CONVOLUTION_BWD_DATA_ALGO_1", {"HIPDNN_CONVOLUTION_BWD_DATA_ALGO_1", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT", {"HIPDNN_CONVOLUTION_BWD_DATA_ALGO_FFT", CONV_NUMERIC_LITERAL, API_DNN}}, // 2 + {"CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING", {"HIPDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING", CONV_NUMERIC_LITERAL, API_DNN}}, // 3 + {"CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD", {"HIPDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD", CONV_NUMERIC_LITERAL, API_DNN}}, // 4 + {"CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED", {"HIPDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED", CONV_NUMERIC_LITERAL, API_DNN}}, // 5 + {"CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT", {"HIPDNN_CONVOLUTION_BWD_DATA_ALGO_TRANSPOSE_GEMM", CONV_NUMERIC_LITERAL, API_DNN}}, // 6 + {"cudnnConvolutionBwdFilterAlgo_t", {"hipdnnConvolutionBwdFilterAlgo_t", CONV_TYPE, API_DNN}}, + {"CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0", {"HIPDNN_CONVOLUTION_BWD_FILTER_ALGO_0", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1", {"HIPDNN_CONVOLUTION_BWD_FILTER_ALGO_1", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT", {"HIPDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT", CONV_NUMERIC_LITERAL, API_DNN}}, // 2 + {"CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3", {"HIPDNN_CONVOLUTION_BWD_FILTER_ALGO_3", CONV_NUMERIC_LITERAL, API_DNN}}, // 3 + {"CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD", {"HIPDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD", CONV_NUMERIC_LITERAL, API_DNN}}, // 4 + {"CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED", {"HIPDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED", CONV_NUMERIC_LITERAL, API_DNN}}, // 5 + {"CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING", {"HIPDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING", CONV_NUMERIC_LITERAL, API_DNN}}, // 6 + {"CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT", {"HIPDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT", CONV_NUMERIC_LITERAL, API_DNN}}, // 7 + {"cudnnConvolutionBwdFilterPreference_t", {"hipdnnConvolutionBwdFilterPreference_t", CONV_TYPE, API_DNN}}, + {"CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE", {"HIPDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST", {"HIPDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT",{"HIPDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT",CONV_NUMERIC_LITERAL, API_DNN}}, // 2 + {"cudnnRNNAlgo_t", {"hipdnnRNNAlgo_t", CONV_TYPE, API_DNN}}, + {"CUDNN_RNN_ALGO_STANDARD", {"HIPDNN_RNN_ALGO_STANDARD", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_RNN_ALGO_PERSIST_STATIC", {"HIPDNN_RNN_ALGO_PERSIST_STATIC", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"CUDNN_RNN_ALGO_PERSIST_DYNAMIC", {"HIPDNN_RNN_ALGO_PERSIST_DYNAMIC", CONV_NUMERIC_LITERAL, API_DNN}}, // 2 + {"CUDNN_RNN_ALGO_COUNT", {"HIPDNN_RNN_ALGO_COUNT", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 3 + {"cudnnRNNMode_t", {"hipdnnRNNMode_t", CONV_TYPE, API_DNN}}, + {"CUDNN_RNN_RELU", {"HIPDNN_RNN_RELU", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_RNN_TANH", {"HIPDNN_RNN_TANH", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"CUDNN_LSTM", {"HIPDNN_LSTM", CONV_NUMERIC_LITERAL, API_DNN}}, // 2 + {"CUDNN_GRU", {"HIPDNN_GRU", CONV_NUMERIC_LITERAL, API_DNN}}, // 3 + {"cudnnOpTensorOp_t", {"hipdnnOpTensorOp_t", CONV_TYPE, API_DNN}}, + {"CUDNN_OP_TENSOR_ADD", {"HIPDNN_OP_TENSOR_ADD", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_OP_TENSOR_MUL", {"HIPDNN_OP_TENSOR_MUL", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"CUDNN_OP_TENSOR_MIN", {"HIPDNN_OP_TENSOR_MIN", CONV_NUMERIC_LITERAL, API_DNN}}, // 2 + {"CUDNN_OP_TENSOR_MAX", {"HIPDNN_OP_TENSOR_MAX", CONV_NUMERIC_LITERAL, API_DNN}}, // 3 + {"CUDNN_OP_TENSOR_SQRT", {"HIPDNN_OP_TENSOR_SQRT", CONV_NUMERIC_LITERAL, API_DNN}}, // 4 + {"CUDNN_OP_TENSOR_NOT", {"HIPDNN_OP_TENSOR_NOT", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 5 + {"cudnnReduceTensorOp_t", {"hipdnnReduceTensorOp_t", CONV_TYPE, API_DNN}}, + {"CUDNN_REDUCE_TENSOR_ADD", {"HIPDNN_REDUCE_TENSOR_ADD", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_REDUCE_TENSOR_MUL", {"HIPDNN_REDUCE_TENSOR_MUL", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"CUDNN_REDUCE_TENSOR_MIN", {"HIPDNN_REDUCE_TENSOR_MIN", CONV_NUMERIC_LITERAL, API_DNN}}, // 2 + {"CUDNN_REDUCE_TENSOR_MAX", {"HIPDNN_REDUCE_TENSOR_MAX", CONV_NUMERIC_LITERAL, API_DNN}}, // 3 + {"CUDNN_REDUCE_TENSOR_AMAX", {"HIPDNN_REDUCE_TENSOR_AMAX", CONV_NUMERIC_LITERAL, API_DNN}}, // 4 + {"CUDNN_REDUCE_TENSOR_AVG", {"HIPDNN_REDUCE_TENSOR_AVG", CONV_NUMERIC_LITERAL, API_DNN}}, // 5 + {"CUDNN_REDUCE_TENSOR_NORM1", {"HIPDNN_REDUCE_TENSOR_NORM1", CONV_NUMERIC_LITERAL, API_DNN}}, // 6 + {"CUDNN_REDUCE_TENSOR_NORM2", {"HIPDNN_REDUCE_TENSOR_NORM2", CONV_NUMERIC_LITERAL, API_DNN}}, // 7 + {"CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS", {"HIPDNN_REDUCE_TENSOR_MUL_NO_ZEROS", CONV_NUMERIC_LITERAL, API_DNN}}, // 8 + {"cudnnReduceTensorIndices_t", {"hipdnnReduceTensorIndices_t", CONV_TYPE, API_DNN}}, + {"CUDNN_REDUCE_TENSOR_NO_INDICES", {"HIPDNN_REDUCE_TENSOR_NO_INDICES", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_REDUCE_TENSOR_FLATTENED_INDICES", {"HIPDNN_REDUCE_TENSOR_FLATTENED_INDICES", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"cudnnConvolutionBwdDataPreference_t", {"hipdnnConvolutionBwdDataPreference_t", CONV_TYPE, API_DNN}}, + {"CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE", {"HIPDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST", {"HIPDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT", {"HIPDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT", CONV_NUMERIC_LITERAL, API_DNN}}, // 2 + {"cudnnIndicesType_t", {"hipdnnIndicesType_t", CONV_TYPE, API_DNN}}, + {"CUDNN_32BIT_INDICES", {"HIPDNN_32BIT_INDICES", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_64BIT_INDICES", {"HIPDNN_64BIT_INDICES", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"CUDNN_16BIT_INDICES", {"HIPDNN_16BIT_INDICES", CONV_NUMERIC_LITERAL, API_DNN}}, // 2 + {"CUDNN_8BIT_INDICES", {"HIPDNN_8BIT_INDICES", CONV_NUMERIC_LITERAL, API_DNN}}, // 3 + {"cudnnSoftmaxAlgorithm_t", {"hipdnnSoftmaxAlgorithm_t", CONV_TYPE, API_DNN}}, + {"CUDNN_SOFTMAX_FAST", {"HIPDNN_SOFTMAX_FAST", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_SOFTMAX_ACCURATE", {"HIPDNN_SOFTMAX_ACCURATE", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"CUDNN_SOFTMAX_LOG", {"HIPDNN_SOFTMAX_LOG", CONV_NUMERIC_LITERAL, API_DNN}}, // 2 + {"cudnnSoftmaxMode_t", {"hipdnnSoftmaxMode_t", CONV_TYPE, API_DNN}}, + {"CUDNN_SOFTMAX_MODE_INSTANCE", {"HIPDNN_SOFTMAX_MODE_INSTANCE", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_SOFTMAX_MODE_CHANNEL", {"HIPDNN_SOFTMAX_MODE_CHANNEL", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"cudnnPoolingMode_t", {"hipdnnPoolingMode_t", CONV_TYPE, API_DNN}}, + {"CUDNN_POOLING_MAX", {"HIPDNN_POOLING_MAX", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING", {"HIPDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING", {"HIPDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING", CONV_NUMERIC_LITERAL, API_DNN}}, // 2 + {"CUDNN_POOLING_MAX_DETERMINISTIC", {"HIPDNN_POOLING_MAX_DETERMINISTIC", CONV_NUMERIC_LITERAL, API_DNN}}, // 3 + {"cudnnActivationMode_t", {"hipdnnActivationMode_t", CONV_TYPE, API_DNN}}, + {"CUDNN_ACTIVATION_SIGMOID", {"HIPDNN_ACTIVATION_SIGMOID", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_ACTIVATION_RELU", {"HIPDNN_ACTIVATION_RELU", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"CUDNN_ACTIVATION_TANH", {"HIPDNN_ACTIVATION_TANH", CONV_NUMERIC_LITERAL, API_DNN}}, // 2 + {"CUDNN_ACTIVATION_CLIPPED_RELU", {"HIPDNN_ACTIVATION_CLIPPED_RELU", CONV_NUMERIC_LITERAL, API_DNN}}, // 3 + {"CUDNN_ACTIVATION_ELU", {"HIPDNN_ACTIVATION_ELU", CONV_NUMERIC_LITERAL, API_DNN}}, // 4 + {"CUDNN_ACTIVATION_IDENTITY", {"HIPDNN_ACTIVATION_PATHTRU", CONV_NUMERIC_LITERAL, API_DNN}}, // 5 + {"cudnnBatchNormMode_t", {"hipdnnBatchNormMode_t", CONV_TYPE, API_DNN}}, + {"CUDNN_BATCHNORM_PER_ACTIVATION", {"HIPDNN_BATCHNORM_PER_ACTIVATION", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_BATCHNORM_SPATIAL", {"HIPDNN_BATCHNORM_SPATIAL", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"CUDNN_BATCHNORM_SPATIAL_PERSISTENT", {"HIPDNN_BATCHNORM_SPATIAL_PERSISTENT", CONV_NUMERIC_LITERAL, API_DNN}}, // 2 + {"cudnnSamplerType_t", {"hipdnnSamplerType_t", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"CUDNN_SAMPLER_BILINEAR", {"HIPDNN_SAMPLER_BILINEAR", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 0 {"cudnnGetVersion", {"hipdnnGetVersion", CONV_VERSION, API_DNN}}, {"cudnnGetCudartVersion", {"hipdnnGetCudartVersion", CONV_VERSION, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnQueryRuntimeError", {"hipdnnQueryRuntimeError", CONV_VERSION, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetProperty", {"hipdnnGetProperty", CONV_VERSION, API_DNN, HIP_UNSUPPORTED}}, {"cudnnGetErrorString", {"hipdnnGetErrorString", CONV_ERROR, API_DNN}}, + {"cudnnCreate", {"hipdnnCreate", CONV_MATH_FUNC, API_DNN}}, {"cudnnCreateTensorDescriptor", {"hipdnnCreateTensorDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnCreateDropoutDescriptor", {"hipdnnCreateDropoutDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnCreateReduceTensorDescriptor", {"hipdnnCreateReduceTensorDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetReduceTensorDescriptor", {"hipdnnSetReduceTensorDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetReduceTensorDescriptor", {"hipdnnGetReduceTensorDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetReductionIndicesSize", {"hipdnnGetReductionIndicesSize", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetReductionWorkspaceSize", {"hipdnnGetReductionWorkspaceSize", CONV_MATH_FUNC, API_DNN}}, + {"cudnnCreateOpTensorDescriptor", {"hipdnnCreateOpTensorDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSetOpTensorDescriptor", {"hipdnnSetOpTensorDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetOpTensorDescriptor", {"hipdnnGetOpTensorDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnCreateRNNDescriptor", {"hipdnnCreateRNNDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetStream", {"hipdnnSetStream", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetStream", {"hipdnnGetStream", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetRNNDescriptor_v5", {"hipdnnSetRNNDescriptor_v5", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetRNNDescriptor_v6", {"hipdnnSetRNNDescriptor_v6", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetRNNDescriptor", {"hipdnnSetRNNDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnDropoutGetStatesSize", {"hipdnnDropoutGetStatesSize", CONV_MATH_FUNC, API_DNN}}, + {"cudnnDropoutGetReserveSpaceSize", {"hipdnnDropoutGetReserveSpaceSize", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnTransformTensor", {"hipdnnTransformTensor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, {"cudnnSetTensor4dDescriptor", {"hipdnnSetTensor4dDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetTensor4dDescriptor", {"hipdnnGetTensor4dDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnAddTensor", {"hipdnnAddTensor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnOpTensor", {"hipdnnOpTensor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetTensorSizeInBytes", {"hipdnnGetTensorSizeInBytes", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSetTensor4dDescriptorEx", {"hipdnnSetTensor4dDescriptorEx", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSetTensorNdDescriptor", {"hipdnnSetTensorNdDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetTensorNdDescriptor", {"hipdnnGetTensorNdDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetTensorNdDescriptorEx", {"hipdnnSetTensorNdDescriptorEx", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnFindConvolutionForwardAlgorithm", {"hipdnnFindConvolutionForwardAlgorithm", CONV_MATH_FUNC, API_DNN}}, + {"cudnnFindConvolutionForwardAlgorithmEx", {"hipdnnFindConvolutionForwardAlgorithmEx", CONV_MATH_FUNC, API_DNN}}, + {"cudnnConvolutionBackwardFilter", {"hipdnnConvolutionBackwardFilter", CONV_MATH_FUNC, API_DNN}}, + {"cudnnConvolutionBackwardData", {"hipdnnConvolutionBackwardData", CONV_MATH_FUNC, API_DNN}}, + {"cudnnFindConvolutionBackwardFilterAlgorithm", {"hipdnnFindConvolutionBackwardFilterAlgorithm", CONV_MATH_FUNC, API_DNN}}, + {"cudnnFindConvolutionBackwardFilterAlgorithmEx", {"hipdnnFindConvolutionBackwardFilterAlgorithmEx", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetConvolutionBackwardFilterAlgorithm", {"hipdnnGetConvolutionBackwardFilterAlgorithm", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetConvolutionBackwardFilterAlgorithm_v7", {"hipdnnGetConvolutionBackwardFilterAlgorithm_v7", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetConvolutionBackwardFilterWorkspaceSize",{"hipdnnGetConvolutionBackwardFilterWorkspaceSize",CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetConvolutionBackwardDataWorkspaceSize", {"hipdnnGetConvolutionBackwardDataWorkspaceSize", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetConvolutionBackwardDataAlgorithm", {"hipdnnGetConvolutionBackwardDataAlgorithm", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetConvolutionBackwardDataAlgorithm_v7", {"hipdnnGetConvolutionBackwardDataAlgorithm_v7", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetConvolutionBackwardDataAlgorithmMaxCount", {"hipdnnGetConvolutionBackwardDataAlgorithmMaxCount", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetConvolutionForwardAlgorithmMaxCount", {"hipdnnGetConvolutionForwardAlgorithmMaxCount", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetRNNLinLayerMatrixParams", {"hipdnnGetRNNLinLayerMatrixParams", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetRNNLinLayerBiasParams", {"hipdnnGetRNNLinLayerBiasParams", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetRNNProjectionLayers", {"hipdnnSetRNNProjectionLayers", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetRNNProjectionLayers", {"hipdnnGetRNNProjectionLayers", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSetRNNAlgorithmDescriptor", {"hipdnnSetRNNAlgorithmDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetRNNDescriptor", {"hipdnnGetRNNDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetFilterNdDescriptor", {"hipdnnGetFilterNdDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnFindConvolutionBackwardDataAlgorithm", {"hipdnnFindConvolutionBackwardDataAlgorithm", CONV_MATH_FUNC, API_DNN}}, + {"cudnnFindConvolutionBackwardDataAlgorithmEx", {"hipdnnFindConvolutionBackwardDataAlgorithmEx", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetDropoutDescriptor", {"hipdnnSetDropoutDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnRestoreDropoutDescriptor", {"hipdnnRestoreDropoutDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetDropoutDescriptor", {"hipdnnGetDropoutDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetDropoutDescriptor", {"hipdnnGetDropoutDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, {"cudnnSetConvolution2dDescriptor", {"hipdnnSetConvolution2dDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetConvolution2dDescriptor", {"hipdnnGetConvolution2dDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetConvolutionMathType", {"hipdnnSetConvolutionMathType", CONV_MATH_FUNC, API_DNN}}, + {"cudnnDropoutForward", {"hipdnnDropoutForward", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnDropoutBackward", {"hipdnnDropoutBackward", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetConvolutionMathType", {"hipdnnGetConvolutionMathType", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSetConvolutionGroupCount", {"hipdnnSetConvolutionGroupCount", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetConvolutionGroupCount", {"hipdnnGetConvolutionGroupCount", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, {"cudnnGetConvolution2dForwardOutputDim", {"hipdnnGetConvolution2dForwardOutputDim", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetConvolutionNdDescriptor", {"hipdnnSetConvolutionNdDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetConvolutionNdDescriptor", {"hipdnnGetConvolutionNdDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetConvolutionNdForwardOutputDim", {"hipdnnGetConvolutionNdForwardOutputDim", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, {"cudnnCreateFilterDescriptor", {"hipdnnCreateFilterDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnCreatePersistentRNNPlan", {"hipdnnCreatePersistentRNNPlan", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetPersistentRNNPlan", {"hipdnnSetPersistentRNNPlan", CONV_MATH_FUNC, API_DNN}}, + {"cudnnRNNForwardInference", {"hipdnnRNNForwardInference", CONV_MATH_FUNC, API_DNN}}, + {"cudnnRNNBackwardWeights", {"hipdnnRNNBackwardWeights", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetRNNParamsSize", {"hipdnnGetRNNParamsSize", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetRNNWorkspaceSize", {"hipdnnGetRNNWorkspaceSize", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetRNNTrainingReserveSize", {"hipdnnGetRNNTrainingReserveSize", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetFilterNdDescriptor", {"hipdnnSetFilterNdDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnRNNForwardTraining", {"hipdnnRNNForwardTraining", CONV_MATH_FUNC, API_DNN}}, + {"cudnnRNNBackwardData", {"hipdnnRNNBackwardData", CONV_MATH_FUNC, API_DNN}}, {"cudnnSetFilter4dDescriptor", {"hipdnnSetFilter4dDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetRNNMatrixMathType", {"hipdnnSetRNNMatrixMathType", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetRNNMatrixMathType", {"hipdnnGetRNNMatrixMathType", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetRNNForwardInferenceAlgorithmMaxCount", {"hipdnnGetRNNForwardInferenceAlgorithmMaxCount", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnFindRNNForwardInferenceAlgorithmEx", {"hipdnnFindRNNForwardInferenceAlgorithmEx", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetRNNForwardTrainingAlgorithmMaxCount", {"hipdnnGetRNNForwardTrainingAlgorithmMaxCount", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnFindRNNForwardTrainingAlgorithmEx", {"hipdnnFindRNNForwardTrainingAlgorithmEx", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetRNNBackwardDataAlgorithmMaxCount", {"hipdnnGetRNNBackwardDataAlgorithmMaxCount", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnFindRNNBackwardDataAlgorithmEx", {"hipdnnFindRNNBackwardDataAlgorithmEx", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetRNNBackwardWeightsAlgorithmMaxCount", {"hipdnnGetRNNBackwardWeightsAlgorithmMaxCount", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnFindRNNBackwardWeightsAlgorithmEx", {"hipdnnFindRNNBackwardWeightsAlgorithmEx", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, {"cudnnCreateConvolutionDescriptor", {"hipdnnCreateConvolutionDescriptor", CONV_MATH_FUNC, API_DNN}}, {"cudnnGetConvolutionForwardAlgorithm", {"hipdnnGetConvolutionForwardAlgorithm", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetConvolutionForwardAlgorithm_v7", {"hipdnnGetConvolutionForwardAlgorithm_v7", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, {"cudnnConvolutionForward", {"hipdnnConvolutionForward", CONV_MATH_FUNC, API_DNN}}, {"cudnnGetConvolutionForwardWorkspaceSize", {"hipdnnGetConvolutionForwardWorkspaceSize", CONV_MATH_FUNC, API_DNN}}, + {"cudnnConvolutionBiasActivationForward", {"hipdnnConvolutionBiasActivationForward", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetConvolutionBackwardFilterAlgorithmMaxCount", {"hipdnnGetConvolutionBackwardFilterAlgorithmMaxCount", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnConvolutionBackwardBias", {"hipdnnConvolutionBackwardBias", CONV_MATH_FUNC, API_DNN}}, + {"cudnnReduceTensor", {"hipdnnReduceTensor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetTensor", {"hipdnnSetTensor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnScaleTensor", {"hipdnnScaleTensor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnIm2Col", {"hipdnnIm2Col", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, {"cudnnDestroyTensorDescriptor", {"hipdnnDestroyTensorDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnDestroyOpTensorDescriptor", {"hipdnnDestroyOpTensorDescriptor", CONV_MATH_FUNC, API_DNN}}, {"cudnnDestroyConvolutionDescriptor", {"hipdnnDestroyConvolutionDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnDestroyDropoutDescriptor", {"hipdnnDestroyDropoutDescriptor", CONV_MATH_FUNC, API_DNN}}, {"cudnnDestroyFilterDescriptor", {"hipdnnDestroyFilterDescriptor", CONV_MATH_FUNC, API_DNN}}, - {"cudnnDestroyFilterDescriptor", {"hipdnnDestroyFilterDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnDestroyRNNDescriptor", {"hipdnnDestroyRNNDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnDestroyReduceTensorDescriptor", {"hipdnnDestroyReduceTensorDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnDestroyPersistentRNNPlan", {"hipdnnDestroyPersistentRNNPlan", CONV_MATH_FUNC, API_DNN}}, {"cudnnDestroy", {"hipdnnDestroy", CONV_MATH_FUNC, API_DNN}}, - + {"cudnnSoftmaxForward", {"hipdnnSoftmaxForward", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSoftmaxBackward", {"hipdnnSoftmaxBackward", CONV_MATH_FUNC, API_DNN}}, + {"cudnnCreatePoolingDescriptor", {"hipdnnCreatePoolingDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetPooling2dDescriptor", {"hipdnnSetPooling2dDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetPooling2dDescriptor", {"hipdnnGetPooling2dDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetPoolingNdDescriptor", {"hipdnnSetPoolingNdDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetPoolingNdDescriptor", {"hipdnnGetPoolingNdDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetPoolingNdForwardOutputDim", {"hipdnnGetPoolingNdForwardOutputDim", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetPooling2dForwardOutputDim", {"hipdnnGetPooling2dForwardOutputDim", CONV_MATH_FUNC, API_DNN}}, + {"cudnnDestroyPoolingDescriptor", {"hipdnnDestroyPoolingDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnPoolingForward", {"hipdnnPoolingForward", CONV_MATH_FUNC, API_DNN}}, + {"cudnnPoolingBackward", {"hipdnnPoolingBackward", CONV_MATH_FUNC, API_DNN}}, + {"cudnnCreateActivationDescriptor", {"hipdnnCreateActivationDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetActivationDescriptor", {"hipdnnSetActivationDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetActivationDescriptor", {"hipdnnGetActivationDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnDestroyActivationDescriptor", {"hipdnnDestroyActivationDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnActivationForward", {"hipdnnActivationForward", CONV_MATH_FUNC, API_DNN}}, + {"cudnnActivationBackward", {"hipdnnActivationBackward", CONV_MATH_FUNC, API_DNN}}, + {"cudnnCreateLRNDescriptor", {"hipdnnCreateLRNDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetLRNDescriptor", {"hipdnnSetLRNDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetLRNDescriptor", {"hipdnnGetLRNDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnDestroyLRNDescriptor", {"hipdnnDestroyLRNDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnLRNCrossChannelForward", {"hipdnnLRNCrossChannelForward", CONV_MATH_FUNC, API_DNN}}, + {"cudnnLRNCrossChannelBackward", {"hipdnnLRNCrossChannelBackward", CONV_MATH_FUNC, API_DNN}}, + {"cudnnDivisiveNormalizationForward", {"hipdnnDivisiveNormalizationForward", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnDivisiveNormalizationBackward", {"hipdnnDivisiveNormalizationBackward", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnDeriveBNTensorDescriptor", {"hipdnnDeriveBNTensorDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnBatchNormalizationForwardTraining", {"hipdnnBatchNormalizationForwardTraining", CONV_MATH_FUNC, API_DNN}}, + {"cudnnBatchNormalizationForwardInference", {"hipdnnBatchNormalizationForwardInference", CONV_MATH_FUNC, API_DNN}}, + {"cudnnBatchNormalizationBackward", {"hipdnnBatchNormalizationBackward", CONV_MATH_FUNC, API_DNN}}, + {"cudnnCreateSpatialTransformerDescriptor", {"hipdnnCreateSpatialTransformerDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSetSpatialTransformerNdDescriptor", {"hipdnnSetSpatialTransformerNdDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnDestroySpatialTransformerDescriptor", {"hipdnnDestroySpatialTransformerDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSpatialTfGridGeneratorForward", {"hipdnnSpatialTfGridGeneratorForward", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSpatialTfGridGeneratorBackward", {"hipdnnSpatialTfGridGeneratorBackward", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSpatialTfSamplerForward", {"hipdnnSpatialTfSamplerForward", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSpatialTfSamplerBackward", {"hipdnnSpatialTfSamplerBackward", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnCreateCTCLossDescriptor", {"hipdnnCreateCTCLossDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSetCTCLossDescriptor", {"hipdnnSetCTCLossDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetCTCLossDescriptor", {"hipdnnGetCTCLossDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnDestroyCTCLossDescriptor", {"hipdnnDestroyCTCLossDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnCTCLoss", {"hipdnnCTCLoss", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetCTCLossWorkspaceSize", {"hipdnnGetCTCLossWorkspaceSize", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnCreateAlgorithmDescriptor", {"hipdnnCreateAlgorithmDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSetAlgorithmDescriptor", {"hipdnnSetAlgorithmDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetAlgorithmDescriptor", {"hipdnnGetAlgorithmDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnCopyAlgorithmDescriptor", {"hipdnnCopyAlgorithmDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnDestroyAlgorithmDescriptor", {"hipdnnDestroyAlgorithmDescriptor", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnCreateAlgorithmPerformance", {"hipdnnCreateAlgorithmPerformance", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSetAlgorithmPerformance", {"hipdnnSetAlgorithmPerformance", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetAlgorithmPerformance", {"hipdnnGetAlgorithmPerformance", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnDestroyAlgorithmPerformance", {"hipdnnDestroyAlgorithmPerformance", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetAlgorithmSpaceSize", {"hipdnnGetAlgorithmSpaceSize", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSaveAlgorithm", {"hipdnnSaveAlgorithm", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnRestoreAlgorithm", {"hipdnnRestoreAlgorithm", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnSetCallback", {"hipdnnSetCallback", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetCallback", {"hipdnnGetCallback", CONV_MATH_FUNC, API_DNN, HIP_UNSUPPORTED}}, }; const std::map& CUDA_RENAMES_MAP() { diff --git a/include/hip/hcc_detail/device_functions.h b/include/hip/hcc_detail/device_functions.h index ca236a1125..843e80e8b4 100644 --- a/include/hip/hcc_detail/device_functions.h +++ b/include/hip/hcc_detail/device_functions.h @@ -88,6 +88,11 @@ __device__ static inline unsigned int __usad(unsigned int x, unsigned int y, uns return __hip_hc_ir_usad_int(x, y, z); } +extern __device__ __attribute__((const)) unsigned int __mbcnt_lo(unsigned int x, unsigned int y) __asm("llvm.amdgcn.mbcnt.lo"); +extern __device__ __attribute__((const)) unsigned int __mbcnt_hi(unsigned int x, unsigned int y) __asm("llvm.amdgcn.mbcnt.hi"); + +__device__ static inline unsigned int __lane_id() { return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); } + /* Rounding modes are not yet supported in HIP */ diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index 92f06e9174..1a6b0f7dda 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -29,6 +29,20 @@ THE SOFTWARE. #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H #define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H +#if defined(__HCC__) +#define __HCC_OR_HIP_CLANG__ 1 +#define __HCC_ONLY__ 1 +#define __HIP_CLANG_ONLY__ 0 +#elif defined(__clang__) && defined(__HIP__) +#define __HCC_OR_HIP_CLANG__ 1 +#define __HCC_ONLY__ 0 +#define __HIP_CLANG_ONLY__ 1 +#else +#define __HCC_OR_HIP_CLANG__ 0 +#define __HCC_ONLY__ 0 +#define __HIP_CLANG_ONLY__ 0 +#endif + //--- // Top part of file can be compiled with any compiler @@ -41,15 +55,16 @@ THE SOFTWARE. #include #endif //__cplusplus -#if __HCC__ +#if __HCC_OR_HIP_CLANG__ // Define NVCC_COMPAT for CUDA compatibility #define NVCC_COMPAT #define CUDA_SUCCESS hipSuccess #include +#endif // __HCC_OR_HIP_CLANG__ - +#if __HCC__ // define HIP_ENABLE_PRINTF to enable printf #ifdef HIP_ENABLE_PRINTF #define HCC_ENABLE_ACCELERATOR_PRINTF 1 @@ -164,6 +179,10 @@ extern int HIP_TRACE_API; #define __HCC_C__ #endif +#endif // defined __HCC__ + +#if __HCC_OR_HIP_CLANG__ + // TODO - hipify-clang - change to use the function call. //#define warpSize hc::__wavesize() static constexpr int warpSize = 64; @@ -371,6 +390,10 @@ __device__ void __threadfence_system(void); * @} */ +#endif // __HCC_OR_HIP_CLANG__ + +#if defined __HCC__ + template < typename std::common_type::type f> @@ -414,6 +437,8 @@ static constexpr Coordinates threadIdx; #define hipGridDim_y (hc_get_num_groups(1)) #define hipGridDim_z (hc_get_num_groups(2)) +#endif // defined __HCC__ +#if __HCC_OR_HIP_CLANG__ extern "C" __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size); extern "C" __device__ void* __hip_hc_memset(void* ptr, uint8_t val, size_t size); extern "C" __device__ void* __hip_hc_malloc(size_t); @@ -446,7 +471,9 @@ static inline __device__ void printf(const char* format, All... all) {} #endif #endif +#endif //__HCC_OR_HIP_CLANG__ +#ifdef __HCC__ #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE) @@ -514,7 +541,9 @@ extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, gri * @} */ - +// +// hip-clang functions +// #elif defined(__clang__) && defined(__HIP__) #define HIP_KERNEL_NAME(...) __VA_ARGS__ @@ -612,6 +641,27 @@ extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim; #define hipGridDim_y gridDim.y #define hipGridDim_z gridDim.z +#pragma push_macro("__DEVICE__") +#define __DEVICE__ extern "C" __device__ __attribute__((always_inline)) \ + __attribute__((weak)) + +__DEVICE__ void __device_trap() __asm("llvm.trap"); + +__DEVICE__ void inline __assert_fail(const char * __assertion, + const char *__file, + unsigned int __line, + const char *__function) +{ + // Ignore all the args for now. + __device_trap(); +} + +extern "C" __device__ __attribute__((noduplicate)) void __syncthreads(); + +#pragma push_macro("__DEVICE__") + +#include + #endif #endif // HIP_HCC_DETAIL_RUNTIME_H diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index 8becab3c9c..0a80a583c7 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -94,6 +94,19 @@ typedef struct ihipModule_t* hipModule_t; typedef struct ihipModuleSymbol_t* hipFunction_t; +struct hipFuncAttributes { + int binaryVersion; + int cacheModeCA; + size_t constSizeBytes; + size_t localSizeBytes; + int maxDynamicSharedSizeBytes; + int maxThreadsPerBlock; + int numRegs; + int preferredShmemCarveout; + int ptxVersion; + size_t sharedSizeBytes; +}; + typedef struct ihipEvent_t* hipEvent_t; enum hipLimit_t { @@ -1450,6 +1463,27 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, size_t width, size_t height,hipStream_t stream __dparm(0)); +/** + * @brief Fills synchronously the memory area pointed to by pitchedDevPtr with the constant value. + * + * @param[in] pitchedDevPtr + * @param[in] value - constant value to be set + * @param[in] extent + * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree + */ +hipError_t hipMemset3D(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent ); + +/** + * @brief Fills asynchronously the memory area pointed to by pitchedDevPtr with the constant value. + * + * @param[in] pitchedDevPtr + * @param[in] value - constant value to be set + * @param[in] extent + * @param[in] stream + * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree + */ +hipError_t hipMemset3DAsync(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent ,hipStream_t stream __dparm(0)); + /** * @brief Query memory info. * Return snapshot of free memory, and total allocatable memory on the device. @@ -2222,6 +2256,17 @@ hipError_t hipModuleUnload(hipModule_t module); */ hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule_t module, const char* kname); +/** + * @bried Find out attributes for a given function. + * + * @param [out] attr + * @param [in] func + * + * @returns hipSuccess, hipErrorInvalidDeviceFunction + */ + +hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func); + /** * @brief returns device memory pointer and size of the kernel present in the module with symbol @p * name diff --git a/include/hip/hcc_detail/math_functions.h b/include/hip/hcc_detail/math_functions.h index c11112b1dc..49650987ac 100644 --- a/include/hip/hcc_detail/math_functions.h +++ b/include/hip/hcc_detail/math_functions.h @@ -27,6 +27,7 @@ THE SOFTWARE. #include #include +#include #include __device__ @@ -101,6 +102,9 @@ uint64_t __make_mantissa(const char* tagp) // BEGIN FLOAT __device__ inline +float abs(float x) { return __ocml_fabs_f32(x); } +__device__ +inline float acosf(float x) { return __ocml_acos_f32(x); } __device__ inline @@ -628,6 +632,9 @@ float __tanf(float x) { return __ocml_tan_f32(x); } // BEGIN DOUBLE __device__ inline +double abs(double x) { return __ocml_fabs_f64(x); } +__device__ +inline double acos(double x) { return __ocml_acos_f64(x); } __device__ inline @@ -1101,4 +1108,37 @@ double __fma_rz(double x, double y, double z) return __llvm_fma_rtz_f64(x, y, z); } // END INTRINSICS -// END DOUBLE \ No newline at end of file +// END DOUBLE + +// BEGIN INTEGER +__device__ +inline +int abs(int x) +{ + int sgn = x >> (sizeof(int) * CHAR_BIT - 1); + return (x ^ sgn) - sgn; +} +__device__ +inline +long labs(long x) +{ + long sgn = x >> (sizeof(long) * CHAR_BIT - 1); + return (x ^ sgn) - sgn; +} +__device__ +inline +long long llabs(long long x) +{ + long long sgn = x >> (sizeof(long long) * CHAR_BIT - 1); + return (x ^ sgn) - sgn; +} + +#if defined(__cplusplus) + __device__ + inline + long abs(long x) { return labs(x); } + __device__ + inline + long long abs(long long x) { return llabs(x); } +#endif +// END INTEGER diff --git a/include/hip/hcc_detail/program_state.hpp b/include/hip/hcc_detail/program_state.hpp index b6ccafb205..ac689fdb89 100644 --- a/include/hip/hcc_detail/program_state.hpp +++ b/include/hip/hcc_detail/program_state.hpp @@ -22,8 +22,10 @@ THE SOFTWARE. #pragma once +#include #include #include +#include #include #include @@ -46,11 +48,45 @@ struct hash { inline constexpr bool operator==(hsa_agent_t x, hsa_agent_t y) { return x.handle == y.handle; } namespace hip_impl { -struct Kernel_descriptor { - std::uint64_t kernel_object_; - std::uint32_t group_size_; - std::uint32_t private_size_; - std::string name_; +class Kernel_descriptor { + std::uint64_t kernel_object_{}; + amd_kernel_code_t const* kernel_header_{nullptr}; + std::string name_{}; +public: + Kernel_descriptor() = default; + Kernel_descriptor(std::uint64_t kernel_object, const std::string& name) + : kernel_object_{kernel_object}, name_{name} + { + bool supported{false}; + std::uint16_t min_v{UINT16_MAX}; + auto r = hsa_system_major_extension_supported( + HSA_EXTENSION_AMD_LOADER, 1, &min_v, &supported); + + if (r != HSA_STATUS_SUCCESS || !supported) return; + + hsa_ven_amd_loader_1_01_pfn_t tbl{}; + + r = hsa_system_get_major_extension_table( + HSA_EXTENSION_AMD_LOADER, + 1, + sizeof(tbl), + reinterpret_cast(&tbl)); + + if (r != HSA_STATUS_SUCCESS) return; + if (!tbl.hsa_ven_amd_loader_query_host_address) return; + + r = tbl.hsa_ven_amd_loader_query_host_address( + reinterpret_cast(kernel_object_), + reinterpret_cast(&kernel_header_)); + + if (r != HSA_STATUS_SUCCESS) return; + } + Kernel_descriptor(const Kernel_descriptor&) = default; + Kernel_descriptor(Kernel_descriptor&&) = default; + ~Kernel_descriptor() = default; + + Kernel_descriptor& operator=(const Kernel_descriptor&) = default; + Kernel_descriptor& operator=(Kernel_descriptor&&) = default; operator hipFunction_t() const { // TODO: this is awful and only meant for illustration. return reinterpret_cast(const_cast(this)); diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h index bfde2e942d..2b36f3e140 100644 --- a/include/hip/hip_runtime_api.h +++ b/include/hip/hip_runtime_api.h @@ -243,6 +243,8 @@ typedef enum __HIP_NODISCARD hipError_t { 1062, ///< Produced when trying to unlock a non-page-locked memory. hipErrorMapBufferObjectFailed = 1071, ///< Produced when the IPC memory attach failed from ROCr. + hipErrorAssert = + 1081, ///< Produced when the kernel calls assert. hipErrorTbd ///< Marker that more error codes are needed. } hipError_t; diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index 6f222648a2..fbff263295 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -150,16 +150,20 @@ typedef CUfunction hipFunction_t; typedef CUdeviceptr hipDeviceptr_t; typedef struct cudaArray hipArray; typedef struct cudaArray* hipArray_const_t; +typedef cudaFuncAttributes hipFuncAttributes; #define hipMemcpy3DParms cudaMemcpy3DParms #define hipArrayDefault cudaArrayDefault typedef cudaTextureObject_t hipTextureObject_t; typedef cudaSurfaceObject_t hipSurfaceObject_t; +#define hipTextureType1D cudaTextureType1D +#define hipTextureType1DLayered cudaTextureType1DLayered #define hipTextureType2D cudaTextureType2D #define hipTextureType3D cudaTextureType3D #define hipDeviceMapHost cudaDeviceMapHost #define hipExtent cudaExtent +#define hipPitchedPtr cudaPitchedPtr #define make_hipExtent make_cudaExtent #define make_hipPos make_cudaPos #define make_hipPitchedPtr make_cudaPitchedPtr @@ -381,6 +385,10 @@ inline static hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, return hipCUDAErrorTohipError(cudaMallocPitch(ptr, pitch, width, height)); } +inline static hipError_t hipMalloc3D(hipPitchedPtr* pitchedDevPtr, hipExtent extent) { + return hipCUDAErrorTohipError(cudaMalloc3D(pitchedDevPtr, extent)); +} + inline static hipError_t hipFree(void* ptr) { return hipCUDAErrorTohipError(cudaFree(ptr)); } inline static hipError_t hipMallocHost(void** ptr, size_t size) @@ -649,6 +657,14 @@ inline static hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, si return hipCUDAErrorTohipError(cudaMemset2DAsync(dst, pitch, value, width, height, stream)); } +inline static hipError_t hipMemset3D(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent ){ + return hipCUDAErrorTohipError(cudaMemset3D(pitchedDevPtr, value, extent)); +} + +inline static hipError_t hipMemset3DAsync(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent, hipStream_t stream __dparm(0) ){ + return hipCUDAErrorTohipError(cudaMemset3DAsync(pitchedDevPtr, value, extent, stream)); +} + inline static hipError_t hipGetDeviceProperties(hipDeviceProp_t* p_prop, int device) { struct cudaDeviceProp cdprop; cudaError_t cerror; @@ -1092,6 +1108,10 @@ inline static hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule return hipCUResultTohipError(cuModuleGetFunction(function, module, kname)); } +inline static hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func) { + return hipCUDAErrorTohipError(cudaFuncGetAttributes(attr, func)); +} + inline static hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod, const char* name) { return hipCUResultTohipError(cuModuleGetGlobal(dptr, bytes, hmod, name)); @@ -1148,8 +1168,8 @@ inline static hipError_t hipBindTexture(size_t* offset, const struct texture -inline static hipError_t hipBindTexture(size_t* offset, struct texture* tex, - const void* devPtr, const struct hipChannelFormatDesc* desc, +inline static hipError_t hipBindTexture(size_t* offset, struct texture& tex, + const void* devPtr, const struct hipChannelFormatDesc& desc, size_t size = UINT_MAX) { return hipCUDAErrorTohipError(cudaBindTexture(offset, tex, devPtr, desc, size)); } @@ -1159,6 +1179,11 @@ inline static hipError_t hipUnbindTexture(struct texture* tex) return hipCUDAErrorTohipError(cudaUnbindTexture(tex)); } +inline static hipError_t hipBindTexture(size_t* offset, textureReference* tex, const void* devPtr, + const hipChannelFormatDesc* desc, size_t size = UINT_MAX){ + return hipCUDAErrorTohipError(cudaBindTexture(offset, tex, devPtr, desc, size)); +} + template inline static hipError_t hipBindTextureToArray(struct texture& tex, hipArray_const_t array, diff --git a/lpl_ca/ca.hpp b/lpl_ca/ca.hpp index bb1963bede..0ef8458c20 100644 --- a/lpl_ca/ca.hpp +++ b/lpl_ca/ca.hpp @@ -23,7 +23,7 @@ inline clara::Parser cmdline_parser(bool& help, std::vector& inputs "https://reviews.llvm.org/D13909; " "the code object format is documented at: " "https://www.llvm.org/docs/AMDGPUUsage.html#code-object.") | - clara::Opt{targets, "gfx803,gfx900 etc."}["-t"]["--targets"]( + clara::Opt{targets, "gfx803,gfx900,gfx906 etc."}["-t"]["--targets"]( "targets for which code objects are to be extracted from " "the fat binary; must be included in the set of processors " "with ROCm support from " @@ -76,4 +76,4 @@ inline void validate_inputs(const std::vector& inputs) { throw std::runtime_error{"Non existent file " + *it + " passed as input."}; } } -} // namespace hip_impl \ No newline at end of file +} // namespace hip_impl diff --git a/lpl_ca/common.hpp b/lpl_ca/common.hpp index 74f4db9221..7652f08c46 100644 --- a/lpl_ca/common.hpp +++ b/lpl_ca/common.hpp @@ -12,7 +12,7 @@ namespace hip_impl { inline const std::unordered_set& amdgpu_targets() { // The evolving list lives at: // https://www.llvm.org/docs/AMDGPUUsage.html#processors. static const std::unordered_set r{"gfx701", "gfx801", "gfx802", "gfx803", - "gfx900"}; + "gfx900", "gfx906"}; return r; } @@ -77,4 +77,4 @@ inline void validate_targets(const std::vector& x) { } } } -} // Namespace hip_impl. \ No newline at end of file +} // Namespace hip_impl. diff --git a/lpl_ca/lpl.hpp b/lpl_ca/lpl.hpp index c9346fd793..cbd7fe8386 100644 --- a/lpl_ca/lpl.hpp +++ b/lpl_ca/lpl.hpp @@ -132,9 +132,9 @@ inline clara::Parser cmdline_parser(bool& help, std::vector& source "file is documented at: https://reviews.llvm.org/D13909.") | clara::Arg{sources, "a.cpp b.cpp etc."}("inputs for compilation; must contain valid C++ code.") | - clara::Opt{targets, "gfx803,gfx900 etc."}["-t"]["--targets"]( + clara::Opt{targets, "gfx803,gfx900,gfx906 etc."}["-t"]["--targets"]( "targets for AMDGPU lowering; must be included in the set " "of processors with ROCm support from " "https://www.llvm.org/docs/AMDGPUUsage.html#processors."); } -} // namespace hip_impl \ No newline at end of file +} // namespace hip_impl diff --git a/samples/1_Utils/hipCommander/Makefile b/samples/1_Utils/hipCommander/Makefile index a411763b7f..c3a19073f0 100644 --- a/samples/1_Utils/hipCommander/Makefile +++ b/samples/1_Utils/hipCommander/Makefile @@ -19,7 +19,7 @@ $(EXE): hipCommander.cpp $(HIPCC) $(CXXFLAGS) $^ -o $@ nullkernel.hsaco : nullkernel.hip.cpp - $(HIPCC) --genco nullkernel.hip -o nullkernel.hsaco + $(HIPCC) --genco nullkernel.hip.cpp -o nullkernel.hsaco install: $(EXE) diff --git a/src/device_functions.cpp b/src/device_functions.cpp index 1af66f97e5..8ef19bab3f 100644 --- a/src/device_functions.cpp +++ b/src/device_functions.cpp @@ -278,21 +278,6 @@ struct uchar2Holder { }; } __attribute__((aligned(8))); -struct intHolder { - union { - signed int si[2]; - signed int long sl; - }; -} __attribute__((aligned(8))); - -struct uintHolder { - union { - signed int ui[2]; - signed int long ul; - }; -} __attribute__((aligned(8))); - - __device__ unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) { struct uchar2Holder cHoldVal; struct ucharHolder cHoldKey; @@ -308,21 +293,29 @@ __device__ unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int } __device__ long long __mul64hi(long long int x, long long int y) { - struct intHolder iHold1; - struct intHolder iHold2; - iHold1.sl = x; - iHold2.sl = y; - iHold1.sl = iHold1.si[1] * iHold2.si[1]; - return iHold1.sl; + ulong x0 = (ulong)x & 0xffffffffUL; + long x1 = x >> 32; + ulong y0 = (ulong)y & 0xffffffffUL; + long y1 = y >> 32; + ulong z0 = x0*y0; + long t = x1*y0 + (z0 >> 32); + long z1 = t & 0xffffffffL; + long z2 = t >> 32; + z1 = x0*y1 + z1; + return x1*y1 + z2 + (z1 >> 32); } __device__ unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) { - struct uintHolder uHold1; - struct uintHolder uHold2; - uHold1.ul = x; - uHold2.ul = y; - uHold1.ul = uHold1.ui[1] * uHold2.ui[1]; - return uHold1.ul; + ulong x0 = x & 0xffffffffUL; + ulong x1 = x >> 32; + ulong y0 = y & 0xffffffffUL; + ulong y1 = y >> 32; + ulong z0 = x0*y0; + ulong t = x1*y0 + (z0 >> 32); + ulong z1 = t & 0xffffffffUL; + ulong z2 = t >> 32; + z1 = x0*y1 + z1; + return x1*y1 + z2 + (z1 >> 32); } /* diff --git a/src/functional_grid_launch.inl b/src/functional_grid_launch.inl index b555967ebc..9ecad51476 100644 --- a/src/functional_grid_launch.inl +++ b/src/functional_grid_launch.inl @@ -107,8 +107,8 @@ namespace hip_impl it0->second.cbegin(), it0->second.cend(), [=](const pair& x) { - return x.first.handle == agent.handle; - }); + return x.first == agent; + }); if (it1 == it0->second.cend()) { throw runtime_error{ diff --git a/src/hip_clang.cpp b/src/hip_clang.cpp index 80b6111fc2..f7573e2819 100644 --- a/src/hip_clang.cpp +++ b/src/hip_clang.cpp @@ -27,10 +27,10 @@ THE SOFTWARE. #include "hip_hcc_internal.h" #include "trace_helper.h" -constexpr unsigned __cudaFatMAGIC2 = 0x466243b1; +constexpr unsigned __hipFatMAGIC2 = 0x48495046; // "HIPF" #define CLANG_OFFLOAD_BUNDLER_MAGIC "__CLANG_OFFLOAD_BUNDLE__" -#define AMDGCN_AMDHSA_TRIPLE "openmp-amdgcn--amdhsa" +#define AMDGCN_AMDHSA_TRIPLE "hip-amdgcn-amd-amdhsa" struct __ClangOffloadBundleDesc { uint64_t offset; @@ -59,7 +59,7 @@ __hipRegisterFatBinary(const void* data) HIP_INIT(); const __CudaFatBinaryWrapper* fbwrapper = reinterpret_cast(data); - if (fbwrapper->magic != __cudaFatMAGIC2 || fbwrapper->version != 1) { + if (fbwrapper->magic != __hipFatMAGIC2 || fbwrapper->version != 1) { return nullptr; } diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 48a8da81ca..9b6758ddf4 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -1132,15 +1132,19 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes, kind); + hipError_t e = hipSuccess; + + // Return success if number of bytes to copy is 0 + if (sizeBytes == 0) return ihipLogStatus(e); + hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); hc::completion_future marker; - hipError_t e = hipSuccess; if(dst==NULL || src==NULL) { e=hipErrorInvalidValue; - return e; + return ihipLogStatus(e); } try { stream->locked_copySync(dst, src, sizeBytes, kind); @@ -1208,7 +1212,6 @@ hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeByte return ihipLogStatus(e); } - hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes) { HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes); @@ -1227,7 +1230,6 @@ hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes) { return ihipLogStatus(e); } - hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) { HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes, kind, stream); @@ -1235,7 +1237,6 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp return ihipLogStatus(hip_internal::memcpyAsync(dst, src, sizeBytes, kind, stream)); } - hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dst, void* src, size_t sizeBytes, hipStream_t stream) { HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes, stream); @@ -1258,65 +1259,6 @@ hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, h hip_internal::memcpyAsync(dst, src, sizeBytes, hipMemcpyDeviceToHost, stream)); } -// TODO - review and optimize -hipError_t ihipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, - size_t height, hipMemcpyKind kind) { - if (width > dpitch || width > spitch) return hipErrorUnknown; - - hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); - - hc::completion_future marker; - - hipError_t e = hipSuccess; - - try { - for (int i = 0; i < height; ++i) { - stream->locked_copySync((unsigned char*)dst + i * dpitch, - (unsigned char*)src + i * spitch, width, kind); - } - } catch (ihipException& ex) { - e = ex._code; - } - - return e; -} - -hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, - size_t height, hipMemcpyKind kind) { - HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, dpitch, src, spitch, width, height, kind); - hipError_t e = hipSuccess; - e = ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind); - return ihipLogStatus(e); -} - -hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) { - HIP_INIT_SPECIAL_API((TRACE_MCMD), pCopy); - hipError_t e = hipSuccess; - if (pCopy == nullptr) { - e = hipErrorInvalidValue; - } - e = ihipMemcpy2D(pCopy->dstArray->data, pCopy->widthInBytes, pCopy->srcHost, pCopy->srcPitch, - pCopy->widthInBytes, pCopy->height, hipMemcpyDefault); - return ihipLogStatus(e); -} - -hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, - size_t height, hipMemcpyKind kind, hipStream_t stream) { - HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, dpitch, src, spitch, width, height, kind, stream); - if (width > dpitch || width > spitch) return ihipLogStatus(hipErrorUnknown); - hipError_t e = hipSuccess; - try { - for (int i = 0; i < height; ++i) { - e = hip_internal::memcpyAsync((unsigned char*)dst + i * dpitch, - (unsigned char*)src + i * spitch, width, kind, stream); - } - } catch (ihipException& ex) { - e = ex._code; - } - - return ihipLogStatus(e); -} - hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind) { HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, wOffset, hOffset, src, spitch, width, height, kind); @@ -1547,6 +1489,24 @@ inline const T& clamp_integer(const T& x, const T& lower, const T& upper) { return std::min(upper, std::max(x, lower)); } + +template +__global__ void hip_copy2d_n(T* dst, const T* src, size_t width, size_t height, size_t destPitch, size_t srcPitch) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t idy = blockIdx.y * blockDim.y + threadIdx.y; + size_t floorWidth = (width/sizeof(T)); + T *dstPtr = (T *)((uint8_t*) dst + idy * destPitch); + T *srcPtr = (T *)((uint8_t*) src + idy * srcPitch); + if((idx < floorWidth) && (idy < height)){ + dstPtr[idx] = srcPtr[idx]; + } else if((idx < width) && (idy < height)){ + size_t bytesToCopy = width - (floorWidth * sizeof(T)); + dstPtr += floorWidth; + srcPtr += floorWidth; + __builtin_memcpy(reinterpret_cast(dstPtr), reinterpret_cast(srcPtr),bytesToCopy); + } +} } // namespace template @@ -1559,6 +1519,16 @@ void ihipMemsetKernel(hipStream_t stream, T* ptr, T val, size_t sizeBytes) { sizeBytes, std::move(val)); } +template +void ihipMemcpy2dKernel(hipStream_t stream, T* dst, const T* src, size_t width, size_t height, size_t destPitch, size_t srcPitch) { + size_t threadsPerBlock_x = 64; + size_t threadsPerBlock_y = 4; + uint32_t grid_dim_x = clamp_integer( (width+(threadsPerBlock_x*sizeof(T)-1)) / (threadsPerBlock_x*sizeof(T)), 1, UINT32_MAX); + uint32_t grid_dim_y = clamp_integer( (height+(threadsPerBlock_y-1)) / threadsPerBlock_y, 1, UINT32_MAX); + hipLaunchKernelGGL(hip_copy2d_n, dim3(grid_dim_x,grid_dim_y), dim3(threadsPerBlock_x,threadsPerBlock_y), 0u, stream, dst, src, + width, height, destPitch, srcPitch); +} + typedef enum ihipMemsetDataType { ihipMemsetDataTypeChar = 0, ihipMemsetDataTypeShort = 1, @@ -1616,6 +1586,108 @@ hipError_t ihipMemset(void* dst, int value, size_t sizeBytes, hipStream_t strea return e; }; +int isLockedPointer(const void *ptr) +{ + hsa_amd_pointer_info_t info; + int isLocked = 0; + + info.size = sizeof(info); + hsa_status_t hsa_status = hsa_amd_pointer_info(const_cast(ptr), &info, nullptr, nullptr, nullptr); + if(hsa_status != HSA_STATUS_SUCCESS) { + return -1; + } + + if((info.type == HSA_EXT_POINTER_TYPE_HSA) || (info.type == HSA_EXT_POINTER_TYPE_LOCKED)) { + isLocked = 1; + } + + return isLocked; +} + +// TODO - review and optimize +hipError_t ihipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, + size_t height, hipMemcpyKind kind) { + if (dst == nullptr || src == nullptr || width > dpitch || width > spitch) return hipErrorInvalidValue; + + hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); + int isLocked = 0; + if(kind == hipMemcpyHostToDevice) { + isLocked = isLockedPointer(src); + } else if(kind == hipMemcpyDeviceToHost) { + isLocked = isLockedPointer(dst); + } + + hc::completion_future marker; + + hipError_t e = hipSuccess; + if((width == dpitch) && (width == spitch)) { + stream->locked_copySync((void*)dst, (void*)src, width*height, kind, false); + } else { + try { + if(isLocked) { + for (int i = 0; i < height; ++i) + stream->locked_copySync((unsigned char*)dst + i * dpitch, + (unsigned char*)src + i * spitch, width, kind); + } else { + ihipMemcpy2dKernel (stream, static_cast (dst), static_cast (src), width, height, dpitch, spitch); + stream->locked_wait(); + } + } catch (ihipException& ex) { + e = ex._code; + } + } + + return e; +} + +hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, + size_t height, hipMemcpyKind kind) { + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, dpitch, src, spitch, width, height, kind); + hipError_t e = hipSuccess; + e = ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind); + return ihipLogStatus(e); +} + +hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, + size_t height, hipMemcpyKind kind, hipStream_t stream) { + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, dpitch, src, spitch, width, height, kind, stream); + if (dst == nullptr || src == nullptr || width > dpitch || width > spitch) return ihipLogStatus(hipErrorInvalidValue); + hipError_t e = hipSuccess; + int isLocked = 0; + if(kind == hipMemcpyHostToDevice) { + isLocked = isLockedPointer(src); + } else if(kind == hipMemcpyDeviceToHost) { + isLocked = isLockedPointer(dst); + } + if((width == dpitch) && (width == spitch)) { + hip_internal::memcpyAsync(dst, src, width*height, kind, stream); + } else { + try { + if(!isLocked){ + for (int i = 0; i < height; ++i) + e = hip_internal::memcpyAsync((unsigned char*)dst + i * dpitch, + (unsigned char*)src + i * spitch, width, kind, stream); + } else{ + ihipMemcpy2dKernel (stream, static_cast (dst), static_cast (src), width, height, dpitch, spitch); + } + } catch (ihipException& ex) { + e = ex._code; + } + } + + return ihipLogStatus(e); +} + +hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) { + HIP_INIT_SPECIAL_API((TRACE_MCMD), pCopy); + hipError_t e = hipSuccess; + if (pCopy == nullptr) { + e = hipErrorInvalidValue; + } + e = ihipMemcpy2D(pCopy->dstArray->data, pCopy->widthInBytes, pCopy->srcHost, pCopy->srcPitch, + pCopy->widthInBytes, pCopy->height, hipMemcpyDefault); + return ihipLogStatus(e); +} // TODO-sync: function is async unless target is pinned host memory - then these are fully sync. hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t stream) { @@ -1698,6 +1770,42 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes return ihipLogStatus(e); } +hipError_t hipMemset3D(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent ) +{ + HIP_INIT_SPECIAL_API((TRACE_MCMD), &pitchedDevPtr, value, &extent); + hipError_t e = hipSuccess; + + hipStream_t stream = hipStreamNull; + // TODO - call an ihip memset so HIP_TRACE is correct. + stream = ihipSyncAndResolveStream(stream); + if (stream) { + size_t sizeBytes = pitchedDevPtr.pitch * extent.height * extent.depth; + e = ihipMemset(pitchedDevPtr.ptr, value, sizeBytes, stream, ihipMemsetDataTypeChar); + stream->locked_wait(); + } else { + e = hipErrorInvalidValue; + } + + return ihipLogStatus(e); +} + +hipError_t hipMemset3DAsync(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent ,hipStream_t stream ) +{ + HIP_INIT_SPECIAL_API((TRACE_MCMD), &pitchedDevPtr, value, &extent); + hipError_t e = hipSuccess; + + // TODO - call an ihip memset so HIP_TRACE is correct. + stream = ihipSyncAndResolveStream(stream); + if (stream) { + size_t sizeBytes = pitchedDevPtr.pitch * extent.height * extent.depth; + e = ihipMemset(pitchedDevPtr.ptr, value, sizeBytes, stream, ihipMemsetDataTypeChar); + } else { + e = hipErrorInvalidValue; + } + + return ihipLogStatus(e); +} + hipError_t hipMemGetInfo(size_t* free, size_t* total) { HIP_INIT_API(free, total); diff --git a/src/hip_module.cpp b/src/hip_module.cpp index 84451a17a0..a6d486b6de 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -27,10 +27,11 @@ THE SOFTWARE. #include "hsa_helpers.hpp" #include "trace_helper.h" +#include #include #include -#include +#include #include #include #include @@ -71,9 +72,8 @@ struct ihipKernArgInfo { map kernelArguments; struct ihipModuleSymbol_t { - uint64_t _object; // The kernel object. - uint32_t _groupSegmentSize; - uint32_t _privateSegmentSize; + uint64_t _object{}; // The kernel object. + amd_kernel_code_t const* _header{}; string _name; // TODO - review for performance cost. Name is just used for debug. }; @@ -179,8 +179,10 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, aql.grid_size_x = globalWorkSizeX; aql.grid_size_y = globalWorkSizeY; aql.grid_size_z = globalWorkSizeZ; - aql.group_segment_size = f->_groupSegmentSize + sharedMemBytes; - aql.private_segment_size = f->_privateSegmentSize; + aql.group_segment_size = + f->_header->workgroup_group_segment_byte_size + sharedMemBytes; + aql.private_segment_size = + f->_header->workitem_private_segment_byte_size; aql.kernel_object = f->_object; aql.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; aql.header = @@ -444,10 +446,10 @@ hipError_t ihipModuleGetFunction(hipFunction_t* func, hipModule_t hmod, const ch if (kernel.handle == 0u) return hipErrorNotFound; - (*func)->_object = kernel_object(kernel); - (*func)->_groupSegmentSize = group_size(kernel); - (*func)->_privateSegmentSize = private_size(kernel); - (*func)->_name = name; + // TODO: refactor the whole ihipThisThat, which is a mess and yields the + // below, due to hipFunction_t being a pointer to ihipModuleSymbol_t. + func[0][0] = *static_cast( + Kernel_descriptor{kernel_object(kernel), name}); return hipSuccess; } @@ -471,6 +473,65 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t h return ihipLogStatus(r); } +namespace +{ + inline + hipFuncAttributes make_function_attributes(const amd_kernel_code_t& header) + { + hipFuncAttributes r{}; + + hipDeviceProp_t prop{}; + hipGetDeviceProperties( + &prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId); + // TODO: at the moment there is no way to query the count of registers + // available per CU, therefore we hardcode it to 64 KiRegisters. + prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024; + + r.localSizeBytes = header.workitem_private_segment_byte_size; + r.sharedSizeBytes = header.workgroup_group_segment_byte_size; + r.maxDynamicSharedSizeBytes = + prop.sharedMemPerBlock - r.sharedSizeBytes; + r.numRegs = header.workitem_vgpr_count; + r.maxThreadsPerBlock = r.numRegs ? + std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) : + prop.maxThreadsPerBlock; + r.binaryVersion = + header.amd_machine_version_major * 10 + + header.amd_machine_version_minor; + r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0. + + return r; + } +} + +hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func) +{ + if (!attr) return hipErrorInvalidValue; + if (!func) return hipErrorInvalidDeviceFunction; + + const auto it0 = functions().find(reinterpret_cast(func)); + + if (it0 == functions().cend()) return hipErrorInvalidDeviceFunction; + + auto agent = this_agent(); + const auto it1 = find_if( + it0->second.cbegin(), + it0->second.cend(), + [=](const pair& x) { + return x.first == agent; + }); + + if (it1 == it0->second.cend()) return hipErrorInvalidDeviceFunction; + + const auto header = static_cast(it1->second)->_header; + + if (!header) throw runtime_error{"Ill-formed Kernel_descriptor."}; + + *attr = make_function_attributes(*header); + + return hipSuccess; +} + hipError_t ihipModuleLoadData(hipModule_t* module, const void* image) { if (!module) return hipErrorInvalidValue; @@ -487,7 +548,7 @@ hipError_t ihipModuleLoadData(hipModule_t* module, const void* image) { (*module)->executable = hip_impl::load_executable( tmp.empty() ? read_elf_file_as_string(image) : tmp, (*module)->executable, this_agent()); - + return (*module)->executable.handle ? hipSuccess : hipErrorUnknown; } diff --git a/src/program_state.cpp b/src/program_state.cpp index 0c51fe4694..c4478bec2f 100644 --- a/src/program_state.cpp +++ b/src/program_state.cpp @@ -382,8 +382,7 @@ const unordered_map>>& fu for (auto&& kernel_symbol : it->second) { r[function.first].emplace_back( agent(kernel_symbol), - Kernel_descriptor{kernel_object(kernel_symbol), group_size(kernel_symbol), - private_size(kernel_symbol), it->first}); + Kernel_descriptor{kernel_object(kernel_symbol), it->first}); } } } diff --git a/tests/src/deviceLib/hipMathFunctions.cpp b/tests/src/deviceLib/hipMathFunctions.cpp index 7fe0003672..f89bdae149 100644 --- a/tests/src/deviceLib/hipMathFunctions.cpp +++ b/tests/src/deviceLib/hipMathFunctions.cpp @@ -21,7 +21,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../test_common.cpp + * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc * RUN: %t * HIT_END */ @@ -29,7 +29,7 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" -#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ +#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ __global__ void kernel_abs_int64(hipLaunchParm lp, long long *input, long long *output) { int tx = threadIdx.x; diff --git a/tests/src/deviceLib/hipTestHalf.cpp b/tests/src/deviceLib/hipTestHalf.cpp index 4a63260bf7..5a2aac2b29 100644 --- a/tests/src/deviceLib/hipTestHalf.cpp +++ b/tests/src/deviceLib/hipTestHalf.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../test_common.cpp + * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc * RUN: %t * HIT_END */ @@ -32,7 +32,7 @@ THE SOFTWARE. #define HALF_SIZE 64 * sizeof(__half) #define HALF2_SIZE 64 * sizeof(__half2) -#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ +#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ __global__ void __halfMath(hipLaunchParm lp, __half* A, __half* B, __half* C) { int tx = threadIdx.x; diff --git a/tests/src/deviceLib/hip_mbcnt.cpp b/tests/src/deviceLib/hip_mbcnt.cpp new file mode 100644 index 0000000000..0dd7169f51 --- /dev/null +++ b/tests/src/deviceLib/hip_mbcnt.cpp @@ -0,0 +1,128 @@ +/* +Copyright (c) 2015-2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * RUN: %t + * HIT_END + */ + +#include +#include +#include +#include +#include +#include +#include + +#define HIP_ASSERT(x) (assert((x) == hipSuccess)) + +__global__ void HIP_kernel(hipLaunchParm lp, unsigned int* mbcnt_lo, unsigned int* mbcnt_hi, unsigned int* lane_id) { + int x = blockDim.x * blockIdx.x + threadIdx.x; + mbcnt_lo[x] = __mbcnt_lo(0xFFFFFFFF, 0); + mbcnt_hi[x] = __mbcnt_hi(0xFFFFFFFF, 0); + lane_id[x] = __lane_id(); +} + +using namespace std; + +int main() { + + unsigned int* device_mbcnt_lo; + unsigned int* device_mbcnt_hi; + unsigned int* device_lane_id; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + cout << " System minor " << devProp.minor << endl; + cout << " System major " << devProp.major << endl; + cout << " agent prop name " << devProp.name << endl; + + cout << "hip Device prop succeeded " << endl; + + constexpr unsigned int wave_size = 64; + constexpr unsigned int num_waves_per_block = 2; + constexpr unsigned int num_threads_per_block = wave_size * num_waves_per_block; + constexpr unsigned int num_blocks = 2; + constexpr unsigned int num_threads = num_threads_per_block * num_blocks; + constexpr size_t buffer_size = num_threads * sizeof(unsigned int); + + HIP_ASSERT(hipMalloc((void**)&device_mbcnt_lo, buffer_size)); + HIP_ASSERT(hipMalloc((void**)&device_mbcnt_hi, buffer_size)); + HIP_ASSERT(hipMalloc((void**)&device_lane_id, buffer_size)); + + hipLaunchKernel(HIP_kernel, dim3(num_blocks), + dim3(num_threads_per_block), 0, 0, device_mbcnt_lo, device_mbcnt_hi, device_lane_id); + + unsigned int* host_mbcnt_lo = (unsigned int*) malloc(buffer_size); + unsigned int* host_mbcnt_hi = (unsigned int*) malloc(buffer_size); + unsigned int* host_lane_id = (unsigned int*) malloc(buffer_size); + + HIP_ASSERT(hipMemcpy(host_mbcnt_lo, device_mbcnt_lo, buffer_size, hipMemcpyDeviceToHost)); + HIP_ASSERT(hipMemcpy(host_mbcnt_hi, device_mbcnt_hi, buffer_size, hipMemcpyDeviceToHost)); + HIP_ASSERT(hipMemcpy(host_lane_id, device_lane_id, buffer_size, hipMemcpyDeviceToHost)); + + // verify the results + int mbcnt_lo_errors = 0; + int mbcnt_hi_errors = 0; + int lane_id_errors = 0; + for (unsigned int i = 0; i < num_threads; i++) { + unsigned int this_lane_id = i % wave_size; + unsigned int this_mbcnt_lo = this_lane_id >= 32 ? 32 : this_lane_id; + unsigned int this_mbcnt_hi = this_lane_id < 32 ? 0 : (this_lane_id - 22); + + if (host_mbcnt_lo[i] != this_mbcnt_lo) + mbcnt_lo_errors++; + + if (host_mbcnt_hi[i] != this_mbcnt_hi) + mbcnt_hi_errors++; + + if (host_lane_id[i] != this_lane_id) + lane_id_errors++; + } + + if (mbcnt_lo_errors == 0) + cout << "__mbcnt_lo() PASSED!" << endl; + else + cout << "__mbcnt_lo() FAILED!" << endl; + + + if (mbcnt_hi_errors == 0) + cout << "__mbcnt_hi() PASSED!" << endl; + else + cout << "__mbcnt_hi() FAILED!" << endl; + + if (lane_id_errors == 0) + cout << "__lane_id() PASSED!" << endl; + else + cout << "__lane_id() FAILED!" << endl; + + HIP_ASSERT(hipFree(device_mbcnt_lo)); + HIP_ASSERT(hipFree(device_mbcnt_hi)); + HIP_ASSERT(hipFree(device_lane_id)); + + free(host_mbcnt_lo); + free(host_mbcnt_hi); + free(host_lane_id); + + return mbcnt_lo_errors + mbcnt_hi_errors + lane_id_errors; +} diff --git a/tests/src/runtimeApi/memory/hipMemset3D.cpp b/tests/src/runtimeApi/memory/hipMemset3D.cpp new file mode 100644 index 0000000000..40f2f3e67f --- /dev/null +++ b/tests/src/runtimeApi/memory/hipMemset3D.cpp @@ -0,0 +1,98 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +// Simple test for memset. +// Also serves as a template for other tests. + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * //Small copy + * RUN: %t -N 10 --memsetval 0x42 + * HIT_END + */ + +#include "hip/hip_runtime.h" +#include "test_common.h" + +bool testhipMemset3D(int memsetval,int p_gpuDevice) +{ + size_t numH = 256; + size_t numW = 256; + size_t depth = 1; + size_t pitch_A; + size_t width = numW * sizeof(char); + size_t sizeElements = width * numH * depth; + size_t elements = numW* numH* depth; + + + printf ("testhipMemset3D memsetval=%2x device=%d\n", memsetval, p_gpuDevice); + char *A_d; + char *A_h; + bool testResult = true; + hipExtent extent = make_hipExtent(width, numH, depth); + hipPitchedPtr devPitchedPtr; + + HIPCHECK(hipMalloc3D(&devPitchedPtr, extent)); + A_h = (char*)malloc(sizeElements); + HIPASSERT(A_h != NULL); + for (size_t i=0; i +#include +#include "test_common.h" + +__global__ +void fn(float* px, float* py) +{ + bool a[42]; + __shared__ double b[69]; + + for (auto&& x : b) x = *py++; + for (auto&& x : a) x = *px++ > 0.0; + for (auto&& x : a) if (x) *--py = *--px; +} + +int main() { + hipInit(0); + + hipFuncAttributes attr{}; + + auto r = hipFuncGetAttributes(&attr, reinterpret_cast(&fn)); + + if (r != hipSuccess || attr.maxThreadsPerBlock == 0) { + failed("Failed to read attributes."); + } + + passed(); +} diff --git a/tests/src/texture/hipBindTexRef1DFetch.cpp b/tests/src/texture/hipBindTexRef1DFetch.cpp new file mode 100644 index 0000000000..5d9a1b103b --- /dev/null +++ b/tests/src/texture/hipBindTexRef1DFetch.cpp @@ -0,0 +1,95 @@ +/* +Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +/* HIT_START + * BUILD: %t %s ../test_common.cpp + * RUN: %t + * HIT_END + */ + +#include "hip/hip_runtime.h" +#include "test_common.h" + +#define N 512 + +texture tex; + +__global__ void kernel(float *out) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + if(x(obj, k); +} + +int runTest(void); + +int main(int argc, char **argv) { + int testResult = runTest(); + if(testResult) { + passed(); + } else { + exit(EXIT_FAILURE); + } +} + +int runTest() { + int testResult = 1; + // Allocating the required buffer on gpu device + float *texBuf, *texBufOut; + float val[N], output[N]; + for (int i = 0; i < N; i++) { + val[i] = (i + 1) * (i + 1); + output[i] = 0.0; + } + HIPCHECK(hipMalloc(&texBuf, N * sizeof(float))); + HIPCHECK(hipMalloc(&texBufOut, N * sizeof(float))); + HIPCHECK(hipMemcpy(texBuf, val, N * sizeof(float), hipMemcpyHostToDevice)); + HIPCHECK(hipMemset(texBufOut, 0, N * sizeof(float))); + hipResourceDesc resDescLinear; + + memset(&resDescLinear, 0, sizeof(resDescLinear)); + resDescLinear.resType = hipResourceTypeLinear; + resDescLinear.res.linear.devPtr = texBuf; + resDescLinear.res.linear.desc = hipCreateChannelDesc(32, 0, 0, 0, hipChannelFormatKindFloat); + resDescLinear.res.linear.sizeInBytes = N * sizeof(float); + + hipTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.readMode = hipReadModeElementType; + + // Creating texture object + hipTextureObject_t texObj = 0; + HIPCHECK(hipCreateTextureObject(&texObj, &resDescLinear, &texDesc, NULL)); + + dim3 dimBlock(64, 1, 1); + dim3 dimGrid(N / dimBlock.x, 1, 1); + + hipLaunchKernelGGL(tex1dKernel, dim3(dimGrid), dim3(dimBlock), 0, 0, + texBufOut, texObj); + HIPCHECK(hipDeviceSynchronize()); + + HIPCHECK(hipMemcpy(output, texBufOut, N * sizeof(float), hipMemcpyDeviceToHost)); + + for(int i = 0; i < N; i++) + if (output[i] != val[i]) { + testResult = 0; + break; + } + + HIPCHECK(hipDestroyTextureObject(texObj)); + HIPCHECK(hipFree(texBuf)); + HIPCHECK(hipFree(texBufOut)); + return testResult; +} diff --git a/tests/src/texture/hipTextureObj2D.cpp b/tests/src/texture/hipTextureObj2D.cpp index 1bf51bc2cb..e214295989 100644 --- a/tests/src/texture/hipTextureObj2D.cpp +++ b/tests/src/texture/hipTextureObj2D.cpp @@ -1,108 +1,108 @@ -/* HIT_START - * BUILD: %t %s ../test_common.cpp - * RUN: %t - * HIT_END - */ -#include -#include -#include - -#include -#include "test_common.h" - -bool testResult = true; - -__global__ void tex2DKernel(float* outputData, hipTextureObject_t textureObject, int width, - int height) { - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; - outputData[y * width + x] = tex2D(textureObject, x, y); -} - -void runTest(int argc, char** argv); - -int main(int argc, char** argv) { - runTest(argc, argv); - - if (testResult) { - passed(); - } else { - exit(EXIT_FAILURE); - } -} - -void runTest(int argc, char** argv) { - unsigned int width = 256; - unsigned int height = 256; - unsigned int size = width * height * sizeof(float); - float* hData = (float*)malloc(size); - memset(hData, 0, size); - for (int i = 0; i < height; i++) { - for (int j = 0; j < width; j++) { - hData[i * width + j] = i * width + j; - } - } - printf("hData: "); - for (int i = 0; i < 64; i++) { - printf("%f ", hData[i]); - } - printf("\n"); - - hipChannelFormatDesc channelDesc = hipCreateChannelDesc(32, 0, 0, 0, hipChannelFormatKindFloat); - hipArray* hipArray; - hipMallocArray(&hipArray, &channelDesc, width, height); - - hipMemcpyToArray(hipArray, 0, 0, hData, size, hipMemcpyHostToDevice); - - struct hipResourceDesc resDesc; - memset(&resDesc, 0, sizeof(resDesc)); - resDesc.resType = hipResourceTypeArray; - resDesc.res.array.array = hipArray; - - // Specify texture object parameters - struct hipTextureDesc texDesc; - memset(&texDesc, 0, sizeof(texDesc)); - texDesc.addressMode[0] = hipAddressModeWrap; - texDesc.addressMode[1] = hipAddressModeWrap; - texDesc.filterMode = hipFilterModePoint; - texDesc.readMode = hipReadModeElementType; - texDesc.normalizedCoords = 0; - - // Create texture object - hipTextureObject_t textureObject = 0; - hipCreateTextureObject(&textureObject, &resDesc, &texDesc, NULL); - - float* dData = NULL; - hipMalloc((void**)&dData, size); - - dim3 dimBlock(16, 16, 1); - dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1); - - hipLaunchKernelGGL(tex2DKernel, dim3(dimGrid), dim3(dimBlock), 0, 0, dData, textureObject, - width, height); - - hipDeviceSynchronize(); - - float* hOutputData = (float*)malloc(size); - memset(hOutputData, 0, size); - hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost); - - printf("dData: "); - for (int i = 0; i < 64; i++) { - printf("%f ", hOutputData[i]); - } - printf("\n"); - for (int i = 0; i < height; i++) { - for (int j = 0; j < width; j++) { - if (hData[i * width + j] != hOutputData[i * width + j]) { - printf("Difference [ %d %d ]:%f ----%f\n", i, j, hData[i * width + j], - hOutputData[i * width + j]); - testResult = false; - break; - } - } - } - hipDestroyTextureObject(textureObject); - hipFree(dData); - hipFreeArray(hipArray); -} +/* HIT_START + * BUILD: %t %s ../test_common.cpp + * RUN: %t + * HIT_END + */ +#include +#include +#include + +#include +#include "test_common.h" + +__global__ void tex2DKernel(float* outputData, hipTextureObject_t textureObject, int width, + int height) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + outputData[y * width + x] = tex2D(textureObject, x, y); +} + +int runTest(int argc, char** argv); + +int main(int argc, char** argv) { + int testResult = runTest(argc, argv); + + if (testResult) { + passed(); + } else { + exit(EXIT_FAILURE); + } +} + +int runTest(int argc, char** argv) { + int testResult = 1; + unsigned int width = 256; + unsigned int height = 256; + unsigned int size = width * height * sizeof(float); + float* hData = (float*)malloc(size); + memset(hData, 0, size); + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + hData[i * width + j] = i * width + j; + } + } + printf("hData: "); + for (int i = 0; i < 64; i++) { + printf("%f ", hData[i]); + } + printf("\n"); + + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(32, 0, 0, 0, hipChannelFormatKindFloat); + hipArray* hipArray; + hipMallocArray(&hipArray, &channelDesc, width, height); + + hipMemcpyToArray(hipArray, 0, 0, hData, size, hipMemcpyHostToDevice); + + struct hipResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = hipResourceTypeArray; + resDesc.res.array.array = hipArray; + + // Specify texture object parameters + struct hipTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.addressMode[0] = hipAddressModeWrap; + texDesc.addressMode[1] = hipAddressModeWrap; + texDesc.filterMode = hipFilterModePoint; + texDesc.readMode = hipReadModeElementType; + texDesc.normalizedCoords = 0; + + // Create texture object + hipTextureObject_t textureObject = 0; + hipCreateTextureObject(&textureObject, &resDesc, &texDesc, NULL); + + float* dData = NULL; + hipMalloc((void**)&dData, size); + + dim3 dimBlock(16, 16, 1); + dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1); + + hipLaunchKernelGGL(tex2DKernel, dim3(dimGrid), dim3(dimBlock), 0, 0, dData, textureObject, + width, height); + + hipDeviceSynchronize(); + + float* hOutputData = (float*)malloc(size); + memset(hOutputData, 0, size); + hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost); + + printf("dData: "); + for (int i = 0; i < 64; i++) { + printf("%f ", hOutputData[i]); + } + printf("\n"); + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + if (hData[i * width + j] != hOutputData[i * width + j]) { + printf("Difference [ %d %d ]:%f ----%f\n", i, j, hData[i * width + j], + hOutputData[i * width + j]); + testResult = 0; + break; + } + } + } + hipDestroyTextureObject(textureObject); + hipFree(dData); + hipFreeArray(hipArray); + return testResult; +} diff --git a/tests/src/texture/hipTextureRef2D.cpp b/tests/src/texture/hipTextureRef2D.cpp index c4c0b9e2fe..b912f789a7 100644 --- a/tests/src/texture/hipTextureRef2D.cpp +++ b/tests/src/texture/hipTextureRef2D.cpp @@ -1,93 +1,93 @@ -/* HIT_START - * BUILD: %t %s ../test_common.cpp - * RUN: %t - * HIT_END - */ -#include -#include -#include - -#include -#include "test_common.h" -texture tex; - -bool testResult = true; - -__global__ void tex2DKernel(float* outputData, - int width, int height) { - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; - outputData[y * width + x] = tex2D(tex, x, y); -} - -void runTest(int argc, char** argv); - -int main(int argc, char** argv) { - runTest(argc, argv); - if (testResult) { - passed(); - } else { - exit(EXIT_FAILURE); - } -} - -void runTest(int argc, char** argv) { - unsigned int width = 256; - unsigned int height = 256; - unsigned int size = width * height * sizeof(float); - float* hData = (float*)malloc(size); - memset(hData, 0, size); - for (int i = 0; i < height; i++) { - for (int j = 0; j < width; j++) { - hData[i * width + j] = i * width + j; - } - } - printf("hData: "); - for (int i = 0; i < 64; i++) { - printf("%f ", hData[i]); - } - printf("\n"); - - hipChannelFormatDesc channelDesc = hipCreateChannelDesc(32, 0, 0, 0, hipChannelFormatKindFloat); - hipArray* hipArray; - hipMallocArray(&hipArray, &channelDesc, width, height); - - hipMemcpyToArray(hipArray, 0, 0, hData, size, hipMemcpyHostToDevice); - - tex.addressMode[0] = hipAddressModeWrap; - tex.addressMode[1] = hipAddressModeWrap; - tex.filterMode = hipFilterModePoint; - tex.normalized = 0; - - hipBindTextureToArray(tex, hipArray, channelDesc); - - float* dData = NULL; - hipMalloc((void**)&dData, size); - - dim3 dimBlock(16, 16, 1); - dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1); - hipLaunchKernelGGL(tex2DKernel, dim3(dimGrid), dim3(dimBlock), 0, 0, dData, width, height); - hipDeviceSynchronize(); - - float* hOutputData = (float*)malloc(size); - memset(hOutputData, 0, size); - hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost); - - printf("dData: "); - for (int i = 0; i < 64; i++) { - printf("%f ", hOutputData[i]); - } - printf("\n"); - for (int i = 0; i < height; i++) { - for (int j = 0; j < width; j++) { - if (hData[i * width + j] != hOutputData[i * width + j]) { - printf("Difference [ %d %d ]:%f ----%f\n", i, j, hData[i * width + j], - hOutputData[i * width + j]); - testResult = false; - break; - } - } - } - hipFree(dData); - hipFreeArray(hipArray); -} +/* HIT_START + * BUILD: %t %s ../test_common.cpp + * RUN: %t + * HIT_END + */ +#include +#include +#include + +#include +#include "test_common.h" +texture tex; + +__global__ void tex2DKernel(float* outputData, + int width, int height) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + outputData[y * width + x] = tex2D(tex, x, y); +} + +int runTest(int argc, char** argv); + +int main(int argc, char** argv) { + int testResult = runTest(argc, argv); + if (testResult) { + passed(); + } else { + exit(EXIT_FAILURE); + } +} + +int runTest(int argc, char** argv) { + int testResult = 1; + unsigned int width = 256; + unsigned int height = 256; + unsigned int size = width * height * sizeof(float); + float* hData = (float*)malloc(size); + memset(hData, 0, size); + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + hData[i * width + j] = i * width + j; + } + } + printf("hData: "); + for (int i = 0; i < 64; i++) { + printf("%f ", hData[i]); + } + printf("\n"); + + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(32, 0, 0, 0, hipChannelFormatKindFloat); + hipArray* hipArray; + hipMallocArray(&hipArray, &channelDesc, width, height); + + hipMemcpyToArray(hipArray, 0, 0, hData, size, hipMemcpyHostToDevice); + + tex.addressMode[0] = hipAddressModeWrap; + tex.addressMode[1] = hipAddressModeWrap; + tex.filterMode = hipFilterModePoint; + tex.normalized = 0; + + hipBindTextureToArray(tex, hipArray, channelDesc); + + float* dData = NULL; + hipMalloc((void**)&dData, size); + + dim3 dimBlock(16, 16, 1); + dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1); + hipLaunchKernelGGL(tex2DKernel, dim3(dimGrid), dim3(dimBlock), 0, 0, dData, width, height); + hipDeviceSynchronize(); + + float* hOutputData = (float*)malloc(size); + memset(hOutputData, 0, size); + hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost); + + printf("dData: "); + for (int i = 0; i < 64; i++) { + printf("%f ", hOutputData[i]); + } + printf("\n"); + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + if (hData[i * width + j] != hOutputData[i * width + j]) { + printf("Difference [ %d %d ]:%f ----%f\n", i, j, hData[i * width + j], + hOutputData[i * width + j]); + testResult = 0; + break; + } + } + } + hipFree(dData); + hipFreeArray(hipArray); + return testResult; +}