diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cd785fe336..060e70e519 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -117,6 +117,10 @@ Differences or limitations of HIP APIs as compared to CUDA APIs should be clearl - Keyword TODO refers to a note that should be addressed in long-term. Could be style issue, software architecture, or known bugs. - FIXME refers to a short-term bug that needs to be addressed. +- HIP_INIT_API() should be placed at the start of each top-level HIP API. This function will make sure the HIP runtime is initialized, + and also constructs an appropriate API string for tracing and CodeXL marker tracing. The arguments to HIP_INIT_API should match + those of the parent fucntion. + #### Presubmit Testing: Before checking in or submitting a pull request, run all Rodinia tests and ensure pass results match starting point: diff --git a/bin/hipcc b/bin/hipcc index c51e351382..8fde7f426d 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -105,7 +105,7 @@ if ($HIP_PLATFORM eq "hcc") { } # Satisfy HCC dependencies - $HIPLDFLAGS .= " -lc++abi"; + $HIPLDFLAGS .= " -lc++abi -lsupc++"; $HIPLDFLAGS .= " -L$HSA_PATH/lib -L$ROCM_PATH/lib -lhsa-runtime64 -lhc_am -lhsakmt"; # Handle ROCm target platform @@ -151,13 +151,12 @@ if ($HIP_PLATFORM eq "hcc") { $HIPCC="$CUDA_PATH/bin/nvcc"; $HIPCXXFLAGS .= " -I$CUDA_PATH/include"; - $HIPLDFLAGS = ""; + $HIPLDFLAGS = "-lcuda -lcudart"; } else { printf ("error: unknown HIP_PLATFORM = '$HIP_PLATFORM'"); exit (-1); } - # Add paths to common HIP includes: $HIPCXXFLAGS .= " -I$HIP_PATH/include" ; @@ -177,7 +176,41 @@ if ($verbose & 0x4) { print "hipcc-args: ", join (" ", @ARGV), "\n"; } +# Handle code object generation +my $ISACMD=""; +if($HIP_PLATFORM eq "hcc"){ + $ISACMD .= "$HIP_PATH/bin/hipgenisa.sh "; + $ISACMD .= $ROCM_PATH; + if($ARGV[0] eq "--gencodeobject"){ + foreach $isaarg (@ARGV[1..$#ARGV]){ + $ISACMD .= " "; + $ISACMD .= $isaarg; + } + if ($verbose & 0x1) { + print "hipcc-cmd: ", $ISACMD, "\n"; + } + system($ISACMD) and die(); + exit(0); + } +} + +if($HIP_PLATFORM eq "nvcc"){ + $ISACMD .= "$HIP_PATH/bin/hipcc -ptx "; + if($ARGV[0] eq "--gencodeobject"){ + foreach $isaarg (@ARGV[1..$#ARGV]){ + $ISACMD .= " "; + $ISACMD .= $isaarg; + } + if ($verbose & 0x1) { + print "hipcc-cmd: ", $ISACMD, "\n"; + } + system($ISACMD) and die(); + exit(0); + } +} + my $toolArgs = ""; # arguments to pass to the hcc or nvcc tool + foreach $arg (@ARGV) { my $swallowArg = 0; diff --git a/bin/hipgenisa.sh b/bin/hipgenisa.sh new file mode 100755 index 0000000000..e60abbf78f --- /dev/null +++ b/bin/hipgenisa.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +if [ $1 = " " ] +then +exit +fi + +ROCM_PATH=$1 +GEN_ISA=$2 +FILE_NAMES=$3 +OUT=$4 +OUTPUT_FILE=$5 +TARGET="" +if [ ${GEN_ISA:0:12} = "--target-isa" ] +then + TARGET=${GEN_ISA:13:12} +fi + +SOURCE="${BASH_SOURCE[0]}" +HIP_PATH="$( command cd -P "$( dirname "$SOURCE" )/.." && pwd )" + +export KMDUMPISA=1 +export KMDUMPLLVM=1 +mkdir /tmp/hipgenisa +$HIP_PATH/bin/hipcc $FILE_NAMES -o /tmp/hipgenisa/a.out +mv dump.* /tmp/hipgenisa/ +$ROCM_PATH/hcc-lc/bin/llvm-mc -arch=amdgcn -mcpu=$TARGET -filetype=obj /tmp/hipgenisa/dump.isa -o /tmp/hipgenisa/dump.o +$ROCM_PATH/llvm/bin/clang -target amdgcn--amdhsa /tmp/hipgenisa/dump.o -o $OUTPUT_FILE +rm -r /tmp/hipgenisa +export KMDUMPISA=0 +export KMDUMPLLVM=0 diff --git a/clang-hipify/src/Cuda2Hip.cpp b/clang-hipify/src/Cuda2Hip.cpp index 45960ac8ca..fde5b7e55b 100644 --- a/clang-hipify/src/Cuda2Hip.cpp +++ b/clang-hipify/src/Cuda2Hip.cpp @@ -70,16 +70,27 @@ enum ConvTypes { CONV_OTHER, CONV_INCLUDE, CONV_INCLUDE_CUDA_MAIN_H, + CONV_TYPE, CONV_LITERAL, - CONV_BLAS, + CONV_NUMERIC_LITERAL, CONV_LAST }; -const char *counterNames[ConvTypes::CONV_LAST] = { - "dev", "mem", "kern", "coord_func", "math_func", - "special_func", "stream", "event", "err", "def", - "tex", "other", "include", "include_cuda_main_header", - "literal", "blas"}; +const char *counterNames[CONV_LAST] = { + "dev", "mem", "kern", "coord_func", "math_func", + "special_func", "stream", "event", "err", "def", + "tex", "other", "include", "include_cuda_main_header", + "type", "literal", "numeric_literal"}; + +enum ApiTypes { + API_DRIVER = 0, + API_RUNTIME, + API_BLAS, + API_LAST +}; + +const char *apiNames[API_LAST] = { + "CUDA API", "CUDA RT API", "CUDA BLAS API"}; namespace { @@ -90,560 +101,560 @@ struct cuda2hipMap { cudaExcludes = {"CHECK_CUDA_ERROR", "CUDA_SAFE_CALL"}; // Defines - cuda2hipRename["__CUDACC__"] = {"__HIPCC__", CONV_DEF}; + cuda2hipRename["__CUDACC__"] = {"__HIPCC__", CONV_DEF, API_RUNTIME}; // CUDA includes - cuda2hipRename["cuda.h"] = {"hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H}; - cuda2hipRename["cuda_runtime.h"] = {"hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H}; - cuda2hipRename["cuda_runtime_api.h"] = {"hip_runtime_api.h", CONV_INCLUDE}; + cuda2hipRename["cuda.h"] = {"hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_DRIVER}; + cuda2hipRename["cuda_runtime.h"] = {"hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_RUNTIME}; + cuda2hipRename["cuda_runtime_api.h"] = {"hip_runtime_api.h", CONV_INCLUDE, API_RUNTIME}; // HIP includes // TODO: uncomment this when hip/cudacommon.h will be renamed to hip/hipcommon.h - //cuda2hipRename["cudacommon.h"] = {"hipcommon.h", CONV_INCLUDE}; + //cuda2hipRename["cudacommon.h"] = {"hipcommon.h", CONV_INCLUDE, API_RUNTIME}; // CUBLAS includes - cuda2hipRename["cublas.h"] = {"hipblas.h", CONV_INCLUDE}; - cuda2hipRename["cublas_v2.h"] = {"hipblas.h", CONV_INCLUDE}; + cuda2hipRename["cublas.h"] = {"hipblas.h", CONV_INCLUDE, API_BLAS}; + cuda2hipRename["cublas_v2.h"] = {"hipblas.h", CONV_INCLUDE, API_BLAS}; // Error codes and return types - cuda2hipRename["cudaError_t"] = {"hipError_t", CONV_ERR}; - cuda2hipRename["cudaError"] = {"hipError", CONV_ERR}; - cuda2hipRename["cudaSuccess"] = {"hipSuccess", CONV_ERR}; - cuda2hipRename["cudaErrorUnknown"] = {"hipErrorUnknown", CONV_ERR}; - cuda2hipRename["cudaErrorMemoryAllocation"] = {"hipErrorMemoryAllocation", CONV_ERR}; - cuda2hipRename["cudaErrorMemoryFree"] = {"hipErrorMemoryFree", CONV_ERR}; - cuda2hipRename["cudaErrorUnknownSymbol"] = {"hipErrorUnknownSymbol", CONV_ERR}; - cuda2hipRename["cudaErrorOutOfResources"] = {"hipErrorOutOfResources", CONV_ERR}; - cuda2hipRename["cudaErrorInvalidValue"] = {"hipErrorInvalidValue", CONV_ERR}; - cuda2hipRename["cudaErrorInvalidResourceHandle"] = {"hipErrorInvalidResourceHandle", CONV_ERR}; - cuda2hipRename["cudaErrorInvalidDevice"] = {"hipErrorInvalidDevice", CONV_ERR}; - cuda2hipRename["cudaErrorInvalidMemcpyDirection"] = {"hipErrorInvalidMemcpyDirection", CONV_ERR}; - cuda2hipRename["cudaErrorInvalidDevicePointer"] = {"hipErrorInvalidDevicePointer", CONV_ERR}; - cuda2hipRename["cudaErrorInitializationError"] = {"hipErrorInvalidDevicePointer", CONV_ERR}; - cuda2hipRename["cudaErrorNoDevice"] = {"hipErrorNoDevice", CONV_ERR}; - cuda2hipRename["cudaErrorNotReady"] = {"hipErrorNotReady", CONV_ERR}; - cuda2hipRename["cudaErrorPeerAccessNotEnabled"] = {"hipErrorPeerAccessNotEnabled", CONV_ERR}; - cuda2hipRename["cudaErrorPeerAccessAlreadyEnabled"] = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR}; + cuda2hipRename["cudaError_t"] = {"hipError_t", CONV_TYPE, API_RUNTIME}; + cuda2hipRename["cudaError"] = {"hipError", CONV_TYPE, API_RUNTIME}; + cuda2hipRename["cudaSuccess"] = {"hipSuccess", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaErrorUnknown"] = {"hipErrorUnknown", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaErrorMemoryAllocation"] = {"hipErrorMemoryAllocation", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaErrorMemoryFree"] = {"hipErrorMemoryFree", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaErrorUnknownSymbol"] = {"hipErrorUnknownSymbol", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaErrorOutOfResources"] = {"hipErrorOutOfResources", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaErrorInvalidValue"] = {"hipErrorInvalidValue", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaErrorInvalidResourceHandle"] = {"hipErrorInvalidResourceHandle", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaErrorInvalidDevice"] = {"hipErrorInvalidDevice", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaErrorInvalidMemcpyDirection"] = {"hipErrorInvalidMemcpyDirection", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaErrorInvalidDevicePointer"] = {"hipErrorInvalidDevicePointer", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaErrorInitializationError"] = {"hipErrorInvalidDevicePointer", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaErrorNoDevice"] = {"hipErrorNoDevice", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaErrorNotReady"] = {"hipErrorNotReady", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaErrorPeerAccessNotEnabled"] = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaErrorPeerAccessAlreadyEnabled"] = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_RUNTIME}; // NOTE: no corresponding error type in CUDA - //cuda2hipRename["cudaErrorRuntimeMemory"] = {"hipErrorRuntimeMemory", CONV_ERR}; - //cuda2hipRename["cudaErrorRuntimeOther"] = {"hipErrorRuntimeOther", CONV_ERR}; - cuda2hipRename["cudaErrorHostMemoryAlreadyRegistered"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR}; - cuda2hipRename["cudaErrorHostMemoryNotRegistered"] = {"hipErrorHostMemoryNotRegistered", CONV_ERR}; + //cuda2hipRename["cudaErrorRuntimeMemory"] = {"hipErrorRuntimeMemory", CONV_ERR, API_RUNTIME}; + //cuda2hipRename["cudaErrorRuntimeOther"] = {"hipErrorRuntimeOther", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaErrorHostMemoryAlreadyRegistered"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaErrorHostMemoryNotRegistered"] = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_RUNTIME}; // Error API - cuda2hipRename["cudaGetLastError"] = {"hipGetLastError", CONV_ERR}; - cuda2hipRename["cudaPeekAtLastError"] = {"hipPeekAtLastError", CONV_ERR}; - cuda2hipRename["cudaGetErrorName"] = {"hipGetErrorName", CONV_ERR}; - cuda2hipRename["cudaGetErrorString"] = {"hipGetErrorString", CONV_ERR}; + cuda2hipRename["cudaGetLastError"] = {"hipGetLastError", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaPeekAtLastError"] = {"hipPeekAtLastError", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaGetErrorName"] = {"hipGetErrorName", CONV_ERR, API_RUNTIME}; + cuda2hipRename["cudaGetErrorString"] = {"hipGetErrorString", CONV_ERR, API_RUNTIME}; // Memcpy - cuda2hipRename["cudaMemcpy"] = {"hipMemcpy", CONV_MEM}; - cuda2hipRename["cudaMemcpyToSymbol"] = {"hipMemcpyToSymbol", CONV_MEM}; - cuda2hipRename["cudaMemset"] = {"hipMemset", CONV_MEM}; - cuda2hipRename["cudaMemsetAsync"] = {"hipMemsetAsync", CONV_MEM}; - cuda2hipRename["cudaMemcpyAsync"] = {"hipMemcpyAsync", CONV_MEM}; - cuda2hipRename["cudaMemGetInfo"] = {"hipMemGetInfo", CONV_MEM}; + cuda2hipRename["cudaMemcpy"] = {"hipMemcpy", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemcpyToSymbol"] = {"hipMemcpyToSymbol", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemset"] = {"hipMemset", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemsetAsync"] = {"hipMemsetAsync", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemcpyAsync"] = {"hipMemcpyAsync", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemGetInfo"] = {"hipMemGetInfo", CONV_MEM, API_RUNTIME}; // Memcpy kind - cuda2hipRename["cudaMemcpyKind"] = {"hipMemcpyKind", CONV_MEM}; - cuda2hipRename["cudaMemcpyHostToHost"] = {"hipMemcpyHostToHost", CONV_MEM}; - cuda2hipRename["cudaMemcpyHostToDevice"] = {"hipMemcpyHostToDevice", CONV_MEM}; - cuda2hipRename["cudaMemcpyDeviceToHost"] = {"hipMemcpyDeviceToHost", CONV_MEM}; - cuda2hipRename["cudaMemcpyDeviceToDevice"] = {"hipMemcpyDeviceToDevice", CONV_MEM}; - cuda2hipRename["cudaMemcpyDefault"] = {"hipMemcpyDefault", CONV_MEM}; + cuda2hipRename["cudaMemcpyKind"] = {"hipMemcpyKind", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemcpyHostToHost"] = {"hipMemcpyHostToHost", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemcpyHostToDevice"] = {"hipMemcpyHostToDevice", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemcpyDeviceToHost"] = {"hipMemcpyDeviceToHost", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemcpyDeviceToDevice"] = {"hipMemcpyDeviceToDevice", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemcpyDefault"] = {"hipMemcpyDefault", CONV_MEM, API_RUNTIME}; // Memory management - cuda2hipRename["cudaMalloc"] = {"hipMalloc", CONV_MEM}; - cuda2hipRename["cudaMallocHost"] = {"hipHostMalloc", CONV_MEM}; - cuda2hipRename["cudaFree"] = {"hipFree", CONV_MEM}; - cuda2hipRename["cudaFreeHost"] = {"hipHostFree", CONV_MEM}; - cuda2hipRename["cudaHostRegister"] = {"hipHostRegister", CONV_MEM}; - cuda2hipRename["cudaHostUnregister"] = {"hipHostUnregister", CONV_MEM}; + cuda2hipRename["cudaMalloc"] = {"hipMalloc", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMallocHost"] = {"hipHostMalloc", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaFree"] = {"hipFree", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaFreeHost"] = {"hipHostFree", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaHostRegister"] = {"hipHostRegister", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaHostUnregister"] = {"hipHostUnregister", CONV_MEM, API_RUNTIME}; // Memory types - cuda2hipRename["cudaMemoryType"] = {"hipMemoryType", CONV_MEM}; - cuda2hipRename["cudaMemoryTypeHost"] = {"hipMemoryTypeHost", CONV_MEM}; - cuda2hipRename["cudaMemoryTypeDevice"] = {"hipMemoryTypeDevice", CONV_MEM}; + cuda2hipRename["cudaMemoryType"] = {"hipMemoryType", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemoryTypeHost"] = {"hipMemoryTypeHost", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemoryTypeDevice"] = {"hipMemoryTypeDevice", CONV_MEM, API_RUNTIME}; // Host Malloc Flags - cuda2hipRename["cudaHostAllocDefault"] = {"hipHostMallocDefault", CONV_MEM}; - cuda2hipRename["cudaHostAllocPortable"] = {"hipHostMallocPortable", CONV_MEM}; - cuda2hipRename["cudaHostAllocMapped"] = {"hipHostMallocMapped", CONV_MEM}; - cuda2hipRename["cudaHostAllocWriteCombined"] = {"hipHostMallocWriteCombined", CONV_MEM}; + cuda2hipRename["cudaHostAllocDefault"] = {"hipHostMallocDefault", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaHostAllocPortable"] = {"hipHostMallocPortable", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaHostAllocMapped"] = {"hipHostMallocMapped", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaHostAllocWriteCombined"] = {"hipHostMallocWriteCombined", CONV_MEM, API_RUNTIME}; // Host Register Flags - cuda2hipRename["cudaHostGetFlags"] = {"hipHostGetFlags", CONV_MEM}; - cuda2hipRename["cudaHostRegisterDefault"] = {"hipHostRegisterDefault", CONV_MEM}; - cuda2hipRename["cudaHostRegisterPortable"] = {"hipHostRegisterPortable", CONV_MEM}; - cuda2hipRename["cudaHostRegisterMapped"] = {"hipHostRegisterMapped", CONV_MEM}; - cuda2hipRename["cudaHostRegisterIoMemory"] = {"hipHostRegisterIoMemory", CONV_MEM}; + cuda2hipRename["cudaHostGetFlags"] = {"hipHostGetFlags", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaHostRegisterDefault"] = {"hipHostRegisterDefault", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaHostRegisterPortable"] = {"hipHostRegisterPortable", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaHostRegisterMapped"] = {"hipHostRegisterMapped", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaHostRegisterIoMemory"] = {"hipHostRegisterIoMemory", CONV_MEM, API_RUNTIME}; // Coordinate Indexing and Dimensions - cuda2hipRename["threadIdx.x"] = {"hipThreadIdx_x", CONV_COORD_FUNC}; - cuda2hipRename["threadIdx.y"] = {"hipThreadIdx_y", CONV_COORD_FUNC}; - cuda2hipRename["threadIdx.z"] = {"hipThreadIdx_z", CONV_COORD_FUNC}; + cuda2hipRename["threadIdx.x"] = {"hipThreadIdx_x", CONV_COORD_FUNC, API_RUNTIME}; + cuda2hipRename["threadIdx.y"] = {"hipThreadIdx_y", CONV_COORD_FUNC, API_RUNTIME}; + cuda2hipRename["threadIdx.z"] = {"hipThreadIdx_z", CONV_COORD_FUNC, API_RUNTIME}; - cuda2hipRename["blockIdx.x"] = {"hipBlockIdx_x", CONV_COORD_FUNC}; - cuda2hipRename["blockIdx.y"] = {"hipBlockIdx_y", CONV_COORD_FUNC}; - cuda2hipRename["blockIdx.z"] = {"hipBlockIdx_z", CONV_COORD_FUNC}; + cuda2hipRename["blockIdx.x"] = {"hipBlockIdx_x", CONV_COORD_FUNC, API_RUNTIME}; + cuda2hipRename["blockIdx.y"] = {"hipBlockIdx_y", CONV_COORD_FUNC, API_RUNTIME}; + cuda2hipRename["blockIdx.z"] = {"hipBlockIdx_z", CONV_COORD_FUNC, API_RUNTIME}; - cuda2hipRename["blockDim.x"] = {"hipBlockDim_x", CONV_COORD_FUNC}; - cuda2hipRename["blockDim.y"] = {"hipBlockDim_y", CONV_COORD_FUNC}; - cuda2hipRename["blockDim.z"] = {"hipBlockDim_z", CONV_COORD_FUNC}; + cuda2hipRename["blockDim.x"] = {"hipBlockDim_x", CONV_COORD_FUNC, API_RUNTIME}; + cuda2hipRename["blockDim.y"] = {"hipBlockDim_y", CONV_COORD_FUNC, API_RUNTIME}; + cuda2hipRename["blockDim.z"] = {"hipBlockDim_z", CONV_COORD_FUNC, API_RUNTIME}; - cuda2hipRename["gridDim.x"] = {"hipGridDim_x", CONV_COORD_FUNC}; - cuda2hipRename["gridDim.y"] = {"hipGridDim_y", CONV_COORD_FUNC}; - cuda2hipRename["gridDim.z"] = {"hipGridDim_z", CONV_COORD_FUNC}; + cuda2hipRename["gridDim.x"] = {"hipGridDim_x", CONV_COORD_FUNC, API_RUNTIME}; + cuda2hipRename["gridDim.y"] = {"hipGridDim_y", CONV_COORD_FUNC, API_RUNTIME}; + cuda2hipRename["gridDim.z"] = {"hipGridDim_z", CONV_COORD_FUNC, API_RUNTIME}; - cuda2hipRename["blockIdx.x"] = {"hipBlockIdx_x", CONV_COORD_FUNC}; - cuda2hipRename["blockIdx.y"] = {"hipBlockIdx_y", CONV_COORD_FUNC}; - cuda2hipRename["blockIdx.z"] = {"hipBlockIdx_z", CONV_COORD_FUNC}; + cuda2hipRename["blockIdx.x"] = {"hipBlockIdx_x", CONV_COORD_FUNC, API_RUNTIME}; + cuda2hipRename["blockIdx.y"] = {"hipBlockIdx_y", CONV_COORD_FUNC, API_RUNTIME}; + cuda2hipRename["blockIdx.z"] = {"hipBlockIdx_z", CONV_COORD_FUNC, API_RUNTIME}; - cuda2hipRename["blockDim.x"] = {"hipBlockDim_x", CONV_COORD_FUNC}; - cuda2hipRename["blockDim.y"] = {"hipBlockDim_y", CONV_COORD_FUNC}; - cuda2hipRename["blockDim.z"] = {"hipBlockDim_z", CONV_COORD_FUNC}; + cuda2hipRename["blockDim.x"] = {"hipBlockDim_x", CONV_COORD_FUNC, API_RUNTIME}; + cuda2hipRename["blockDim.y"] = {"hipBlockDim_y", CONV_COORD_FUNC, API_RUNTIME}; + cuda2hipRename["blockDim.z"] = {"hipBlockDim_z", CONV_COORD_FUNC, API_RUNTIME}; - cuda2hipRename["gridDim.x"] = {"hipGridDim_x", CONV_COORD_FUNC}; - cuda2hipRename["gridDim.y"] = {"hipGridDim_y", CONV_COORD_FUNC}; - cuda2hipRename["gridDim.z"] = {"hipGridDim_z", CONV_COORD_FUNC}; + cuda2hipRename["gridDim.x"] = {"hipGridDim_x", CONV_COORD_FUNC, API_RUNTIME}; + cuda2hipRename["gridDim.y"] = {"hipGridDim_y", CONV_COORD_FUNC, API_RUNTIME}; + cuda2hipRename["gridDim.z"] = {"hipGridDim_z", CONV_COORD_FUNC, API_RUNTIME}; - cuda2hipRename["warpSize"] = {"hipWarpSize", CONV_SPECIAL_FUNC}; + cuda2hipRename["warpSize"] = {"hipWarpSize", CONV_SPECIAL_FUNC, API_RUNTIME}; // Events - cuda2hipRename["cudaEvent_t"] = {"hipEvent_t", CONV_EVENT}; - cuda2hipRename["cudaEventCreate"] = {"hipEventCreate", CONV_EVENT}; - cuda2hipRename["cudaEventCreateWithFlags"] = {"hipEventCreateWithFlags", CONV_EVENT}; - cuda2hipRename["cudaEventDestroy"] = {"hipEventDestroy", CONV_EVENT}; - cuda2hipRename["cudaEventRecord"] = {"hipEventRecord", CONV_EVENT}; - cuda2hipRename["cudaEventElapsedTime"] = {"hipEventElapsedTime", CONV_EVENT}; - cuda2hipRename["cudaEventSynchronize"] = {"hipEventSynchronize", CONV_EVENT}; - cuda2hipRename["cudaEventQuery"] = {"hipEventQuery", CONV_EVENT}; + cuda2hipRename["cudaEvent_t"] = {"hipEvent_t", CONV_TYPE, API_RUNTIME}; + cuda2hipRename["cudaEventCreate"] = {"hipEventCreate", CONV_EVENT, API_RUNTIME}; + cuda2hipRename["cudaEventCreateWithFlags"] = {"hipEventCreateWithFlags", CONV_EVENT, API_RUNTIME}; + cuda2hipRename["cudaEventDestroy"] = {"hipEventDestroy", CONV_EVENT, API_RUNTIME}; + cuda2hipRename["cudaEventRecord"] = {"hipEventRecord", CONV_EVENT, API_RUNTIME}; + cuda2hipRename["cudaEventElapsedTime"] = {"hipEventElapsedTime", CONV_EVENT, API_RUNTIME}; + cuda2hipRename["cudaEventSynchronize"] = {"hipEventSynchronize", CONV_EVENT, API_RUNTIME}; + cuda2hipRename["cudaEventQuery"] = {"hipEventQuery", CONV_EVENT, API_RUNTIME}; // Event Flags - cuda2hipRename["cudaEventDefault"] = {"hipEventDefault", CONV_EVENT}; - cuda2hipRename["cudaEventBlockingSync"] = {"hipEventBlockingSync", CONV_EVENT}; - cuda2hipRename["cudaEventDisableTiming"] = {"hipEventDisableTiming", CONV_EVENT}; - cuda2hipRename["cudaEventInterprocess"] = {"hipEventInterprocess", CONV_EVENT}; + cuda2hipRename["cudaEventDefault"] = {"hipEventDefault", CONV_EVENT, API_RUNTIME}; + cuda2hipRename["cudaEventBlockingSync"] = {"hipEventBlockingSync", CONV_EVENT, API_RUNTIME}; + cuda2hipRename["cudaEventDisableTiming"] = {"hipEventDisableTiming", CONV_EVENT, API_RUNTIME}; + cuda2hipRename["cudaEventInterprocess"] = {"hipEventInterprocess", CONV_EVENT, API_RUNTIME}; // Streams - cuda2hipRename["cudaStream_t"] = {"hipStream_t", CONV_STREAM}; - cuda2hipRename["cudaStreamCreate"] = {"hipStreamCreate", CONV_STREAM}; - cuda2hipRename["cudaStreamCreateWithFlags"] = {"hipStreamCreateWithFlags", CONV_STREAM}; - cuda2hipRename["cudaStreamDestroy"] = {"hipStreamDestroy", CONV_STREAM}; - cuda2hipRename["cudaStreamWaitEvent"] = {"hipStreamWaitEvent", CONV_STREAM}; - cuda2hipRename["cudaStreamSynchronize"] = {"hipStreamSynchronize", CONV_STREAM}; + cuda2hipRename["cudaStream_t"] = {"hipStream_t", CONV_TYPE, API_RUNTIME}; + cuda2hipRename["cudaStreamCreate"] = {"hipStreamCreate", CONV_STREAM, API_RUNTIME}; + cuda2hipRename["cudaStreamCreateWithFlags"] = {"hipStreamCreateWithFlags", CONV_STREAM, API_RUNTIME}; + cuda2hipRename["cudaStreamDestroy"] = {"hipStreamDestroy", CONV_STREAM, API_RUNTIME}; + cuda2hipRename["cudaStreamWaitEvent"] = {"hipStreamWaitEvent", CONV_STREAM, API_RUNTIME}; + cuda2hipRename["cudaStreamSynchronize"] = {"hipStreamSynchronize", CONV_STREAM, API_RUNTIME}; // Stream Flags - cuda2hipRename["cudaStreamGetFlags"] = {"hipStreamGetFlags", CONV_STREAM}; - cuda2hipRename["cudaStreamDefault"] = {"hipStreamDefault", CONV_STREAM}; - cuda2hipRename["cudaStreamNonBlocking"] = {"hipStreamNonBlocking", CONV_STREAM}; + cuda2hipRename["cudaStreamGetFlags"] = {"hipStreamGetFlags", CONV_STREAM, API_RUNTIME}; + cuda2hipRename["cudaStreamDefault"] = {"hipStreamDefault", CONV_STREAM, API_RUNTIME}; + cuda2hipRename["cudaStreamNonBlocking"] = {"hipStreamNonBlocking", CONV_STREAM, API_RUNTIME}; // Other synchronization - cuda2hipRename["cudaDeviceSynchronize"] = {"hipDeviceSynchronize", CONV_DEV}; + cuda2hipRename["cudaDeviceSynchronize"] = {"hipDeviceSynchronize", CONV_DEV, API_RUNTIME}; // translate deprecated cudaThreadSynchronize - cuda2hipRename["cudaThreadSynchronize"] = {"hipDeviceSynchronize", CONV_DEV}; - cuda2hipRename["cudaDeviceReset"] = {"hipDeviceReset", CONV_DEV}; + cuda2hipRename["cudaThreadSynchronize"] = {"hipDeviceSynchronize", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDeviceReset"] = {"hipDeviceReset", CONV_DEV, API_RUNTIME}; // translate deprecated cudaThreadExit - cuda2hipRename["cudaThreadExit"] = {"hipDeviceReset", CONV_DEV}; - cuda2hipRename["cudaSetDevice"] = {"hipSetDevice", CONV_DEV}; - cuda2hipRename["cudaGetDevice"] = {"hipGetDevice", CONV_DEV}; + cuda2hipRename["cudaThreadExit"] = {"hipDeviceReset", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaSetDevice"] = {"hipSetDevice", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaGetDevice"] = {"hipGetDevice", CONV_DEV, API_RUNTIME}; // Attributes - cuda2hipRename["cudaDeviceAttr"] = {"hipDeviceAttribute_t", CONV_DEV}; - cuda2hipRename["cudaDeviceGetAttribute"] = {"hipDeviceGetAttribute", CONV_DEV}; + cuda2hipRename["cudaDeviceAttr"] = {"hipDeviceAttribute_t", CONV_TYPE, API_RUNTIME}; + cuda2hipRename["cudaDeviceGetAttribute"] = {"hipDeviceGetAttribute", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxThreadsPerBlock"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV}; - cuda2hipRename["cudaDevAttrMaxBlockDimX"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV}; - cuda2hipRename["cudaDevAttrMaxBlockDimY"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV}; - cuda2hipRename["cudaDevAttrMaxBlockDimZ"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV}; - cuda2hipRename["cudaDevAttrMaxGridDimX"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV}; - cuda2hipRename["cudaDevAttrMaxGridDimY"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV}; - cuda2hipRename["cudaDevAttrMaxGridDimZ"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV}; - cuda2hipRename["cudaDevAttrMaxSharedMemoryPerBlock"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV}; - cuda2hipRename["cudaDevAttrTotalConstantMemory"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV}; - cuda2hipRename["cudaDevAttrWarpSize"] = {"hipDeviceAttributeWarpSize", CONV_DEV}; - cuda2hipRename["cudaDevAttrMaxRegistersPerBlock"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV}; - cuda2hipRename["cudaDevAttrClockRate"] = {"hipDeviceAttributeClockRate", CONV_DEV}; - cuda2hipRename["cudaDevAttrMemoryClockRate"] = {"hipDeviceAttributeMemoryClockRate", CONV_DEV}; - cuda2hipRename["cudaDevAttrGlobalMemoryBusWidth"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV}; - cuda2hipRename["cudaDevAttrMultiProcessorCount"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV}; - cuda2hipRename["cudaDevAttrComputeMode"] = {"hipDeviceAttributeComputeMode", CONV_DEV}; - cuda2hipRename["cudaDevAttrL2CacheSize"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV}; - cuda2hipRename["cudaDevAttrMaxThreadsPerMultiProcessor"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV}; - cuda2hipRename["cudaDevAttrComputeCapabilityMajor"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV}; - cuda2hipRename["cudaDevAttrConcurrentKernels"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV}; - cuda2hipRename["cudaDevAttrPciBusId"] = {"hipDeviceAttributePciBusId", CONV_DEV}; - cuda2hipRename["cudaDevAttrPciDeviceId"] = {"hipDeviceAttributePciDeviceId", CONV_DEV}; - cuda2hipRename["cudaDevAttrMaxSharedMemoryPerMultiprocessor"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV}; - cuda2hipRename["cudaDevAttrIsMultiGpuBoard"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV}; + cuda2hipRename["cudaDevAttrMaxThreadsPerBlock"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrMaxBlockDimX"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrMaxBlockDimY"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrMaxBlockDimZ"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrMaxGridDimX"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrMaxGridDimY"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrMaxGridDimZ"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrMaxSharedMemoryPerBlock"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrTotalConstantMemory"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrWarpSize"] = {"hipDeviceAttributeWarpSize", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrMaxRegistersPerBlock"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrClockRate"] = {"hipDeviceAttributeClockRate", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrMemoryClockRate"] = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrGlobalMemoryBusWidth"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrMultiProcessorCount"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrComputeMode"] = {"hipDeviceAttributeComputeMode", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrL2CacheSize"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrMaxThreadsPerMultiProcessor"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrComputeCapabilityMajor"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrConcurrentKernels"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrPciBusId"] = {"hipDeviceAttributePciBusId", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrPciDeviceId"] = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrMaxSharedMemoryPerMultiprocessor"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDevAttrIsMultiGpuBoard"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_RUNTIME}; // Pointer Attributes - cuda2hipRename["cudaPointerAttributes"] = {"hipPointerAttribute_t", CONV_MEM}; - cuda2hipRename["cudaPointerGetAttributes"] = {"hipPointerGetAttributes", CONV_MEM}; + cuda2hipRename["cudaPointerAttributes"] = {"hipPointerAttribute_t", CONV_TYPE, API_RUNTIME}; + cuda2hipRename["cudaPointerGetAttributes"] = {"hipPointerGetAttributes", CONV_MEM, API_RUNTIME}; - cuda2hipRename["cudaHostGetDevicePointer"] = {"hipHostGetDevicePointer", CONV_MEM}; + cuda2hipRename["cudaHostGetDevicePointer"] = {"hipHostGetDevicePointer", CONV_MEM, API_RUNTIME}; // Device - cuda2hipRename["cudaDeviceProp"] = {"hipDeviceProp_t", CONV_DEV}; - cuda2hipRename["cudaGetDeviceProperties"] = {"hipGetDeviceProperties", CONV_DEV}; + cuda2hipRename["cudaDeviceProp"] = {"hipDeviceProp_t", CONV_TYPE, API_RUNTIME}; + cuda2hipRename["cudaGetDeviceProperties"] = {"hipGetDeviceProperties", CONV_DEV, API_RUNTIME}; // Device Flags - cuda2hipRename["cudaSetDeviceFlags"] = {"hipSetDeviceFlags", CONV_DEV}; - cuda2hipRename["cudaDeviceScheduleAuto"] = {"hipDeviceScheduleAuto", CONV_DEV}; - cuda2hipRename["cudaDeviceScheduleSpin"] = {"hipDeviceScheduleSpin", CONV_DEV}; - cuda2hipRename["cudaDeviceScheduleYield"] = {"hipDeviceScheduleYield", CONV_DEV}; + cuda2hipRename["cudaSetDeviceFlags"] = {"hipSetDeviceFlags", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDeviceScheduleAuto"] = {"hipDeviceScheduleAuto", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDeviceScheduleSpin"] = {"hipDeviceScheduleSpin", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDeviceScheduleYield"] = {"hipDeviceScheduleYield", CONV_DEV, API_RUNTIME}; // deprecated as of CUDA 4.0 and replaced with cudaDeviceScheduleBlockingSync - cuda2hipRename["cudaDeviceBlockingSync"] = {"hipDeviceBlockingSync", CONV_DEV}; + cuda2hipRename["cudaDeviceBlockingSync"] = {"hipDeviceBlockingSync", CONV_DEV, API_RUNTIME}; // unsupported yet - //cuda2hipRename["cudaDeviceScheduleBlockingSync"] = {"hipDeviceScheduleBlockingSync", CONV_DEV}; - //cuda2hipRename["cudaDeviceScheduleMask"] = {"hipDeviceScheduleMask", CONV_DEV}; - cuda2hipRename["cudaDeviceMapHost"] = {"hipDeviceMapHost", CONV_DEV}; + //cuda2hipRename["cudaDeviceScheduleBlockingSync"] = {"hipDeviceScheduleBlockingSync", CONV_DEV, API_RUNTIME}; + //cuda2hipRename["cudaDeviceScheduleMask"] = {"hipDeviceScheduleMask", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDeviceMapHost"] = {"hipDeviceMapHost", CONV_DEV, API_RUNTIME}; // unsupported yet - //cuda2hipRename["cudaDeviceLmemResizeToMax"] = {"hipDeviceLmemResizeToMax", CONV_DEV}; - //cuda2hipRename["cudaDeviceMask"] = {"hipDeviceMask", CONV_DEV}; + //cuda2hipRename["cudaDeviceLmemResizeToMax"] = {"hipDeviceLmemResizeToMax", CONV_DEV, API_RUNTIME}; + //cuda2hipRename["cudaDeviceMask"] = {"hipDeviceMask", CONV_DEV, API_RUNTIME}; // Cache config - cuda2hipRename["cudaDeviceSetCacheConfig"] = {"hipDeviceSetCacheConfig", CONV_DEV}; + cuda2hipRename["cudaDeviceSetCacheConfig"] = {"hipDeviceSetCacheConfig", CONV_DEV, API_RUNTIME}; // translate deprecated - cuda2hipRename["cudaThreadSetCacheConfig"] = {"hipDeviceSetCacheConfig", CONV_DEV}; - cuda2hipRename["cudaDeviceGetCacheConfig"] = {"hipDeviceGetCacheConfig", CONV_DEV}; + cuda2hipRename["cudaThreadSetCacheConfig"] = {"hipDeviceSetCacheConfig", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDeviceGetCacheConfig"] = {"hipDeviceGetCacheConfig", CONV_DEV, API_RUNTIME}; // translate deprecated - cuda2hipRename["cudaThreadGetCacheConfig"] = {"hipDeviceGetCacheConfig", CONV_DEV}; - cuda2hipRename["cudaFuncCache"] = {"hipFuncCache", CONV_DEV}; - cuda2hipRename["cudaFuncCachePreferNone"] = {"hipFuncCachePreferNone", CONV_DEV}; - cuda2hipRename["cudaFuncCachePreferShared"] = {"hipFuncCachePreferShared", CONV_DEV}; - cuda2hipRename["cudaFuncCachePreferL1"] = {"hipFuncCachePreferL1", CONV_DEV}; - cuda2hipRename["cudaFuncCachePreferEqual"] = {"hipFuncCachePreferEqual", CONV_DEV}; - cuda2hipRename["cudaFuncSetCacheConfig"] = {"hipFuncSetCacheConfig", CONV_DEV}; + cuda2hipRename["cudaThreadGetCacheConfig"] = {"hipDeviceGetCacheConfig", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaFuncCache"] = {"hipFuncCache", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaFuncCachePreferNone"] = {"hipFuncCachePreferNone", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaFuncCachePreferShared"] = {"hipFuncCachePreferShared", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaFuncCachePreferL1"] = {"hipFuncCachePreferL1", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaFuncCachePreferEqual"] = {"hipFuncCachePreferEqual", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaFuncSetCacheConfig"] = {"hipFuncSetCacheConfig", CONV_DEV, API_RUNTIME}; // Driver/Runtime - cuda2hipRename["cudaDriverGetVersion"] = {"hipDriverGetVersion", CONV_DEV}; - cuda2hipRename["cudaGetDeviceCount"] = {"hipGetDeviceCount", CONV_DEV}; + cuda2hipRename["cudaDriverGetVersion"] = {"hipDriverGetVersion", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaGetDeviceCount"] = {"hipGetDeviceCount", CONV_DEV, API_RUNTIME}; // unsupported yet - //cuda2hipRename["cudaRuntimeGetVersion"] = {"hipRuntimeGetVersion", CONV_DEV}; + //cuda2hipRename["cudaRuntimeGetVersion"] = {"hipRuntimeGetVersion", CONV_DEV, API_RUNTIME}; // Peer2Peer - cuda2hipRename["cudaDeviceCanAccessPeer"] = {"hipDeviceCanAccessPeer", CONV_DEV}; - cuda2hipRename["cudaDeviceDisablePeerAccess"] = {"hipDeviceDisablePeerAccess", CONV_DEV}; - cuda2hipRename["cudaDeviceEnablePeerAccess"] = {"hipDeviceEnablePeerAccess", CONV_DEV}; - cuda2hipRename["cudaMemcpyPeerAsync"] = {"hipMemcpyPeerAsync", CONV_MEM}; - cuda2hipRename["cudaMemcpyPeer"] = {"hipMemcpyPeer", CONV_MEM}; + cuda2hipRename["cudaDeviceCanAccessPeer"] = {"hipDeviceCanAccessPeer", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDeviceDisablePeerAccess"] = {"hipDeviceDisablePeerAccess", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDeviceEnablePeerAccess"] = {"hipDeviceEnablePeerAccess", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaMemcpyPeerAsync"] = {"hipMemcpyPeerAsync", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemcpyPeer"] = {"hipMemcpyPeer", CONV_MEM, API_RUNTIME}; // Shared memory - cuda2hipRename["cudaDeviceSetSharedMemConfig"] = {"hipDeviceSetSharedMemConfig", CONV_DEV}; + cuda2hipRename["cudaDeviceSetSharedMemConfig"] = {"hipDeviceSetSharedMemConfig", CONV_DEV, API_RUNTIME}; // translate deprecated - cuda2hipRename["cudaThreadSetSharedMemConfig"] = {"hipDeviceSetSharedMemConfig", CONV_DEV}; - cuda2hipRename["cudaDeviceGetSharedMemConfig"] = {"hipDeviceGetSharedMemConfig", CONV_DEV}; + cuda2hipRename["cudaThreadSetSharedMemConfig"] = {"hipDeviceSetSharedMemConfig", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaDeviceGetSharedMemConfig"] = {"hipDeviceGetSharedMemConfig", CONV_DEV, API_RUNTIME}; // translate deprecated - cuda2hipRename["cudaThreadGetSharedMemConfig"] = {"hipDeviceGetSharedMemConfig", CONV_DEV}; - cuda2hipRename["cudaSharedMemConfig"] = {"hipSharedMemConfig", CONV_DEV}; - cuda2hipRename["cudaSharedMemBankSizeDefault"] = {"hipSharedMemBankSizeDefault", CONV_DEV}; - cuda2hipRename["cudaSharedMemBankSizeFourByte"] = {"hipSharedMemBankSizeFourByte", CONV_DEV}; - cuda2hipRename["cudaSharedMemBankSizeEightByte"] = {"hipSharedMemBankSizeEightByte", CONV_DEV}; + cuda2hipRename["cudaThreadGetSharedMemConfig"] = {"hipDeviceGetSharedMemConfig", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaSharedMemConfig"] = {"hipSharedMemConfig", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaSharedMemBankSizeDefault"] = {"hipSharedMemBankSizeDefault", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaSharedMemBankSizeFourByte"] = {"hipSharedMemBankSizeFourByte", CONV_DEV, API_RUNTIME}; + cuda2hipRename["cudaSharedMemBankSizeEightByte"] = {"hipSharedMemBankSizeEightByte", CONV_DEV, API_RUNTIME}; // Profiler // unsupported yet - //cuda2hipRename["cudaProfilerInitialize"] = {"hipProfilerInitialize", CONV_OTHER}; - cuda2hipRename["cudaProfilerStart"] = {"hipProfilerStart", CONV_OTHER}; - cuda2hipRename["cudaProfilerStop"] = {"hipProfilerStop", CONV_OTHER}; - cuda2hipRename["cudaChannelFormatDesc"] = {"hipChannelFormatDesc", CONV_TEX}; - cuda2hipRename["cudaFilterModePoint"] = {"hipFilterModePoint", CONV_TEX}; - cuda2hipRename["cudaReadModeElementType"] = {"hipReadModeElementType", CONV_TEX}; + //cuda2hipRename["cudaProfilerInitialize"] = {"hipProfilerInitialize", CONV_OTHER, API_RUNTIME}; + cuda2hipRename["cudaProfilerStart"] = {"hipProfilerStart", CONV_OTHER, API_RUNTIME}; + cuda2hipRename["cudaProfilerStop"] = {"hipProfilerStop", CONV_OTHER, API_RUNTIME}; + cuda2hipRename["cudaChannelFormatDesc"] = {"hipChannelFormatDesc", CONV_TEX, API_RUNTIME}; + cuda2hipRename["cudaFilterModePoint"] = {"hipFilterModePoint", CONV_TEX, API_RUNTIME}; + cuda2hipRename["cudaReadModeElementType"] = {"hipReadModeElementType", CONV_TEX, API_RUNTIME}; // Channel descriptor - cuda2hipRename["cudaCreateChannelDesc"] = {"hipCreateChannelDesc", CONV_TEX}; - cuda2hipRename["cudaBindTexture"] = {"hipBindTexture", CONV_TEX}; - cuda2hipRename["cudaUnbindTexture"] = {"hipUnbindTexture", CONV_TEX}; + cuda2hipRename["cudaCreateChannelDesc"] = {"hipCreateChannelDesc", CONV_TEX, API_RUNTIME}; + cuda2hipRename["cudaBindTexture"] = {"hipBindTexture", CONV_TEX, API_RUNTIME}; + cuda2hipRename["cudaUnbindTexture"] = {"hipUnbindTexture", CONV_TEX, API_RUNTIME}; //---------------------------------------BLAS-------------------------------------// // Blas types - cuda2hipRename["cublasHandle_t"] = {"hipblasHandle_t", CONV_BLAS}; + cuda2hipRename["cublasHandle_t"] = {"hipblasHandle_t", CONV_TYPE, API_BLAS}; // Blas operations - cuda2hipRename["cublasOperation_t"] = {"hipblasOperation_t", CONV_BLAS}; - cuda2hipRename["CUBLAS_OP_N"] = {"HIPBLAS_OP_N", CONV_BLAS}; - cuda2hipRename["CUBLAS_OP_T"] = {"HIPBLAS_OP_T", CONV_BLAS}; - cuda2hipRename["CUBLAS_OP_C"] = {"HIPBLAS_OP_C", CONV_BLAS}; + cuda2hipRename["cublasOperation_t"] = {"hipblasOperation_t", CONV_TYPE, API_BLAS}; + cuda2hipRename["CUBLAS_OP_N"] = {"HIPBLAS_OP_N", CONV_NUMERIC_LITERAL, API_BLAS}; + cuda2hipRename["CUBLAS_OP_T"] = {"HIPBLAS_OP_T", CONV_NUMERIC_LITERAL, API_BLAS}; + cuda2hipRename["CUBLAS_OP_C"] = {"HIPBLAS_OP_C", CONV_NUMERIC_LITERAL, API_BLAS}; // Blas statuses - cuda2hipRename["cublasStatus_t"] = {"hipblasStatus_t", CONV_BLAS}; - cuda2hipRename["CUBLAS_STATUS_SUCCESS"] = {"HIPBLAS_STATUS_SUCCESS", CONV_BLAS}; - cuda2hipRename["CUBLAS_STATUS_NOT_INITIALIZED"] = {"HIPBLAS_STATUS_NOT_INITIALIZED", CONV_BLAS}; - cuda2hipRename["CUBLAS_STATUS_ALLOC_FAILED"] = {"HIPBLAS_STATUS_ALLOC_FAILED", CONV_BLAS}; - cuda2hipRename["CUBLAS_STATUS_INVALID_VALUE"] = {"HIPBLAS_STATUS_INVALID_VALUE", CONV_BLAS}; - cuda2hipRename["CUBLAS_STATUS_MAPPING_ERROR"] = {"HIPBLAS_STATUS_MAPPING_ERROR", CONV_BLAS}; - cuda2hipRename["CUBLAS_STATUS_EXECUTION_FAILED"] = {"HIPBLAS_STATUS_EXECUTION_FAILED", CONV_BLAS}; - cuda2hipRename["CUBLAS_STATUS_INTERNAL_ERROR"] = {"HIPBLAS_STATUS_INTERNAL_ERROR", CONV_BLAS}; - cuda2hipRename["CUBLAS_STATUS_NOT_SUPPORTED"] = {"HIPBLAS_STATUS_INTERNAL_ERROR", CONV_BLAS}; + cuda2hipRename["cublasStatus_t"] = {"hipblasStatus_t", CONV_TYPE, API_BLAS}; + cuda2hipRename["CUBLAS_STATUS_SUCCESS"] = {"HIPBLAS_STATUS_SUCCESS", CONV_NUMERIC_LITERAL, API_BLAS}; + cuda2hipRename["CUBLAS_STATUS_NOT_INITIALIZED"] = {"HIPBLAS_STATUS_NOT_INITIALIZED", CONV_NUMERIC_LITERAL, API_BLAS}; + cuda2hipRename["CUBLAS_STATUS_ALLOC_FAILED"] = {"HIPBLAS_STATUS_ALLOC_FAILED", CONV_NUMERIC_LITERAL, API_BLAS}; + cuda2hipRename["CUBLAS_STATUS_INVALID_VALUE"] = {"HIPBLAS_STATUS_INVALID_VALUE", CONV_NUMERIC_LITERAL, API_BLAS}; + cuda2hipRename["CUBLAS_STATUS_MAPPING_ERROR"] = {"HIPBLAS_STATUS_MAPPING_ERROR", CONV_NUMERIC_LITERAL, API_BLAS}; + cuda2hipRename["CUBLAS_STATUS_EXECUTION_FAILED"] = {"HIPBLAS_STATUS_EXECUTION_FAILED", CONV_NUMERIC_LITERAL, API_BLAS}; + cuda2hipRename["CUBLAS_STATUS_INTERNAL_ERROR"] = {"HIPBLAS_STATUS_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_BLAS}; + cuda2hipRename["CUBLAS_STATUS_NOT_SUPPORTED"] = {"HIPBLAS_STATUS_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_BLAS}; // Blas Fill Modes // unsupported yet by hipblas/hcblas - //cuda2hipRename["cublasFillMode_t"] = {"hipblasFillMode_t", CONV_BLAS}; - //cuda2hipRename["CUBLAS_FILL_MODE_LOWER"] = {"HIPBLAS_FILL_MODE_LOWER", CONV_BLAS}; - //cuda2hipRename["CUBLAS_FILL_MODE_UPPER"] = {"HIPBLAS_FILL_MODE_UPPER", CONV_BLAS}; + //cuda2hipRename["cublasFillMode_t"] = {"hipblasFillMode_t", CONV_TYPE, API_BLAS}; + //cuda2hipRename["CUBLAS_FILL_MODE_LOWER"] = {"HIPBLAS_FILL_MODE_LOWER", CONV_NUMERIC_LITERAL, API_BLAS}; + //cuda2hipRename["CUBLAS_FILL_MODE_UPPER"] = {"HIPBLAS_FILL_MODE_UPPER", CONV_NUMERIC_LITERAL, API_BLAS}; // Blas Diag Types // unsupported yet by hipblas/hcblas - //cuda2hipRename["cublasDiagType_t"] = {"hipblasDiagType_t", CONV_BLAS}; - //cuda2hipRename["CUBLAS_DIAG_NON_UNIT"] = {"HIPBLAS_DIAG_NON_UNIT", CONV_BLAS}; - //cuda2hipRename["CUBLAS_DIAG_UNIT"] = {"HIPBLAS_DIAG_UNIT", CONV_BLAS}; + //cuda2hipRename["cublasDiagType_t"] = {"hipblasDiagType_t", CONV_TYPE, API_BLAS}; + //cuda2hipRename["CUBLAS_DIAG_NON_UNIT"] = {"HIPBLAS_DIAG_NON_UNIT", CONV_NUMERIC_LITERAL, API_BLAS}; + //cuda2hipRename["CUBLAS_DIAG_UNIT"] = {"HIPBLAS_DIAG_UNIT", CONV_NUMERIC_LITERAL, API_BLAS}; // Blas Side Modes // unsupported yet by hipblas/hcblas - //cuda2hipRename["cublasSideMode_t"] = {"hipblasSideMode_t", CONV_BLAS}; - //cuda2hipRename["CUBLAS_SIDE_LEFT"] = {"HIPBLAS_SIDE_LEFT", CONV_BLAS}; - //cuda2hipRename["CUBLAS_SIDE_RIGHT"] = {"HIPBLAS_SIDE_RIGHT", CONV_BLAS}; + //cuda2hipRename["cublasSideMode_t"] = {"hipblasSideMode_t", CONV_TYPE, API_BLAS}; + //cuda2hipRename["CUBLAS_SIDE_LEFT"] = {"HIPBLAS_SIDE_LEFT", CONV_NUMERIC_LITERAL, API_BLAS}; + //cuda2hipRename["CUBLAS_SIDE_RIGHT"] = {"HIPBLAS_SIDE_RIGHT", CONV_NUMERIC_LITERAL, API_BLAS}; // Blas Pointer Modes // unsupported yet by hipblas/hcblas - //cuda2hipRename["cublasPointerMode_t"] = {"hipblasPointerMode_t", CONV_BLAS}; - //cuda2hipRename["CUBLAS_POINTER_MODE_HOST"] = {"HIPBLAS_POINTER_MODE_HOST", CONV_BLAS}; - //cuda2hipRename["CUBLAS_POINTER_MODE_DEVICE"] = {"HIPBLAS_POINTER_MODE_DEVICE", CONV_BLAS}; + //cuda2hipRename["cublasPointerMode_t"] = {"hipblasPointerMode_t", CONV_TYPE, API_BLAS}; + //cuda2hipRename["CUBLAS_POINTER_MODE_HOST"] = {"HIPBLAS_POINTER_MODE_HOST", CONV_NUMERIC_LITERAL, API_BLAS}; + //cuda2hipRename["CUBLAS_POINTER_MODE_DEVICE"] = {"HIPBLAS_POINTER_MODE_DEVICE", CONV_NUMERIC_LITERAL, API_BLAS}; // Blas Atomics Modes // unsupported yet by hipblas/hcblas - //cuda2hipRename["cublasAtomicsMode_t"] = {"hipblasAtomicsMode_t", CONV_BLAS}; - //cuda2hipRename["CUBLAS_ATOMICS_NOT_ALLOWED"] = {"HIPBLAS_ATOMICS_NOT_ALLOWED", CONV_BLAS}; - //cuda2hipRename["CUBLAS_ATOMICS_ALLOWED"] = {"HIPBLAS_ATOMICS_ALLOWED", CONV_BLAS}; + //cuda2hipRename["cublasAtomicsMode_t"] = {"hipblasAtomicsMode_t", CONV_TYPE, API_BLAS}; + //cuda2hipRename["CUBLAS_ATOMICS_NOT_ALLOWED"] = {"HIPBLAS_ATOMICS_NOT_ALLOWED", CONV_NUMERIC_LITERAL, API_BLAS}; + //cuda2hipRename["CUBLAS_ATOMICS_ALLOWED"] = {"HIPBLAS_ATOMICS_ALLOWED", CONV_NUMERIC_LITERAL, API_BLAS}; // Blas Data Type // unsupported yet by hipblas/hcblas - //cuda2hipRename["cublasDataType_t"] = {"hipblasDataType_t", CONV_BLAS}; - //cuda2hipRename["CUBLAS_DATA_FLOAT"] = {"HIPBLAS_DATA_FLOAT", CONV_BLAS}; - //cuda2hipRename["CUBLAS_DATA_DOUBLE"] = {"HIPBLAS_DATA_DOUBLE", CONV_BLAS}; - //cuda2hipRename["CUBLAS_DATA_HALF"] = {"HIPBLAS_DATA_HALF", CONV_BLAS}; - //cuda2hipRename["CUBLAS_DATA_INT8"] = {"HIPBLAS_DATA_INT8", CONV_BLAS}; + //cuda2hipRename["cublasDataType_t"] = {"hipblasDataType_t", CONV_TYPE, API_BLAS}; + //cuda2hipRename["CUBLAS_DATA_FLOAT"] = {"HIPBLAS_DATA_FLOAT", CONV_NUMERIC_LITERAL, API_BLAS}; + //cuda2hipRename["CUBLAS_DATA_DOUBLE"] = {"HIPBLAS_DATA_DOUBLE", CONV_NUMERIC_LITERAL, API_BLAS}; + //cuda2hipRename["CUBLAS_DATA_HALF"] = {"HIPBLAS_DATA_HALF", CONV_NUMERIC_LITERAL, API_BLAS}; + //cuda2hipRename["CUBLAS_DATA_INT8"] = {"HIPBLAS_DATA_INT8", CONV_NUMERIC_LITERAL, API_BLAS}; // Blas1 (v1) Routines - cuda2hipRename["cublasCreate"] = {"hipblasCreate", CONV_BLAS}; - cuda2hipRename["cublasDestroy"] = {"hipblasDestroy", CONV_BLAS}; + cuda2hipRename["cublasCreate"] = {"hipblasCreate", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasDestroy"] = {"hipblasDestroy", CONV_MATH_FUNC, API_BLAS}; - cuda2hipRename["cublasSetVector"] = {"hipblasSetVector", CONV_BLAS}; - cuda2hipRename["cublasGetVector"] = {"hipblasGetVector", CONV_BLAS}; - cuda2hipRename["cublasSetMatrix"] = {"hipblasSetMatrix", CONV_BLAS}; - cuda2hipRename["cublasGetMatrix"] = {"hipblasGetMatrix", CONV_BLAS}; + cuda2hipRename["cublasSetVector"] = {"hipblasSetVector", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasGetVector"] = {"hipblasGetVector", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasSetMatrix"] = {"hipblasSetMatrix", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasGetMatrix"] = {"hipblasGetMatrix", CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas - //cuda2hipRename["cublasGetMatrixAsync"] = {"hipblasGetMatrixAsync", CONV_BLAS}; - //cuda2hipRename["cublasSetMatrixAsync"] = {"hipblasSetMatrixAsync", CONV_BLAS}; + //cuda2hipRename["cublasGetMatrixAsync"] = {"hipblasGetMatrixAsync", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasSetMatrixAsync"] = {"hipblasSetMatrixAsync", CONV_MATH_FUNC, API_BLAS}; // NRM2 - //cuda2hipRename["cublasSnrm2"] = {"hipblasSnrm2", CONV_BLAS}; - //cuda2hipRename["cublasDnrm2"] = {"hipblasDnrm2", CONV_BLAS}; - //cuda2hipRename["cublasScnrm2"] = {"hipblasScnrm2", CONV_BLAS}; - //cuda2hipRename["cublasDznrm2"] = {"hipblasDznrm2", CONV_BLAS}; + //cuda2hipRename["cublasSnrm2"] = {"hipblasSnrm2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDnrm2"] = {"hipblasDnrm2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasScnrm2"] = {"hipblasScnrm2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDznrm2"] = {"hipblasDznrm2", CONV_MATH_FUNC, API_BLAS}; // DOT - cuda2hipRename["cublasSdot"] = {"hipblasSdot", CONV_BLAS}; + cuda2hipRename["cublasSdot"] = {"hipblasSdot", CONV_MATH_FUNC, API_BLAS}; // there is no such a function in CUDA - cuda2hipRename["cublasSdotBatched"] = {"hipblasSdotBatched",CONV_BLAS}; - cuda2hipRename["cublasDdot"] = {"hipblasDdot", CONV_BLAS}; + cuda2hipRename["cublasSdotBatched"] = {"hipblasSdotBatched",CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasDdot"] = {"hipblasDdot", CONV_MATH_FUNC, API_BLAS}; // there is no such a function in CUDA - cuda2hipRename["cublasDdotBatched"] = {"hipblasDdotBatched", CONV_BLAS}; - //cuda2hipRename["cublasCdotu"] = {"hipblasCdotu", CONV_BLAS}; - //cuda2hipRename["cublasCdotc"] = {"hipblasCdotc", CONV_BLAS}; - //cuda2hipRename["cublasZdotu"] = {"hipblasZdotu", CONV_BLAS}; - //cuda2hipRename["cublasZdotc"] = {"hipblasZdotc", CONV_BLAS}; + cuda2hipRename["cublasDdotBatched"] = {"hipblasDdotBatched", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCdotu"] = {"hipblasCdotu", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCdotc"] = {"hipblasCdotc", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZdotu"] = {"hipblasZdotu", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZdotc"] = {"hipblasZdotc", CONV_MATH_FUNC, API_BLAS}; // SCAL - cuda2hipRename["cublasSscal"] = {"hipblasSscal", CONV_BLAS}; + cuda2hipRename["cublasSscal"] = {"hipblasSscal", CONV_MATH_FUNC, API_BLAS}; // there is no such a function in CUDA - cuda2hipRename["cublasSscalBatched"] = {"hipblasSscalBatched", CONV_BLAS}; - cuda2hipRename["cublasDscal"] = {"hipblasDscal", CONV_BLAS}; + cuda2hipRename["cublasSscalBatched"] = {"hipblasSscalBatched", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasDscal"] = {"hipblasDscal", CONV_MATH_FUNC, API_BLAS}; // there is no such a function in CUDA - cuda2hipRename["cublasDscalBatched"] = {"hipblasDscalBatched", CONV_BLAS}; - //cuda2hipRename["cublasCscal"] = {"hipblasCscal", CONV_BLAS}; - //cuda2hipRename["cublasCsscal"] = {"hipblasCsscal", CONV_BLAS}; - //cuda2hipRename["cublasZscal"] = {"hipblasZscal", CONV_BLAS}; - //cuda2hipRename["cublasZdscal"] = {"hipblasZdscal", CONV_BLAS}; + cuda2hipRename["cublasDscalBatched"] = {"hipblasDscalBatched", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCscal"] = {"hipblasCscal", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCsscal"] = {"hipblasCsscal", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZscal"] = {"hipblasZscal", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZdscal"] = {"hipblasZdscal", CONV_MATH_FUNC, API_BLAS}; // AXPY - cuda2hipRename["cublasSaxpy"] = {"hipblasSaxpy", CONV_BLAS}; + cuda2hipRename["cublasSaxpy"] = {"hipblasSaxpy", CONV_MATH_FUNC, API_BLAS}; // there is no such a function in CUDA - cuda2hipRename["cublasSaxpyBatched"] = {"hipblasSaxpyBatched", CONV_BLAS}; - //cuda2hipRename["cublasDaxpy"] = {"hipblasDaxpy", CONV_BLAS}; - //cuda2hipRename["cublasCaxpy"] = {"hipblasCaxpy", CONV_BLAS}; - //cuda2hipRename["cublasZaxpy"] = {"hipblasZaxpy", CONV_BLAS}; + cuda2hipRename["cublasSaxpyBatched"] = {"hipblasSaxpyBatched", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDaxpy"] = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCaxpy"] = {"hipblasCaxpy", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZaxpy"] = {"hipblasZaxpy", CONV_MATH_FUNC, API_BLAS}; // COPY - cuda2hipRename["cublasScopy"] = {"hipblasScopy", CONV_BLAS}; + cuda2hipRename["cublasScopy"] = {"hipblasScopy", CONV_MATH_FUNC, API_BLAS}; // there is no such a function in CUDA - cuda2hipRename["cublasScopyBatched"] = {"hipblasScopyBatched", CONV_BLAS}; - cuda2hipRename["cublasDcopy"] = {"hipblasDcopy", CONV_BLAS}; + cuda2hipRename["cublasScopyBatched"] = {"hipblasScopyBatched", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasDcopy"] = {"hipblasDcopy", CONV_MATH_FUNC, API_BLAS}; // there is no such a function in CUDA - cuda2hipRename["cublasDcopyBatched"] = {"hipblasDcopyBatched", CONV_BLAS}; - //cuda2hipRename["cublasCcopy"] = {"hipblasCcopy", CONV_BLAS}; - //cuda2hipRename["cublasZcopy"] = {"hipblasZcopy", CONV_BLAS}; + cuda2hipRename["cublasDcopyBatched"] = {"hipblasDcopyBatched", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCcopy"] = {"hipblasCcopy", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZcopy"] = {"hipblasZcopy", CONV_MATH_FUNC, API_BLAS}; // SWAP - //cuda2hipRename["cublasSswap"] = {"hipblasSswap", CONV_BLAS}; - //cuda2hipRename["cublasDswap"] = {"hipblasDswap", CONV_BLAS}; - //cuda2hipRename["cublasCswap"] = {"hipblasCswap", CONV_BLAS}; - //cuda2hipRename["cublasZswap"] = {"hipblasZswap", CONV_BLAS}; + //cuda2hipRename["cublasSswap"] = {"hipblasSswap", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDswap"] = {"hipblasDswap", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCswap"] = {"hipblasCswap", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZswap"] = {"hipblasZswap", CONV_MATH_FUNC, API_BLAS}; // AMAX - //cuda2hipRename["cublasIsamax"] = {"hipblasIsamax", CONV_BLAS}; - //cuda2hipRename["cublasIdamax"] = {"hipblasIdamax", CONV_BLAS}; - //cuda2hipRename["cublasIcamax"] = {"hipblasIcamax", CONV_BLAS}; - //cuda2hipRename["cublasIzamax"] = {"hipblasIzamax", CONV_BLAS}; + //cuda2hipRename["cublasIsamax"] = {"hipblasIsamax", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasIdamax"] = {"hipblasIdamax", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasIcamax"] = {"hipblasIcamax", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasIzamax"] = {"hipblasIzamax", CONV_MATH_FUNC, API_BLAS}; // AMIN - //cuda2hipRename["cublasIsamin"] = {"hipblasIsamin", CONV_BLAS}; - //cuda2hipRename["cublasIdamin"] = {"hipblasIdamin", CONV_BLAS}; - //cuda2hipRename["cublasIcamin"] = {"hipblasIcamin", CONV_BLAS}; - //cuda2hipRename["cublasIzamin"] = {"hipblasIzamin", CONV_BLAS}; + //cuda2hipRename["cublasIsamin"] = {"hipblasIsamin", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasIdamin"] = {"hipblasIdamin", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasIcamin"] = {"hipblasIcamin", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasIzamin"] = {"hipblasIzamin", CONV_MATH_FUNC, API_BLAS}; // ASUM - cuda2hipRename["cublasSasum"] = {"hipblasSasum", CONV_BLAS}; + cuda2hipRename["cublasSasum"] = {"hipblasSasum", CONV_MATH_FUNC, API_BLAS}; // there is no such a function in CUDA - cuda2hipRename["cublasSasumBatched"] = {"hipblasSasumBatched", CONV_BLAS}; - cuda2hipRename["cublasDasum"] = {"hipblasDasum", CONV_BLAS}; + cuda2hipRename["cublasSasumBatched"] = {"hipblasSasumBatched", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasDasum"] = {"hipblasDasum", CONV_MATH_FUNC, API_BLAS}; // there is no such a function in CUDA - cuda2hipRename["cublasDasumBatched"] = {"hipblasDasumBatched", CONV_BLAS}; - //cuda2hipRename["cublasScasum"] = {"hipblasScasum", CONV_BLAS}; - //cuda2hipRename["cublasDzasum"] = {"hipblasDzasum", CONV_BLAS}; + cuda2hipRename["cublasDasumBatched"] = {"hipblasDasumBatched", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasScasum"] = {"hipblasScasum", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDzasum"] = {"hipblasDzasum", CONV_MATH_FUNC, API_BLAS}; // ROT - //cuda2hipRename["cublasSrot"] = {"hipblasSrot", CONV_BLAS}; - //cuda2hipRename["cublasDrot"] = {"hipblasDrot", CONV_BLAS}; - //cuda2hipRename["cublasCrot"] = {"hipblasCrot", CONV_BLAS}; - //cuda2hipRename["cublasCsrot"] = {"hipblasCsrot", CONV_BLAS}; - //cuda2hipRename["cublasZrot"] = {"hipblasZrot", CONV_BLAS}; - //cuda2hipRename["cublasZdrot"] = {"hipblasZdrot", CONV_BLAS}; + //cuda2hipRename["cublasSrot"] = {"hipblasSrot", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDrot"] = {"hipblasDrot", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCrot"] = {"hipblasCrot", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCsrot"] = {"hipblasCsrot", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZrot"] = {"hipblasZrot", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZdrot"] = {"hipblasZdrot", CONV_MATH_FUNC, API_BLAS}; // ROTG - //cuda2hipRename["cublasSrotg"] = {"hipblasSrotg", CONV_BLAS}; - //cuda2hipRename["cublasDrotg"] = {"hipblasDrotg", CONV_BLAS}; - //cuda2hipRename["cublasCrotg"] = {"hipblasCrotg", CONV_BLAS}; - //cuda2hipRename["cublasZrotg"] = {"hipblasZrotg", CONV_BLAS}; + //cuda2hipRename["cublasSrotg"] = {"hipblasSrotg", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDrotg"] = {"hipblasDrotg", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCrotg"] = {"hipblasCrotg", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZrotg"] = {"hipblasZrotg", CONV_MATH_FUNC, API_BLAS}; // ROTM - //cuda2hipRename["cublasSrotm"] = {"hipblasSrotm", CONV_BLAS}; - //cuda2hipRename["cublasDrotm"] = {"hipblasDrotm", CONV_BLAS}; + //cuda2hipRename["cublasSrotm"] = {"hipblasSrotm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDrotm"] = {"hipblasDrotm", CONV_MATH_FUNC, API_BLAS}; // ROTMG - //cuda2hipRename["cublasSrotmg"] = {"hipblasSrotmg", CONV_BLAS}; - //cuda2hipRename["cublasDrotmg"] = {"hipblasDrotmg", CONV_BLAS}; + //cuda2hipRename["cublasSrotmg"] = {"hipblasSrotmg", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDrotmg"] = {"hipblasDrotmg", CONV_MATH_FUNC, API_BLAS}; // GEMV - cuda2hipRename["cublasSgemv"] = {"hipblasSgemv", CONV_BLAS}; + cuda2hipRename["cublasSgemv"] = {"hipblasSgemv", CONV_MATH_FUNC, API_BLAS}; // there is no such a function in CUDA - cuda2hipRename["cublasSgemvBatched"] = {"hipblasSgemvBatched", CONV_BLAS}; - //cuda2hipRename["cublasDgemv"] = {"hipblasDgemv", CONV_BLAS}; - //cuda2hipRename["cublasCgemv"] = {"hipblasCgemv", CONV_BLAS}; - //cuda2hipRename["cublasZgemv"] = {"hipblasZgemv", CONV_BLAS}; + cuda2hipRename["cublasSgemvBatched"] = {"hipblasSgemvBatched", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDgemv"] = {"hipblasDgemv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCgemv"] = {"hipblasCgemv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZgemv"] = {"hipblasZgemv", CONV_MATH_FUNC, API_BLAS}; // GBMV - //cuda2hipRename["cublasSgbmv"] = {"hipblasSgbmv", CONV_BLAS}; - //cuda2hipRename["cublasDgbmv"] = {"hipblasDgbmv", CONV_BLAS}; - //cuda2hipRename["cublasCgbmv"] = {"hipblasCgbmv", CONV_BLAS}; - //cuda2hipRename["cublasZgbmv"] = {"hipblasZgbmv", CONV_BLAS}; + //cuda2hipRename["cublasSgbmv"] = {"hipblasSgbmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDgbmv"] = {"hipblasDgbmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCgbmv"] = {"hipblasCgbmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZgbmv"] = {"hipblasZgbmv", CONV_MATH_FUNC, API_BLAS}; // TRMV - //cuda2hipRename["cublasStrmv"] = {"hipblasStrmv", CONV_BLAS}; - //cuda2hipRename["cublasDtrmv"] = {"hipblasDtrmv", CONV_BLAS}; - //cuda2hipRename["cublasCtrmv"] = {"hipblasCtrmv", CONV_BLAS}; - //cuda2hipRename["cublasZtrmv"] = {"hipblasZtrmv", CONV_BLAS}; + //cuda2hipRename["cublasStrmv"] = {"hipblasStrmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDtrmv"] = {"hipblasDtrmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCtrmv"] = {"hipblasCtrmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZtrmv"] = {"hipblasZtrmv", CONV_MATH_FUNC, API_BLAS}; // TBMV - //cuda2hipRename["cublasStbmv"] = {"hipblasStbmv", CONV_BLAS}; - //cuda2hipRename["cublasDtbmv"] = {"hipblasDtbmv", CONV_BLAS}; - //cuda2hipRename["cublasCtbmv"] = {"hipblasCtbmv", CONV_BLAS}; - //cuda2hipRename["cublasZtbmv"] = {"hipblasZtbmv", CONV_BLAS}; + //cuda2hipRename["cublasStbmv"] = {"hipblasStbmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDtbmv"] = {"hipblasDtbmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCtbmv"] = {"hipblasCtbmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZtbmv"] = {"hipblasZtbmv", CONV_MATH_FUNC, API_BLAS}; // TPMV - //cuda2hipRename["cublasStpmv"] = {"hipblasStpmv", CONV_BLAS}; - //cuda2hipRename["cublasDtpmv"] = {"hipblasDtpmv", CONV_BLAS}; - //cuda2hipRename["cublasCtpmv"] = {"hipblasCtpmv", CONV_BLAS}; - //cuda2hipRename["cublasZtpmv"] = {"hipblasZtpmv", CONV_BLAS}; + //cuda2hipRename["cublasStpmv"] = {"hipblasStpmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDtpmv"] = {"hipblasDtpmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCtpmv"] = {"hipblasCtpmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZtpmv"] = {"hipblasZtpmv", CONV_MATH_FUNC, API_BLAS}; // TRSV - //cuda2hipRename["cublasStrsv"] = {"hipblasStrsv", CONV_BLAS}; - //cuda2hipRename["cublasDtrsv"] = {"hipblasDtrsv", CONV_BLAS}; - //cuda2hipRename["cublasCtrsv"] = {"hipblasCtrsv", CONV_BLAS}; - //cuda2hipRename["cublasZtrsv"] = {"hipblasZtrsv", CONV_BLAS}; + //cuda2hipRename["cublasStrsv"] = {"hipblasStrsv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDtrsv"] = {"hipblasDtrsv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCtrsv"] = {"hipblasCtrsv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZtrsv"] = {"hipblasZtrsv", CONV_MATH_FUNC, API_BLAS}; // TPSV - //cuda2hipRename["cublasStpsv"] = {"hipblasStpsv", CONV_BLAS}; - //cuda2hipRename["cublasDtpsv"] = {"hipblasDtpsv", CONV_BLAS}; - //cuda2hipRename["cublasCtpsv"] = {"hipblasCtpsv", CONV_BLAS}; - //cuda2hipRename["cublasZtpsv"] = {"hipblasZtpsv", CONV_BLAS}; + //cuda2hipRename["cublasStpsv"] = {"hipblasStpsv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDtpsv"] = {"hipblasDtpsv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCtpsv"] = {"hipblasCtpsv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZtpsv"] = {"hipblasZtpsv", CONV_MATH_FUNC, API_BLAS}; // TBSV - //cuda2hipRename["cublasStbsv"] = {"hipblasStbsv", CONV_BLAS}; - //cuda2hipRename["cublasDtbsv"] = {"hipblasDtbsv", CONV_BLAS}; - //cuda2hipRename["cublasCtbsv"] = {"hipblasCtbsv", CONV_BLAS}; - //cuda2hipRename["cublasZtbsv"] = {"hipblasZtbsv", CONV_BLAS}; + //cuda2hipRename["cublasStbsv"] = {"hipblasStbsv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDtbsv"] = {"hipblasDtbsv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCtbsv"] = {"hipblasCtbsv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZtbsv"] = {"hipblasZtbsv", CONV_MATH_FUNC, API_BLAS}; // SYMV/HEMV - //cuda2hipRename["cublasSsymv"] = {"hipblasSsymv", CONV_BLAS}; - //cuda2hipRename["cublasDsymv"] = {"hipblasDsymv", CONV_BLAS}; - //cuda2hipRename["cublasCsymv"] = {"hipblasCsymv", CONV_BLAS}; - //cuda2hipRename["cublasZsymv"] = {"hipblasZsymv", CONV_BLAS}; - //cuda2hipRename["cublasChemv"] = {"hipblasChemv", CONV_BLAS}; - //cuda2hipRename["cublasZhemv"] = {"hipblasZhemv", CONV_BLAS}; + //cuda2hipRename["cublasSsymv"] = {"hipblasSsymv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDsymv"] = {"hipblasDsymv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCsymv"] = {"hipblasCsymv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZsymv"] = {"hipblasZsymv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasChemv"] = {"hipblasChemv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZhemv"] = {"hipblasZhemv", CONV_MATH_FUNC, API_BLAS}; // SBMV/HBMV - //cuda2hipRename["cublasSsbmv"] = {"hipblasSsbmv", CONV_BLAS}; - //cuda2hipRename["cublasDsbmv"] = {"hpiblasDsbmv", CONV_BLAS}; - //cuda2hipRename["cublasChbmv"] = {"hipblasChbmv", CONV_BLAS}; - //cuda2hipRename["cublasZhbmv"] = {"hipblasZhbmv", CONV_BLAS}; + //cuda2hipRename["cublasSsbmv"] = {"hipblasSsbmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDsbmv"] = {"hpiblasDsbmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasChbmv"] = {"hipblasChbmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZhbmv"] = {"hipblasZhbmv", CONV_MATH_FUNC, API_BLAS}; // SPMV/HPMV - //cuda2hipRename["cublasSspmv"] = {"hipblasSspmv", CONV_BLAS}; - //cuda2hipRename["cublasDspmv"] = {"hipblasDspmv", CONV_BLAS}; - //cuda2hipRename["cublasChpmv"] = {"hipblasChpmv", CONV_BLAS}; - //cuda2hipRename["cublasZhpmv"] = {"hipblasZhpmv", CONV_BLAS}; + //cuda2hipRename["cublasSspmv"] = {"hipblasSspmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDspmv"] = {"hipblasDspmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasChpmv"] = {"hipblasChpmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZhpmv"] = {"hipblasZhpmv", CONV_MATH_FUNC, API_BLAS}; // GER - cuda2hipRename["cublasSger"] = {"hipblasSger", CONV_BLAS}; - //cuda2hipRename["cublasDger"] = {"hipblasDger", CONV_BLAS}; - //cuda2hipRename["cublasCgeru"] = {"hipblasCgeru", CONV_BLAS}; - //cuda2hipRename["cublasCgerc"] = {"hipblasCgerc", CONV_BLAS}; - //cuda2hipRename["cublasZgeru"] = {"hipblasZgeru", CONV_BLAS}; - //cuda2hipRename["cublasZgerc"] = {"hipblasZgerc", CONV_BLAS}; + cuda2hipRename["cublasSger"] = {"hipblasSger", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDger"] = {"hipblasDger", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCgeru"] = {"hipblasCgeru", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCgerc"] = {"hipblasCgerc", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZgeru"] = {"hipblasZgeru", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZgerc"] = {"hipblasZgerc", CONV_MATH_FUNC, API_BLAS}; // SYR/HER - //cuda2hipRename["cublasSsyr"] = {"hipblasSsyr", CONV_BLAS}; - //cuda2hipRename["cublasDsyr"] = {"hipblasDsyr", CONV_BLAS}; - //cuda2hipRename["cublasCher"] = {"hipblasCher", CONV_BLAS}; - //cuda2hipRename["cublasZher"] = {"hipblasZher", CONV_BLAS}; + //cuda2hipRename["cublasSsyr"] = {"hipblasSsyr", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDsyr"] = {"hipblasDsyr", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCher"] = {"hipblasCher", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZher"] = {"hipblasZher", CONV_MATH_FUNC, API_BLAS}; // SPR/HPR - //cuda2hipRename["cublasSspr"] = {"hipblasSspr", CONV_BLAS}; - //cuda2hipRename["cublasDspr"] = {"hipblasDspr", CONV_BLAS}; - //cuda2hipRename["cublasChpr"] = {"hipblasChpr", CONV_BLAS}; - //cuda2hipRename["cublasZhpr"] = {"hipblasZhpr", CONV_BLAS}; + //cuda2hipRename["cublasSspr"] = {"hipblasSspr", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDspr"] = {"hipblasDspr", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasChpr"] = {"hipblasChpr", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZhpr"] = {"hipblasZhpr", CONV_MATH_FUNC, API_BLAS}; // SYR2/HER2 - //cuda2hipRename["cublasSsyr2"] = {"hipblasSsyr2", CONV_BLAS}; - //cuda2hipRename["cublasDsyr2"] = {"hipblasDsyr2", CONV_BLAS}; - //cuda2hipRename["cublasCher2"] = {"hipblasCher2", CONV_BLAS}; - //cuda2hipRename["cublasZher2"] = {"hipblasZher2", CONV_BLAS}; + //cuda2hipRename["cublasSsyr2"] = {"hipblasSsyr2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDsyr2"] = {"hipblasDsyr2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCher2"] = {"hipblasCher2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZher2"] = {"hipblasZher2", CONV_MATH_FUNC, API_BLAS}; // SPR2/HPR2 - //cuda2hipRename["cublasSspr2"] = {"hipblasSspr2", CONV_BLAS}; - //cuda2hipRename["cublasDspr2"] = {"hipblasDspr2", CONV_BLAS}; - //cuda2hipRename["cublasChpr2"] = {"hipblasChpr2", CONV_BLAS}; - //cuda2hipRename["cublasZhpr2"] = {"hipblasZhpr2", CONV_BLAS}; + //cuda2hipRename["cublasSspr2"] = {"hipblasSspr2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDspr2"] = {"hipblasDspr2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasChpr2"] = {"hipblasChpr2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZhpr2"] = {"hipblasZhpr2", CONV_MATH_FUNC, API_BLAS}; // Blas3 (v1) Routines // GEMM - cuda2hipRename["cublasSgemm"] = {"hipblasSgemm", CONV_BLAS}; - //cuda2hipRename["cublasDgemm"] = {"hipblasDgemm", CONV_BLAS}; - cuda2hipRename["cublasCgemm"] = {"hipblasCgemm", CONV_BLAS}; - //cuda2hipRename["cublasZgemm"] = {"hipblasZgemm", CONV_BLAS}; + cuda2hipRename["cublasSgemm"] = {"hipblasSgemm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDgemm"] = {"hipblasDgemm", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasCgemm"] = {"hipblasCgemm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZgemm"] = {"hipblasZgemm", CONV_MATH_FUNC, API_BLAS}; // BATCH GEMM - cuda2hipRename["cublasSgemmBatched"] = {"hipblasSgemmBatched", CONV_BLAS}; - //cuda2hipRename["cublasDgemmBatched"] = {"hipblasDgemmBatched", CONV_BLAS}; - cuda2hipRename["cublasCgemmBatched"] = {"hipblasCgemmBatched", CONV_BLAS}; - //cuda2hipRename["cublasZgemmBatched"] = {"hipblasZgemmBatched", CONV_BLAS}; + cuda2hipRename["cublasSgemmBatched"] = {"hipblasSgemmBatched", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDgemmBatched"] = {"hipblasDgemmBatched", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasCgemmBatched"] = {"hipblasCgemmBatched", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZgemmBatched"] = {"hipblasZgemmBatched", CONV_MATH_FUNC, API_BLAS}; // SYRK - //cuda2hipRename["cublasSsyrk"] = {"hipblasSsyrk", CONV_BLAS}; - //cuda2hipRename["cublasDsyrk"] = {"hipblasDsyrk", CONV_BLAS}; - //cuda2hipRename["cublasCsyrk"] = {"hipblasCsyrk", CONV_BLAS}; - //cuda2hipRename["cublasZsyrk"] = {"hipblasZsyrk", CONV_BLAS}; + //cuda2hipRename["cublasSsyrk"] = {"hipblasSsyrk", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDsyrk"] = {"hipblasDsyrk", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCsyrk"] = {"hipblasCsyrk", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZsyrk"] = {"hipblasZsyrk", CONV_MATH_FUNC, API_BLAS}; // HERK - //cuda2hipRename["cublasCherk"] = {"hipblasCherk", CONV_BLAS}; - //cuda2hipRename["cublasZherk"] = {"hipblasZherk", CONV_BLAS}; + //cuda2hipRename["cublasCherk"] = {"hipblasCherk", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZherk"] = {"hipblasZherk", CONV_MATH_FUNC, API_BLAS}; // SYR2K - //cuda2hipRename["cublasSsyr2k"] = {"hipblasSsyr2k", CONV_BLAS}; - //cuda2hipRename["cublasDsyr2k"] = {"hipblasDsyr2k", CONV_BLAS}; - //cuda2hipRename["cublasCsyr2k"] = {"hipblasCsyr2k", CONV_BLAS}; - //cuda2hipRename["cublasZsyr2k"] = {"hipblasZsyr2k", CONV_BLAS}; + //cuda2hipRename["cublasSsyr2k"] = {"hipblasSsyr2k", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDsyr2k"] = {"hipblasDsyr2k", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCsyr2k"] = {"hipblasCsyr2k", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZsyr2k"] = {"hipblasZsyr2k", CONV_MATH_FUNC, API_BLAS}; // SYRKX - eXtended SYRK // cublasSsyrkx @@ -652,40 +663,40 @@ struct cuda2hipMap { // cublasZsyrkx // HER2K - //cuda2hipRename["cublasCher2k"] = {"hipblasCher2k", CONV_BLAS}; - //cuda2hipRename["cublasZher2k"] = {"hipblasZher2k", CONV_BLAS}; + //cuda2hipRename["cublasCher2k"] = {"hipblasCher2k", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZher2k"] = {"hipblasZher2k", CONV_MATH_FUNC, API_BLAS}; // HERKX - eXtended HERK // cublasCherkx // cublasZherkx // SYMM - //cuda2hipRename["cublasSsymm"] = {"hipblasSsymm", CONV_BLAS}; - //cuda2hipRename["cublasDsymm"] = {"hipblasDsymm", CONV_BLAS}; - //cuda2hipRename["cublasCsymm"] = {"hipblasCsymm", CONV_BLAS}; - //cuda2hipRename["cublasZsymm"] = {"hipblasZsymm", CONV_BLAS}; + //cuda2hipRename["cublasSsymm"] = {"hipblasSsymm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDsymm"] = {"hipblasDsymm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCsymm"] = {"hipblasCsymm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZsymm"] = {"hipblasZsymm", CONV_MATH_FUNC, API_BLAS}; // HEMM - //cuda2hipRename["cublasChemm"] = {"hipblasChemm", CONV_BLAS}; - //cuda2hipRename["cublasZhemm"] = {"hipblasZhemm", CONV_BLAS}; + //cuda2hipRename["cublasChemm"] = {"hipblasChemm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZhemm"] = {"hipblasZhemm", CONV_MATH_FUNC, API_BLAS}; // TRSM - //cuda2hipRename["cublasStrsm"] = {"hipblasStrsm", CONV_BLAS}; - //cuda2hipRename["cublasDtrsm"] = {"hipblasDtrsm", CONV_BLAS}; - //cuda2hipRename["cublasCtrsm"] = {"hipblasCtrsm", CONV_BLAS}; - //cuda2hipRename["cublasZtrsm"] = {"hipblasZtrsm", CONV_BLAS}; + //cuda2hipRename["cublasStrsm"] = {"hipblasStrsm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDtrsm"] = {"hipblasDtrsm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCtrsm"] = {"hipblasCtrsm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZtrsm"] = {"hipblasZtrsm", CONV_MATH_FUNC, API_BLAS}; // TRSM - Batched Triangular Solver - //cuda2hipRename["cublasStrsmBatched"] = {"hipblasStrsmBatched", CONV_BLAS}; - //cuda2hipRename["cublasDtrsmBatched"] = {"hipblasDtrsmBatched", CONV_BLAS}; - //cuda2hipRename["cublasCtrsmBatched"] = {"hipblasCtrsmBatched", CONV_BLAS}; - //cuda2hipRename["cublasZtrsmBatched"] = {"hipblasZtrsmBatched", CONV_BLAS}; + //cuda2hipRename["cublasStrsmBatched"] = {"hipblasStrsmBatched", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDtrsmBatched"] = {"hipblasDtrsmBatched", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCtrsmBatched"] = {"hipblasCtrsmBatched", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZtrsmBatched"] = {"hipblasZtrsmBatched", CONV_MATH_FUNC, API_BLAS}; // TRMM - //cuda2hipRename["cublasStrmm"] = {"hipblasStrmm", CONV_BLAS}; - //cuda2hipRename["cublasDtrmm"] = {"hipblasDtrmm", CONV_BLAS}; - //cuda2hipRename["cublasCtrmm"] = {"hipblasCtrmm", CONV_BLAS}; - //cuda2hipRename["cublasZtrmm"] = {"hipblasZtrmm", CONV_BLAS}; + //cuda2hipRename["cublasStrmm"] = {"hipblasStrmm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDtrmm"] = {"hipblasDtrmm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCtrmm"] = {"hipblasCtrmm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZtrmm"] = {"hipblasZtrmm", CONV_MATH_FUNC, API_BLAS}; // TO SUPPORT OR NOT? (cublas_api.h) @@ -758,256 +769,257 @@ struct cuda2hipMap { // cublasZtrttp // Blas2 (v2) Routines - cuda2hipRename["cublasCreate_v2"] = {"hipblasCreate", CONV_BLAS}; - cuda2hipRename["cublasDestroy_v2"] = {"hipblasDestroy", CONV_BLAS}; + cuda2hipRename["cublasCreate_v2"] = {"hipblasCreate", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasDestroy_v2"] = {"hipblasDestroy", CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas - //cuda2hipRename["cublasGetVersion_v2"] = {"hipblasGetVersion", CONV_BLAS}; - //cuda2hipRename["cublasSetStream_v2"] = {"hipblasSetStream", CONV_BLAS}; - //cuda2hipRename["cublasGetStream_v2"] = {"hipblasGetStream", CONV_BLAS}; - //cuda2hipRename["cublasGetPointerMode_v2"] = {"hipblasGetPointerMode", CONV_BLAS}; - //cuda2hipRename["cublasSetPointerMode_v2"] = {"hipblasSetPointerMode", CONV_BLAS}; + //cuda2hipRename["cublasGetVersion_v2"] = {"hipblasGetVersion", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasSetStream_v2"] = {"hipblasSetStream", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasGetStream_v2"] = {"hipblasGetStream", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasGetPointerMode_v2"] = {"hipblasGetPointerMode", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasSetPointerMode_v2"] = {"hipblasSetPointerMode", CONV_MATH_FUNC, API_BLAS}; // GEMV - cuda2hipRename["cublasSgemv_v2"] = {"hipblasSgemv", CONV_BLAS}; - //cuda2hipRename["cublasDgemv_v2"] = {"hipblasDgemv", CONV_BLAS}; - //cuda2hipRename["cublasCgemv_v2"] = {"hipblasCgemv", CONV_BLAS}; - //cuda2hipRename["cublasZgemv_v2"] = {"hipblasZgemv", CONV_BLAS}; + cuda2hipRename["cublasSgemv_v2"] = {"hipblasSgemv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDgemv_v2"] = {"hipblasDgemv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCgemv_v2"] = {"hipblasCgemv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZgemv_v2"] = {"hipblasZgemv", CONV_MATH_FUNC, API_BLAS}; // GBMV - //cuda2hipRename["cublasSgbmv_v2"] = {"hipblasSgbmv", CONV_BLAS}; - //cuda2hipRename["cublasDgbmv_v2"] = {"hipblasDgbmv", CONV_BLAS}; - //cuda2hipRename["cublasCgbmv_v2"] = {"hipblasCgbmv", CONV_BLAS}; - //cuda2hipRename["cublasZgbmv_v2"] = {"hipblasZgbmv", CONV_BLAS}; + //cuda2hipRename["cublasSgbmv_v2"] = {"hipblasSgbmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDgbmv_v2"] = {"hipblasDgbmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCgbmv_v2"] = {"hipblasCgbmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZgbmv_v2"] = {"hipblasZgbmv", CONV_MATH_FUNC, API_BLAS}; // TRMV - //cuda2hipRename["cublasStrmv_v2"] = {"hipblasStrmv", CONV_BLAS}; - //cuda2hipRename["cublasDtrmv_v2"] = {"hipblasDtrmv", CONV_BLAS}; - //cuda2hipRename["cublasCtrmv_v2"] = {"hipblasCtrmv", CONV_BLAS}; - //cuda2hipRename["cublasZtrmv_v2"] = {"hipblasZtrmv", CONV_BLAS}; + //cuda2hipRename["cublasStrmv_v2"] = {"hipblasStrmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDtrmv_v2"] = {"hipblasDtrmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCtrmv_v2"] = {"hipblasCtrmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZtrmv_v2"] = {"hipblasZtrmv", CONV_MATH_FUNC, API_BLAS}; // TBMV - //cuda2hipRename["cublasStbmv_v2"] = {"hipblasStbmv", CONV_BLAS}; - //cuda2hipRename["cublasDtbmv_v2"] = {"hipblasDtbmv", CONV_BLAS}; - //cuda2hipRename["cublasCtbmv_v2"] = {"hipblasCtbmv", CONV_BLAS}; - //cuda2hipRename["cublasZtbmv_v2"] = {"hipblasZtbmv", CONV_BLAS}; + //cuda2hipRename["cublasStbmv_v2"] = {"hipblasStbmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDtbmv_v2"] = {"hipblasDtbmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCtbmv_v2"] = {"hipblasCtbmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZtbmv_v2"] = {"hipblasZtbmv", CONV_MATH_FUNC, API_BLAS}; // TPMV - //cuda2hipRename["cublasStpmv_v2"] = {"hipblasStpmv", CONV_BLAS}; - //cuda2hipRename["cublasDtpmv_v2"] = {"hipblasDtpmv", CONV_BLAS}; - //cuda2hipRename["cublasCtpmv_v2"] = {"hipblasCtpmv", CONV_BLAS}; - //cuda2hipRename["cublasZtpmv_v2"] = {"hipblasZtpmv", CONV_BLAS}; + //cuda2hipRename["cublasStpmv_v2"] = {"hipblasStpmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDtpmv_v2"] = {"hipblasDtpmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCtpmv_v2"] = {"hipblasCtpmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZtpmv_v2"] = {"hipblasZtpmv", CONV_MATH_FUNC, API_BLAS}; // TRSV - //cuda2hipRename["cublasStrsv_v2"] = {"hipblasStrsv", CONV_BLAS}; - //cuda2hipRename["cublasDtrsv_v2"] = {"hipblasDtrsv", CONV_BLAS}; - //cuda2hipRename["cublasCtrsv_v2"] = {"hipblasCtrsv", CONV_BLAS}; - //cuda2hipRename["cublasZtrsv_v2"] = {"hipblasZtrsv", CONV_BLAS}; + //cuda2hipRename["cublasStrsv_v2"] = {"hipblasStrsv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDtrsv_v2"] = {"hipblasDtrsv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCtrsv_v2"] = {"hipblasCtrsv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZtrsv_v2"] = {"hipblasZtrsv", CONV_MATH_FUNC, API_BLAS}; // TPSV - //cuda2hipRename["cublasStpsv_v2"] = {"hipblasStpsv", CONV_BLAS}; - //cuda2hipRename["cublasDtpsv_v2"] = {"hipblasDtpsv", CONV_BLAS}; - //cuda2hipRename["cublasCtpsv_v2"] = {"hipblasCtpsv", CONV_BLAS}; - //cuda2hipRename["cublasZtpsv_v2"] = {"hipblasZtpsv", CONV_BLAS}; + //cuda2hipRename["cublasStpsv_v2"] = {"hipblasStpsv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDtpsv_v2"] = {"hipblasDtpsv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCtpsv_v2"] = {"hipblasCtpsv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZtpsv_v2"] = {"hipblasZtpsv", CONV_MATH_FUNC, API_BLAS}; // TBSV - //cuda2hipRename["cublasStbsv_v2"] = {"hipblasStbsv", CONV_BLAS}; - //cuda2hipRename["cublasDtbsv_v2"] = {"hipblasDtbsv", CONV_BLAS}; - //cuda2hipRename["cublasCtbsv_v2"] = {"hipblasCtbsv", CONV_BLAS}; - //cuda2hipRename["cublasZtbsv_v2"] = {"hipblasZtbsv", CONV_BLAS}; + //cuda2hipRename["cublasStbsv_v2"] = {"hipblasStbsv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDtbsv_v2"] = {"hipblasDtbsv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCtbsv_v2"] = {"hipblasCtbsv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZtbsv_v2"] = {"hipblasZtbsv", CONV_MATH_FUNC, API_BLAS}; // SYMV/HEMV - //cuda2hipRename["cublasSsymv_v2"] = {"hipblasSsymv", CONV_BLAS}; - //cuda2hipRename["cublasDsymv_v2"] = {"hipblasDsymv", CONV_BLAS}; - //cuda2hipRename["cublasCsymv_v2"] = {"hipblasCsymv", CONV_BLAS}; - //cuda2hipRename["cublasZsymv_v2"] = {"hipblasZsymv", CONV_BLAS}; - //cuda2hipRename["cublasChemv_v2"] = {"hipblasChemv", CONV_BLAS}; - //cuda2hipRename["cublasZhemv_v2"] = {"hipblasZhemv", CONV_BLAS}; + //cuda2hipRename["cublasSsymv_v2"] = {"hipblasSsymv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDsymv_v2"] = {"hipblasDsymv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCsymv_v2"] = {"hipblasCsymv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZsymv_v2"] = {"hipblasZsymv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasChemv_v2"] = {"hipblasChemv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZhemv_v2"] = {"hipblasZhemv", CONV_MATH_FUNC, API_BLAS}; // SBMV/HBMV - //cuda2hipRename["cublasSsbmv_v2"] = {"hipblasSsbmv", CONV_BLAS}; - //cuda2hipRename["cublasDsbmv_v2"] = {"hpiblasDsbmv", CONV_BLAS}; - //cuda2hipRename["cublasChbmv_v2"] = {"hipblasChbmv", CONV_BLAS}; - //cuda2hipRename["cublasZhbmv_v2"] = {"hipblasZhbmv", CONV_BLAS}; + //cuda2hipRename["cublasSsbmv_v2"] = {"hipblasSsbmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDsbmv_v2"] = {"hpiblasDsbmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasChbmv_v2"] = {"hipblasChbmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZhbmv_v2"] = {"hipblasZhbmv", CONV_MATH_FUNC, API_BLAS}; // SPMV/HPMV - //cuda2hipRename["cublasSspmv_v2"] = {"hipblasSspmv", CONV_BLAS}; - //cuda2hipRename["cublasDspmv_v2"] = {"hipblasDspmv", CONV_BLAS}; - //cuda2hipRename["cublasChpmv_v2"] = {"hipblasChpmv", CONV_BLAS}; - //cuda2hipRename["cublasZhpmv_v2"] = {"hipblasZhpmv", CONV_BLAS}; + //cuda2hipRename["cublasSspmv_v2"] = {"hipblasSspmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDspmv_v2"] = {"hipblasDspmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasChpmv_v2"] = {"hipblasChpmv", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZhpmv_v2"] = {"hipblasZhpmv", CONV_MATH_FUNC, API_BLAS}; // GER - cuda2hipRename["cublasSger_v2"] = {"hipblasSger", CONV_BLAS}; - //cuda2hipRename["cublasDger_v2"] = {"hipblasDger", CONV_BLAS}; - //cuda2hipRename["cublasCgeru_v2"] = {"hipblasCgeru", CONV_BLAS}; - //cuda2hipRename["cublasCgerc_v2"] = {"hipblasCgerc", CONV_BLAS}; - //cuda2hipRename["cublasZgeru_v2"] = {"hipblasZgeru", CONV_BLAS}; - //cuda2hipRename["cublasZgerc_v2"] = {"hipblasZgerc", CONV_BLAS}; + cuda2hipRename["cublasSger_v2"] = {"hipblasSger", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDger_v2"] = {"hipblasDger", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCgeru_v2"] = {"hipblasCgeru", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCgerc_v2"] = {"hipblasCgerc", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZgeru_v2"] = {"hipblasZgeru", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZgerc_v2"] = {"hipblasZgerc", CONV_MATH_FUNC, API_BLAS}; // SYR/HER - //cuda2hipRename["cublasSsyr_v2"] = {"hipblasSsyr", CONV_BLAS}; - //cuda2hipRename["cublasDsyr_v2"] = {"hipblasDsyr", CONV_BLAS}; - //cuda2hipRename["cublasCsyr_v2"] = {"hipblasCsyr", CONV_BLAS}; - //cuda2hipRename["cublasZsyr_v2"] = {"hipblasZsyr", CONV_BLAS}; - //cuda2hipRename["cublasCher_v2"] = {"hipblasCher", CONV_BLAS}; - //cuda2hipRename["cublasZher_v2"] = {"hipblasZher", CONV_BLAS}; + //cuda2hipRename["cublasSsyr_v2"] = {"hipblasSsyr", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDsyr_v2"] = {"hipblasDsyr", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCsyr_v2"] = {"hipblasCsyr", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZsyr_v2"] = {"hipblasZsyr", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCher_v2"] = {"hipblasCher", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZher_v2"] = {"hipblasZher", CONV_MATH_FUNC, API_BLAS}; // SPR/HPR - //cuda2hipRename["cublasSspr_v2"] = {"hipblasSspr", CONV_BLAS}; - //cuda2hipRename["cublasDspr_v2"] = {"hipblasDspr", CONV_BLAS}; - //cuda2hipRename["cublasChpr_v2"] = {"hipblasChpr", CONV_BLAS}; - //cuda2hipRename["cublasZhpr_v2"] = {"hipblasZhpr", CONV_BLAS}; + //cuda2hipRename["cublasSspr_v2"] = {"hipblasSspr", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDspr_v2"] = {"hipblasDspr", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasChpr_v2"] = {"hipblasChpr", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZhpr_v2"] = {"hipblasZhpr", CONV_MATH_FUNC, API_BLAS}; // SYR2/HER2 - //cuda2hipRename["cublasSsyr2_v2"] = {"hipblasSsyr2", CONV_BLAS}; - //cuda2hipRename["cublasDsyr2_v2"] = {"hipblasDsyr2", CONV_BLAS}; - //cuda2hipRename["cublasCsyr2_v2"] = {"hipblasCsyr2", CONV_BLAS}; - //cuda2hipRename["cublasZsyr2_v2"] = {"hipblasZsyr2", CONV_BLAS}; - //cuda2hipRename["cublasCher2_v2"] = {"hipblasCher2", CONV_BLAS}; - //cuda2hipRename["cublasZher2_v2"] = {"hipblasZher2", CONV_BLAS}; + //cuda2hipRename["cublasSsyr2_v2"] = {"hipblasSsyr2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDsyr2_v2"] = {"hipblasDsyr2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCsyr2_v2"] = {"hipblasCsyr2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZsyr2_v2"] = {"hipblasZsyr2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCher2_v2"] = {"hipblasCher2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZher2_v2"] = {"hipblasZher2", CONV_MATH_FUNC, API_BLAS}; // SPR2/HPR2 - //cuda2hipRename["cublasSspr2_v2"] = {"hipblasSspr2", CONV_BLAS}; - //cuda2hipRename["cublasDspr2_v2"] = {"hipblasDspr2", CONV_BLAS}; - //cuda2hipRename["cublasChpr2_v2"] = {"hipblasChpr2", CONV_BLAS}; - //cuda2hipRename["cublasZhpr2_v2"] = {"hipblasZhpr2", CONV_BLAS}; + //cuda2hipRename["cublasSspr2_v2"] = {"hipblasSspr2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDspr2_v2"] = {"hipblasDspr2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasChpr2_v2"] = {"hipblasChpr2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZhpr2_v2"] = {"hipblasZhpr2", CONV_MATH_FUNC, API_BLAS}; // Blas3 (v2) Routines // GEMM - cuda2hipRename["cublasSgemm_v2"] = {"hipblasSgemm", CONV_BLAS}; - //cuda2hipRename["cublasDgemm_v2"] = {"hipblasDgemm", CONV_BLAS}; - cuda2hipRename["cublasCgemm_v2"] = {"hipblasCgemm", CONV_BLAS}; - //cuda2hipRename["cublasZgemm_v2"] = {"hipblasZgemm", CONV_BLAS}; + cuda2hipRename["cublasSgemm_v2"] = {"hipblasSgemm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDgemm_v2"] = {"hipblasDgemm", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasCgemm_v2"] = {"hipblasCgemm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZgemm_v2"] = {"hipblasZgemm", CONV_MATH_FUNC, API_BLAS}; //IO in FP16 / FP32, computation in float // cublasSgemmEx // SYRK - //cuda2hipRename["cublasSsyrk_v2"] = {"hipblasSsyrk", CONV_BLAS}; - //cuda2hipRename["cublasDsyrk_v2"] = {"hipblasDsyrk", CONV_BLAS}; - //cuda2hipRename["cublasCsyrk_v2"] = {"hipblasCsyrk", CONV_BLAS}; - //cuda2hipRename["cublasZsyrk_v2"] = {"hipblasZsyrk", CONV_BLAS}; + //cuda2hipRename["cublasSsyrk_v2"] = {"hipblasSsyrk", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDsyrk_v2"] = {"hipblasDsyrk", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCsyrk_v2"] = {"hipblasCsyrk", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZsyrk_v2"] = {"hipblasZsyrk", CONV_MATH_FUNC, API_BLAS}; // HERK - //cuda2hipRename["cublasCherk_v2"] = {"hipblasCherk", CONV_BLAS}; - //cuda2hipRename["cublasZherk_v2"] = {"hipblasZherk", CONV_BLAS}; + //cuda2hipRename["cublasCherk_v2"] = {"hipblasCherk", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZherk_v2"] = {"hipblasZherk", CONV_MATH_FUNC, API_BLAS}; // SYR2K - //cuda2hipRename["cublasSsyr2k_v2"] = {"hipblasSsyr2k", CONV_BLAS}; - //cuda2hipRename["cublasDsyr2k_v2"] = {"hipblasDsyr2k", CONV_BLAS}; - //cuda2hipRename["cublasCsyr2k_v2"] = {"hipblasCsyr2k", CONV_BLAS}; - //cuda2hipRename["cublasZsyr2k_v2"] = {"hipblasZsyr2k", CONV_BLAS}; + //cuda2hipRename["cublasSsyr2k_v2"] = {"hipblasSsyr2k", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDsyr2k_v2"] = {"hipblasDsyr2k", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCsyr2k_v2"] = {"hipblasCsyr2k", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZsyr2k_v2"] = {"hipblasZsyr2k", CONV_MATH_FUNC, API_BLAS}; // HER2K - //cuda2hipRename["cublasCher2k_v2"] = {"hipblasCher2k", CONV_BLAS}; - //cuda2hipRename["cublasZher2k_v2"] = {"hipblasZher2k", CONV_BLAS}; + //cuda2hipRename["cublasCher2k_v2"] = {"hipblasCher2k", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZher2k_v2"] = {"hipblasZher2k", CONV_MATH_FUNC, API_BLAS}; // SYMM - //cuda2hipRename["cublasSsymm_v2"] = {"hipblasSsymm", CONV_BLAS}; - //cuda2hipRename["cublasDsymm_v2"] = {"hipblasDsymm", CONV_BLAS}; - //cuda2hipRename["cublasCsymm_v2"] = {"hipblasCsymm", CONV_BLAS}; - //cuda2hipRename["cublasZsymm_v2"] = {"hipblasZsymm", CONV_BLAS}; + //cuda2hipRename["cublasSsymm_v2"] = {"hipblasSsymm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDsymm_v2"] = {"hipblasDsymm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCsymm_v2"] = {"hipblasCsymm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZsymm_v2"] = {"hipblasZsymm", CONV_MATH_FUNC, API_BLAS}; // HEMM - //cuda2hipRename["cublasChemm_v2"] = {"hipblasChemm", CONV_BLAS}; - //cuda2hipRename["cublasZhemm_v2"] = {"hipblasZhemm", CONV_BLAS}; + //cuda2hipRename["cublasChemm_v2"] = {"hipblasChemm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZhemm_v2"] = {"hipblasZhemm", CONV_MATH_FUNC, API_BLAS}; // TRSM - //cuda2hipRename["cublasStrsm_v2"] = {"hipblasStrsm", CONV_BLAS}; - //cuda2hipRename["cublasDtrsm_v2"] = {"hipblasDtrsm", CONV_BLAS}; - //cuda2hipRename["cublasCtrsm_v2"] = {"hipblasCtrsm", CONV_BLAS}; - //cuda2hipRename["cublasZtrsm_v2"] = {"hipblasZtrsm", CONV_BLAS}; + //cuda2hipRename["cublasStrsm_v2"] = {"hipblasStrsm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDtrsm_v2"] = {"hipblasDtrsm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCtrsm_v2"] = {"hipblasCtrsm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZtrsm_v2"] = {"hipblasZtrsm", CONV_MATH_FUNC, API_BLAS}; // TRMM - //cuda2hipRename["cublasStrmm_v2"] = {"hipblasStrmm", CONV_BLAS}; - //cuda2hipRename["cublasDtrmm_v2"] = {"hipblasDtrmm", CONV_BLAS}; - //cuda2hipRename["cublasCtrmm_v2"] = {"hipblasCtrmm", CONV_BLAS}; - //cuda2hipRename["cublasZtrmm_v2"] = {"hipblasZtrmm", CONV_BLAS}; + //cuda2hipRename["cublasStrmm_v2"] = {"hipblasStrmm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDtrmm_v2"] = {"hipblasDtrmm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCtrmm_v2"] = {"hipblasCtrmm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZtrmm_v2"] = {"hipblasZtrmm", CONV_MATH_FUNC, API_BLAS}; // NRM2 - //cuda2hipRename["cublasSnrm2_v2"] = {"hipblasSnrm2", CONV_BLAS}; - //cuda2hipRename["cublasDnrm2_v2"] = {"hipblasDnrm2", CONV_BLAS}; - //cuda2hipRename["cublasScnrm2_v2"] = {"hipblasScnrm2", CONV_BLAS}; - //cuda2hipRename["cublasDznrm2_v2"] = {"hipblasDznrm2", CONV_BLAS}; + //cuda2hipRename["cublasSnrm2_v2"] = {"hipblasSnrm2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDnrm2_v2"] = {"hipblasDnrm2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasScnrm2_v2"] = {"hipblasScnrm2", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDznrm2_v2"] = {"hipblasDznrm2", CONV_MATH_FUNC, API_BLAS}; // DOT - cuda2hipRename["cublasSdot_v2"] = {"hipblasSdot", CONV_BLAS}; - cuda2hipRename["cublasDdot_v2"] = {"hipblasDdot", CONV_BLAS}; - //cuda2hipRename["cublasCdotu_v2"] = {"hipblasCdotu", CONV_BLAS}; - //cuda2hipRename["cublasCdotc_v2"] = {"hipblasCdotc", CONV_BLAS}; - //cuda2hipRename["cublasZdotu_v2"] = {"hipblasZdotu", CONV_BLAS}; - //cuda2hipRename["cublasZdotc_v2"] = {"hipblasZdotc", CONV_BLAS}; + cuda2hipRename["cublasSdot_v2"] = {"hipblasSdot", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasDdot_v2"] = {"hipblasDdot", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCdotu_v2"] = {"hipblasCdotu", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCdotc_v2"] = {"hipblasCdotc", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZdotu_v2"] = {"hipblasZdotu", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZdotc_v2"] = {"hipblasZdotc", CONV_MATH_FUNC, API_BLAS}; // SCAL - cuda2hipRename["cublasSscal_v2"] = {"hipblasSscal", CONV_BLAS}; - cuda2hipRename["cublasDscal_v2"] = {"hipblasDscal", CONV_BLAS}; - //cuda2hipRename["cublasCscal_v2"] = {"hipblasCscal", CONV_BLAS}; - //cuda2hipRename["cublasCsscal_v2"] = {"hipblasCsscal", CONV_BLAS}; - //cuda2hipRename["cublasZscal_v2"] = {"hipblasZscal", CONV_BLAS}; - //cuda2hipRename["cublasZdscal_v2"] = {"hipblasZdscal", CONV_BLAS}; + cuda2hipRename["cublasSscal_v2"] = {"hipblasSscal", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasDscal_v2"] = {"hipblasDscal", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCscal_v2"] = {"hipblasCscal", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCsscal_v2"] = {"hipblasCsscal", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZscal_v2"] = {"hipblasZscal", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZdscal_v2"] = {"hipblasZdscal", CONV_MATH_FUNC, API_BLAS}; // AXPY - cuda2hipRename["cublasSaxpy_v2"] = {"hipblasSaxpy", CONV_BLAS}; - //cuda2hipRename["cublasDaxpy_v2"] = {"hipblasDaxpy", CONV_BLAS}; - //cuda2hipRename["cublasCaxpy_v2"] = {"hipblasCaxpy", CONV_BLAS}; - //cuda2hipRename["cublasZaxpy_v2"] = {"hipblasZaxpy", CONV_BLAS}; + cuda2hipRename["cublasSaxpy_v2"] = {"hipblasSaxpy", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDaxpy_v2"] = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCaxpy_v2"] = {"hipblasCaxpy", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZaxpy_v2"] = {"hipblasZaxpy", CONV_MATH_FUNC, API_BLAS}; // COPY - cuda2hipRename["cublasScopy_v2"] = {"hipblasScopy", CONV_BLAS}; - cuda2hipRename["cublasDcopy_v2"] = {"hipblasDcopy", CONV_BLAS}; - //cuda2hipRename["cublasCcopy_v2"] = {"hipblasCcopy", CONV_BLAS}; - //cuda2hipRename["cublasZcopy_v2"] = {"hipblasZcopy", CONV_BLAS}; + cuda2hipRename["cublasScopy_v2"] = {"hipblasScopy", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasDcopy_v2"] = {"hipblasDcopy", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCcopy_v2"] = {"hipblasCcopy", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZcopy_v2"] = {"hipblasZcopy", CONV_MATH_FUNC, API_BLAS}; // SWAP - //cuda2hipRename["cublasSswap_v2"] = {"hipblasSswap", CONV_BLAS}; - //cuda2hipRename["cublasDswap_v2"] = {"hipblasDswap", CONV_BLAS}; - //cuda2hipRename["cublasCswap_v2"] = {"hipblasCswap", CONV_BLAS}; - //cuda2hipRename["cublasZswap_v2"] = {"hipblasZswap", CONV_BLAS}; + //cuda2hipRename["cublasSswap_v2"] = {"hipblasSswap", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDswap_v2"] = {"hipblasDswap", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCswap_v2"] = {"hipblasCswap", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZswap_v2"] = {"hipblasZswap", CONV_MATH_FUNC, API_BLAS}; // AMAX - //cuda2hipRename["cublasIsamax_v2"] = {"hipblasIsamax", CONV_BLAS}; - //cuda2hipRename["cublasIdamax_v2"] = {"hipblasIdamax", CONV_BLAS}; - //cuda2hipRename["cublasIcamax_v2"] = {"hipblasIcamax", CONV_BLAS}; - //cuda2hipRename["cublasIzamax_v2"] = {"hipblasIzamax", CONV_BLAS}; + //cuda2hipRename["cublasIsamax_v2"] = {"hipblasIsamax", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasIdamax_v2"] = {"hipblasIdamax", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasIcamax_v2"] = {"hipblasIcamax", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasIzamax_v2"] = {"hipblasIzamax", CONV_MATH_FUNC, API_BLAS}; // AMIN - //cuda2hipRename["cublasIsamin_v2"] = {"hipblasIsamin", CONV_BLAS}; - //cuda2hipRename["cublasIdamin_v2"] = {"hipblasIdamin", CONV_BLAS}; - //cuda2hipRename["cublasIcamin_v2"] = {"hipblasIcamin", CONV_BLAS}; - //cuda2hipRename["cublasIzamin_v2"] = {"hipblasIzamin", CONV_BLAS}; + //cuda2hipRename["cublasIsamin_v2"] = {"hipblasIsamin", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasIdamin_v2"] = {"hipblasIdamin", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasIcamin_v2"] = {"hipblasIcamin", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasIzamin_v2"] = {"hipblasIzamin", CONV_MATH_FUNC, API_BLAS}; // ASUM - cuda2hipRename["cublasSasum_v2"] = {"hipblasSasum", CONV_BLAS}; - cuda2hipRename["cublasDasum_v2"] = {"hipblasDasum", CONV_BLAS}; - //cuda2hipRename["cublasScasum_v2"] = {"hipblasScasum", CONV_BLAS}; - //cuda2hipRename["cublasDzasum_v2"] = {"hipblasDzasum", CONV_BLAS}; + cuda2hipRename["cublasSasum_v2"] = {"hipblasSasum", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasDasum_v2"] = {"hipblasDasum", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasScasum_v2"] = {"hipblasScasum", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDzasum_v2"] = {"hipblasDzasum", CONV_MATH_FUNC, API_BLAS}; // ROT - //cuda2hipRename["cublasSrot_v2"] = {"hipblasSrot", CONV_BLAS}; - //cuda2hipRename["cublasDrot_v2"] = {"hipblasDrot", CONV_BLAS}; - //cuda2hipRename["cublasCrot_v2"] = {"hipblasCrot", CONV_BLAS}; - //cuda2hipRename["cublasCsrot_v2"] = {"hipblasCsrot", CONV_BLAS}; - //cuda2hipRename["cublasZrot_v2"] = {"hipblasZrot", CONV_BLAS}; - //cuda2hipRename["cublasZdrot_v2"] = {"hipblasZdrot", CONV_BLAS}; + //cuda2hipRename["cublasSrot_v2"] = {"hipblasSrot", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDrot_v2"] = {"hipblasDrot", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCrot_v2"] = {"hipblasCrot", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCsrot_v2"] = {"hipblasCsrot", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZrot_v2"] = {"hipblasZrot", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZdrot_v2"] = {"hipblasZdrot", CONV_MATH_FUNC, API_BLAS}; // ROTG - //cuda2hipRename["cublasSrotg_v2"] = {"hipblasSrotg", CONV_BLAS}; - //cuda2hipRename["cublasDrotg_v2"] = {"hipblasDrotg", CONV_BLAS}; - //cuda2hipRename["cublasCrotg_v2"] = {"hipblasCrotg", CONV_BLAS}; - //cuda2hipRename["cublasZrotg_v2"] = {"hipblasZrotg", CONV_BLAS}; + //cuda2hipRename["cublasSrotg_v2"] = {"hipblasSrotg", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDrotg_v2"] = {"hipblasDrotg", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasCrotg_v2"] = {"hipblasCrotg", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasZrotg_v2"] = {"hipblasZrotg", CONV_MATH_FUNC, API_BLAS}; // ROTM - //cuda2hipRename["cublasSrotm_v2"] = {"hipblasSrotm", CONV_BLAS}; - //cuda2hipRename["cublasDrotm_v2"] = {"hipblasDrotm", CONV_BLAS}; + //cuda2hipRename["cublasSrotm_v2"] = {"hipblasSrotm", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDrotm_v2"] = {"hipblasDrotm", CONV_MATH_FUNC, API_BLAS}; // ROTMG - //cuda2hipRename["cublasSrotmg_v2"] = {"hipblasSrotmg", CONV_BLAS}; - //cuda2hipRename["cublasDrotmg_v2"] = {"hipblasDrotmg", CONV_BLAS}; + //cuda2hipRename["cublasSrotmg_v2"] = {"hipblasSrotmg", CONV_MATH_FUNC, API_BLAS}; + //cuda2hipRename["cublasDrotmg_v2"] = {"hipblasDrotmg", CONV_MATH_FUNC, API_BLAS}; } struct HipNames { StringRef hipName; ConvTypes countType; + ApiTypes countApiType; }; SmallDenseMap cuda2hipRename; @@ -1023,7 +1035,8 @@ StringRef unquoteStr(StringRef s) { static void processString(StringRef s, const cuda2hipMap &map, Replacements *Replace, SourceManager &SM, SourceLocation start, - int64_t countReps[ConvTypes::CONV_LAST]) { + int64_t countReps[CONV_LAST], + int64_t countApiReps[API_LAST]) { size_t begin = 0; while ((begin = s.find("cuda", begin)) != StringRef::npos || (begin = s.find("cublas", begin)) != StringRef::npos) { @@ -1031,8 +1044,9 @@ static void processString(StringRef s, const cuda2hipMap &map, StringRef name = s.slice(begin, end); const auto found = map.cuda2hipRename.find(name); if (found != map.cuda2hipRename.end()) { - countReps[CONV_LITERAL]++; StringRef repName = found->second.hipName; + countReps[CONV_LITERAL]++; + countApiReps[API_RUNTIME]++; SourceLocation sl = start.getLocWithOffset(begin + 1); Replacement Rep(SM, sl, name.size(), repName); Replace->insert(Rep); @@ -1074,8 +1088,9 @@ public: if (is_angled) { const auto found = N.cuda2hipRename.find(file_name); if (found != N.cuda2hipRename.end()) { - countReps[found->second.countType]++; StringRef repName = found->second.hipName; + countReps[found->second.countType]++; + countApiReps[found->second.countApiType]++; DEBUG(dbgs() << "Include file found: " << file_name << "\n" << "SourceLocation:" << filename_range.getBegin().printToString(*_sm) << "\n" @@ -1102,8 +1117,9 @@ public: StringRef name = T.getIdentifierInfo()->getName(); const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - countReps[found->second.countType]++; StringRef repName = found->second.hipName; + countReps[found->second.countType]++; + countApiReps[found->second.countApiType]++; SourceLocation sl = T.getLocation(); DEBUG(dbgs() << "Identifier " << name << " found in definition of macro " @@ -1148,8 +1164,9 @@ public: StringRef name = tok.getIdentifierInfo()->getName(); const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - countReps[found->second.countType]++; StringRef repName = found->second.hipName; + countReps[found->second.countType]++; + countApiReps[found->second.countApiType]++; DEBUG(dbgs() << "Identifier " << name << " found as an actual argument in expansion of macro " @@ -1170,8 +1187,9 @@ public: const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { sl = sl_macro; - countReps[found->second.countType]++; StringRef repName = found->second.hipName; + countReps[found->second.countType]++; + countApiReps[found->second.countApiType]++; Replacement Rep(*_sm, sl, length, repName); Replace->insert(Rep); } @@ -1179,7 +1197,7 @@ public: if (tok.is(tok::string_literal)) { StringRef s(tok.getLiteralData(), tok.getLength()); processString(unquoteStr(s), N, Replace, *_sm, tok.getLocation(), - countReps); + countReps, countApiReps); } } } @@ -1195,7 +1213,8 @@ public: void setSourceManager(SourceManager *sm) { _sm = sm; } void setPreprocessor(Preprocessor *pp) { _pp = pp; } void setMatch(Cuda2HipCallback *match) { Match = match; } - int64_t countReps[ConvTypes::CONV_LAST] = {0}; + int64_t countReps[CONV_LAST] = { 0 }; + int64_t countApiReps[API_LAST] = { 0 }; private: SourceManager *_sm; @@ -1206,65 +1225,49 @@ private: }; class Cuda2HipCallback : public MatchFinder::MatchCallback { -public: - Cuda2HipCallback(Replacements *Replace, ast_matchers::MatchFinder *parent, HipifyPPCallbacks *PPCallbacks) - : Replace(Replace), owner(parent), PP(PPCallbacks) { - PP->setMatch(this); - } - - void convertKernelDecl(const FunctionDecl *kernelDecl, - const MatchFinder::MatchResult &Result) { +private: + void convertKernelDecl(const FunctionDecl *kernelDecl, const MatchFinder::MatchResult &Result) { SourceManager *SM = Result.SourceManager; LangOptions DefaultLangOptions; - SmallString<40> XStr; raw_svector_ostream OS(XStr); StringRef initialParamList; OS << "hipLaunchParm lp"; - size_t replacementLength = OS.str().size(); + size_t repLength = OS.str().size(); SourceLocation sl = kernelDecl->getNameInfo().getEndLoc(); SourceLocation kernelArgListStart = Lexer::findLocationAfterToken( - sl, tok::l_paren, *SM, DefaultLangOptions, true); + sl, tok::l_paren, *SM, DefaultLangOptions, true); DEBUG(dbgs() << kernelArgListStart.printToString(*SM)); if (kernelDecl->getNumParams() > 0) { const ParmVarDecl *pvdFirst = kernelDecl->getParamDecl(0); - const ParmVarDecl *pvdLast = - kernelDecl->getParamDecl(kernelDecl->getNumParams() - 1); + const ParmVarDecl *pvdLast = kernelDecl->getParamDecl(kernelDecl->getNumParams() - 1); SourceLocation kernelArgListStart(pvdFirst->getLocStart()); SourceLocation kernelArgListEnd(pvdLast->getLocEnd()); - SourceLocation stop = Lexer::getLocForEndOfToken( - kernelArgListEnd, 0, *SM, DefaultLangOptions); - replacementLength += - SM->getCharacterData(stop) - SM->getCharacterData(kernelArgListStart); - initialParamList = StringRef(SM->getCharacterData(kernelArgListStart), - replacementLength); + SourceLocation stop = Lexer::getLocForEndOfToken(kernelArgListEnd, 0, *SM, DefaultLangOptions); + repLength += SM->getCharacterData(stop) - SM->getCharacterData(kernelArgListStart); + initialParamList = StringRef(SM->getCharacterData(kernelArgListStart), repLength); OS << ", " << initialParamList; } - DEBUG(dbgs() << "initial paramlist: " << initialParamList << "\n" - << "new paramlist: " << OS.str() << "\n"); - Replacement Rep0(*(Result.SourceManager), kernelArgListStart, - replacementLength, OS.str()); + DEBUG(dbgs() << "initial paramlist: " << initialParamList << "\n" << "new paramlist: " << OS.str() << "\n"); + Replacement Rep0(*(Result.SourceManager), kernelArgListStart, repLength, OS.str()); Replace->insert(Rep0); } - void run(const MatchFinder::MatchResult &Result) override { - SourceManager *SM = Result.SourceManager; - LangOptions DefaultLangOptions; - - if (const CallExpr *call = - Result.Nodes.getNodeAs("cudaCall")) { + bool cudaCall(const MatchFinder::MatchResult &Result) { + if (const CallExpr *call = Result.Nodes.getNodeAs("cudaCall")) { const FunctionDecl *funcDcl = call->getDirectCallee(); StringRef name = funcDcl->getDeclName().getAsString(); const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { + SourceManager *SM = Result.SourceManager; StringRef repName = found->second.hipName; SourceLocation sl = call->getLocStart(); size_t length = name.size(); bool bReplace = true; if (SM->isMacroArgExpansion(sl)) { sl = SM->getImmediateSpellingLoc(sl); - } - else if (SM->isMacroBodyExpansion(sl)) { + } else if (SM->isMacroBodyExpansion(sl)) { + LangOptions DefaultLangOptions; SourceLocation sl_macro = SM->getExpansionLoc(sl); SourceLocation sl_end = Lexer::getLocForEndOfToken(sl_macro, 0, *SM, DefaultLangOptions); length = SM->getCharacterData(sl_end) - SM->getCharacterData(sl_macro); @@ -1277,14 +1280,18 @@ public: } if (bReplace) { countReps[found->second.countType]++; + countApiReps[found->second.countApiType]++; Replacement Rep(*SM, sl, length, repName); Replace->insert(Rep); } } + return true; } + return false; + } - if (const CUDAKernelCallExpr *launchKernel = - Result.Nodes.getNodeAs("cudaLaunchKernel")) { + bool cudaLaunchKernel(const MatchFinder::MatchResult &Result) { + if (const CUDAKernelCallExpr *launchKernel = Result.Nodes.getNodeAs("cudaLaunchKernel")) { SmallString<40> XStr; raw_svector_ostream OS(XStr); StringRef calleeName; @@ -1295,78 +1302,66 @@ public: } else { const Expr *e = launchKernel->getCallee(); if (const UnresolvedLookupExpr *ule = - dyn_cast(e)) { + dyn_cast(e)) { calleeName = ule->getName().getAsIdentifierInfo()->getName(); owner->addMatcher(functionTemplateDecl(hasName(calleeName)) - .bind("unresolvedTemplateName"), - this); + .bind("unresolvedTemplateName"), + this); } } - XStr.clear(); OS << "hipLaunchKernel(HIP_KERNEL_NAME(" << calleeName << "),"; - const CallExpr *config = launchKernel->getConfig(); - DEBUG(dbgs() << "Kernel config arguments:" - << "\n"); + DEBUG(dbgs() << "Kernel config arguments:" << "\n"); + SourceManager *SM = Result.SourceManager; + LangOptions DefaultLangOptions; for (unsigned argno = 0; argno < config->getNumArgs(); argno++) { const Expr *arg = config->getArg(argno); if (!isa(arg)) { - const ParmVarDecl *pvd = - config->getDirectCallee()->getParamDecl(argno); - + const ParmVarDecl *pvd = config->getDirectCallee()->getParamDecl(argno); SourceLocation sl(arg->getLocStart()); SourceLocation el(arg->getLocEnd()); - SourceLocation stop = - Lexer::getLocForEndOfToken(el, 0, *SM, DefaultLangOptions); - StringRef outs(SM->getCharacterData(sl), - SM->getCharacterData(stop) - SM->getCharacterData(sl)); - DEBUG(dbgs() << "args[ " << argno << "]" << outs << " <" - << pvd->getType().getAsString() << ">" - << "\n"); - if (pvd->getType().getAsString().compare("dim3") == 0) + SourceLocation stop = Lexer::getLocForEndOfToken(el, 0, *SM, DefaultLangOptions); + StringRef outs(SM->getCharacterData(sl), SM->getCharacterData(stop) - SM->getCharacterData(sl)); + DEBUG(dbgs() << "args[ " << argno << "]" << outs << " <" << pvd->getType().getAsString() << ">\n"); + if (pvd->getType().getAsString().compare("dim3") == 0) { OS << " dim3(" << outs << "),"; - else + } else { OS << " " << outs << ","; - } else + } + } else { OS << " 0,"; + } } - for (unsigned argno = 0; argno < launchKernel->getNumArgs(); argno++) { const Expr *arg = launchKernel->getArg(argno); SourceLocation sl(arg->getLocStart()); SourceLocation el(arg->getLocEnd()); - SourceLocation stop = - Lexer::getLocForEndOfToken(el, 0, *SM, DefaultLangOptions); - std::string outs(SM->getCharacterData(sl), - SM->getCharacterData(stop) - SM->getCharacterData(sl)); + SourceLocation stop = Lexer::getLocForEndOfToken(el, 0, *SM, DefaultLangOptions); + std::string outs(SM->getCharacterData(sl), SM->getCharacterData(stop) - SM->getCharacterData(sl)); DEBUG(dbgs() << outs << "\n"); OS << " " << outs << ","; } XStr.pop_back(); OS << ")"; - size_t length = - SM->getCharacterData(Lexer::getLocForEndOfToken( - launchKernel->getLocEnd(), 0, *SM, DefaultLangOptions)) - - SM->getCharacterData(launchKernel->getLocStart()); + size_t length = SM->getCharacterData(Lexer::getLocForEndOfToken( + launchKernel->getLocEnd(), 0, *SM, DefaultLangOptions)) - + SM->getCharacterData(launchKernel->getLocStart()); Replacement Rep(*SM, launchKernel->getLocStart(), length, OS.str()); Replace->insert(Rep); - countReps[ConvTypes::CONV_KERN]++; + countReps[CONV_KERN]++; + countApiReps[API_RUNTIME]++; + return true; } + return false; + } - if (const FunctionTemplateDecl *templateDecl = - Result.Nodes.getNodeAs( - "unresolvedTemplateName")) { - FunctionDecl *kernelDecl = templateDecl->getTemplatedDecl(); - convertKernelDecl(kernelDecl, Result); - } - - if (const MemberExpr *threadIdx = - Result.Nodes.getNodeAs("cudaBuiltin")) { + bool cudaBuiltin(const MatchFinder::MatchResult &Result) { + if (const MemberExpr *threadIdx = Result.Nodes.getNodeAs("cudaBuiltin")) { if (const OpaqueValueExpr *refBase = - dyn_cast(threadIdx->getBase())) { + dyn_cast(threadIdx->getBase())) { if (const DeclRefExpr *declRef = - dyn_cast(refBase->getSourceExpr())) { + dyn_cast(refBase->getSourceExpr())) { StringRef name = declRef->getDecl()->getName(); StringRef memberName = threadIdx->getMemberDecl()->getName(); size_t pos = memberName.find_first_not_of("__fetch_builtin_"); @@ -1375,51 +1370,66 @@ public: name = Twine(name + "." + memberName).toStringRef(tmpData); const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - countReps[found->second.countType]++; StringRef repName = found->second.hipName; + countReps[found->second.countType]++; + countApiReps[found->second.countApiType]++; SourceLocation sl = threadIdx->getLocStart(); + SourceManager *SM = Result.SourceManager; Replacement Rep(*SM, sl, name.size(), repName); Replace->insert(Rep); } } } + return true; } + return false; + } - if (const DeclRefExpr *cudaEnumConstantRef = - Result.Nodes.getNodeAs("cudaEnumConstantRef")) { - StringRef name = cudaEnumConstantRef->getDecl()->getNameAsString(); + bool cudaEnumConstantRef(const MatchFinder::MatchResult &Result) { + if (const DeclRefExpr *enumConstantRef = Result.Nodes.getNodeAs("cudaEnumConstantRef")) { + StringRef name = enumConstantRef->getDecl()->getNameAsString(); const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - countReps[found->second.countType]++; StringRef repName = found->second.hipName; - SourceLocation sl = cudaEnumConstantRef->getLocStart(); + countReps[found->second.countType]++; + countApiReps[found->second.countApiType]++; + SourceLocation sl = enumConstantRef->getLocStart(); + SourceManager *SM = Result.SourceManager; Replacement Rep(*SM, sl, name.size(), repName); Replace->insert(Rep); } + return true; } + return false; + } - if (const VarDecl *cudaEnumConstantDecl = - Result.Nodes.getNodeAs("cudaEnumConstantDecl")) { + bool cudaEnumConstantDecl(const MatchFinder::MatchResult &Result) { + if (const VarDecl *enumConstantDecl = Result.Nodes.getNodeAs("cudaEnumConstantDecl")) { StringRef name = - cudaEnumConstantDecl->getType()->getAsTagDecl()->getNameAsString(); + enumConstantDecl->getType()->getAsTagDecl()->getNameAsString(); // anonymous typedef enum if (name.empty()) { - QualType QT = cudaEnumConstantDecl->getType().getUnqualifiedType(); + QualType QT = enumConstantDecl->getType().getUnqualifiedType(); name = QT.getAsString(); } const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - countReps[found->second.countType]++; StringRef repName = found->second.hipName; - SourceLocation sl = cudaEnumConstantDecl->getLocStart(); + countReps[found->second.countType]++; + countApiReps[found->second.countApiType]++; + SourceLocation sl = enumConstantDecl->getLocStart(); + SourceManager *SM = Result.SourceManager; Replacement Rep(*SM, sl, name.size(), repName); Replace->insert(Rep); } + return true; } + return false; + } - if (const VarDecl *cudaTypedefVar = - Result.Nodes.getNodeAs("cudaTypedefVar")) { - QualType QT = cudaTypedefVar->getType(); + bool cudaTypedefVar(const MatchFinder::MatchResult &Result) { + if (const VarDecl *typedefVar = Result.Nodes.getNodeAs("cudaTypedefVar")) { + QualType QT = typedefVar->getType(); if (QT->isArrayType()) { QT = QT.getTypePtr()->getAsArrayTypeUnsafe()->getElementType(); } @@ -1427,33 +1437,87 @@ public: StringRef name = QT.getAsString(); const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - countReps[found->second.countType]++; StringRef repName = found->second.hipName; - SourceLocation sl = cudaTypedefVar->getLocStart(); + countReps[found->second.countType]++; + countApiReps[found->second.countApiType]++; + SourceLocation sl = typedefVar->getLocStart(); + SourceManager *SM = Result.SourceManager; Replacement Rep(*SM, sl, name.size(), repName); Replace->insert(Rep); } + return true; } + return false; + } - if (const VarDecl *cudaStructVar = - Result.Nodes.getNodeAs("cudaStructVar")) { - StringRef name = cudaStructVar->getType() - ->getAsStructureType() - ->getDecl() - ->getNameAsString(); + bool cudaStructVar(const MatchFinder::MatchResult &Result) { + if (const VarDecl *structVar = Result.Nodes.getNodeAs("cudaStructVar")) { + StringRef name = structVar->getType() + ->getAsStructureType() + ->getDecl() + ->getNameAsString(); const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - countReps[found->second.countType]++; StringRef repName = found->second.hipName; - TypeLoc TL = cudaStructVar->getTypeSourceInfo()->getTypeLoc(); + countReps[found->second.countType]++; + countApiReps[found->second.countApiType]++; + TypeLoc TL = structVar->getTypeSourceInfo()->getTypeLoc(); SourceLocation sl = TL.getUnqualifiedLoc().getLocStart(); + SourceManager *SM = Result.SourceManager; Replacement Rep(*SM, sl, name.size(), repName); Replace->insert(Rep); } + return true; } + return false; + } - if (const VarDecl *sharedVar = - Result.Nodes.getNodeAs("cudaSharedIncompleteArrayVar")) { + bool cudaStructVarPtr(const MatchFinder::MatchResult &Result) { + if (const VarDecl *structVarPtr = Result.Nodes.getNodeAs("cudaStructVarPtr")) { + const Type *t = structVarPtr->getType().getTypePtrOrNull(); + if (t) { + StringRef name = t->getPointeeCXXRecordDecl()->getName(); + const auto found = N.cuda2hipRename.find(name); + if (found != N.cuda2hipRename.end()) { + StringRef repName = found->second.hipName; + countReps[found->second.countType]++; + countApiReps[found->second.countApiType]++; + TypeLoc TL = structVarPtr->getTypeSourceInfo()->getTypeLoc(); + SourceLocation sl = TL.getUnqualifiedLoc().getLocStart(); + SourceManager *SM = Result.SourceManager; + Replacement Rep(*SM, sl, name.size(), repName); + Replace->insert(Rep); + } + } + return true; + } + return false; + } + + bool cudaStructSizeOf(const MatchFinder::MatchResult &Result) { + if (const UnaryExprOrTypeTraitExpr *expr = Result.Nodes.getNodeAs("cudaStructSizeOf")) { + TypeSourceInfo *typeInfo = expr->getArgumentTypeInfo(); + QualType QT = typeInfo->getType().getUnqualifiedType(); + const Type *type = QT.getTypePtr(); + StringRef name = type->getAsCXXRecordDecl()->getName(); + const auto found = N.cuda2hipRename.find(name); + if (found != N.cuda2hipRename.end()) { + StringRef repName = found->second.hipName; + countReps[found->second.countType]++; + countApiReps[found->second.countApiType]++; + TypeLoc TL = typeInfo->getTypeLoc(); + SourceLocation sl = TL.getUnqualifiedLoc().getLocStart(); + SourceManager *SM = Result.SourceManager; + Replacement Rep(*SM, sl, name.size(), repName); + Replace->insert(Rep); + } + return true; + } + return false; + } + + bool cudaSharedIncompleteArrayVar(const MatchFinder::MatchResult &Result) { + if (const VarDecl *sharedVar = Result.Nodes.getNodeAs("cudaSharedIncompleteArrayVar")) { // Example: extern __shared__ uint sRadix1[]; if (sharedVar->hasExternalFormalLinkage()) { QualType QT = sharedVar->getType(); @@ -1477,6 +1541,7 @@ public: if (!typeName.empty()) { SourceLocation slStart = sharedVar->getLocStart(); SourceLocation slEnd = sharedVar->getLocEnd(); + SourceManager *SM = Result.SourceManager; size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1; SmallString<128> tmpData; StringRef varName = sharedVar->getNameAsString(); @@ -1484,30 +1549,17 @@ public: Replacement Rep(*SM, slStart, repLength, repName); Replace->insert(Rep); countReps[CONV_MEM]++; + countApiReps[API_RUNTIME]++; } } + return true; } + return false; + } - if (const VarDecl *cudaStructVarPtr = - Result.Nodes.getNodeAs("cudaStructVarPtr")) { - const Type *t = cudaStructVarPtr->getType().getTypePtrOrNull(); - if (t) { - StringRef name = t->getPointeeCXXRecordDecl()->getName(); - const auto found = N.cuda2hipRename.find(name); - if (found != N.cuda2hipRename.end()) { - countReps[found->second.countType]++; - StringRef repName = found->second.hipName; - TypeLoc TL = cudaStructVarPtr->getTypeSourceInfo()->getTypeLoc(); - SourceLocation sl = TL.getUnqualifiedLoc().getLocStart(); - Replacement Rep(*SM, sl, name.size(), repName); - Replace->insert(Rep); - } - } - } - - if (const ParmVarDecl *cudaParamDecl = - Result.Nodes.getNodeAs("cudaParamDecl")) { - QualType QT = cudaParamDecl->getOriginalType().getUnqualifiedType(); + bool cudaParamDecl(const MatchFinder::MatchResult &Result) { + if (const ParmVarDecl *paramDecl = Result.Nodes.getNodeAs("cudaParamDecl")) { + QualType QT = paramDecl->getOriginalType().getUnqualifiedType(); StringRef name = QT.getAsString(); const Type *t = QT.getTypePtr(); if (t->isStructureOrClassType()) { @@ -1515,73 +1567,104 @@ public: } const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - countReps[found->second.countType]++; StringRef repName = found->second.hipName; - TypeLoc TL = cudaParamDecl->getTypeSourceInfo()->getTypeLoc(); + countReps[found->second.countType]++; + countApiReps[found->second.countApiType]++; + TypeLoc TL = paramDecl->getTypeSourceInfo()->getTypeLoc(); SourceLocation sl = TL.getUnqualifiedLoc().getLocStart(); + SourceManager *SM = Result.SourceManager; Replacement Rep(*SM, sl, name.size(), repName); Replace->insert(Rep); } + return true; } + return false; + } - if (const ParmVarDecl *cudaParamDeclPtr = - Result.Nodes.getNodeAs("cudaParamDeclPtr")) { - const Type *pt = cudaParamDeclPtr->getType().getTypePtrOrNull(); + bool cudaParamDeclPtr(const MatchFinder::MatchResult &Result) { + if (const ParmVarDecl *paramDeclPtr = Result.Nodes.getNodeAs("cudaParamDeclPtr")) { + const Type *pt = paramDeclPtr->getType().getTypePtrOrNull(); if (pt) { QualType QT = pt->getPointeeType(); const Type *t = QT.getTypePtr(); StringRef name = t->isStructureOrClassType() - ? t->getAsCXXRecordDecl()->getName() - : StringRef(QT.getAsString()); + ? t->getAsCXXRecordDecl()->getName() + : StringRef(QT.getAsString()); const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - countReps[found->second.countType]++; StringRef repName = found->second.hipName; - TypeLoc TL = cudaParamDeclPtr->getTypeSourceInfo()->getTypeLoc(); + countReps[found->second.countType]++; + countApiReps[found->second.countApiType]++; + TypeLoc TL = paramDeclPtr->getTypeSourceInfo()->getTypeLoc(); SourceLocation sl = TL.getUnqualifiedLoc().getLocStart(); + SourceManager *SM = Result.SourceManager; Replacement Rep(*SM, sl, name.size(), repName); Replace->insert(Rep); } } + return true; } + return false; + } - if (const StringLiteral *stringLiteral = - Result.Nodes.getNodeAs("stringLiteral")) { - if (stringLiteral->getCharByteWidth() == 1) { - StringRef s = stringLiteral->getString(); - processString(s, N, Replace, *SM, stringLiteral->getLocStart(), - countReps); + bool unresolvedTemplateName(const MatchFinder::MatchResult &Result) { + if (const FunctionTemplateDecl *templateDecl = Result.Nodes.getNodeAs("unresolvedTemplateName")) { + FunctionDecl *kernelDecl = templateDecl->getTemplatedDecl(); + convertKernelDecl(kernelDecl, Result); + return true; + } + return false; + } + + bool stringLiteral(const MatchFinder::MatchResult &Result) { + if (const StringLiteral *sLiteral = Result.Nodes.getNodeAs("stringLiteral")) { + if (sLiteral->getCharByteWidth() == 1) { + StringRef s = sLiteral->getString(); + SourceManager *SM = Result.SourceManager; + processString(s, N, Replace, *SM, sLiteral->getLocStart(), countReps, countApiReps); } + return true; } + return false; + } - if (const UnaryExprOrTypeTraitExpr *expr = - Result.Nodes.getNodeAs( - "cudaStructSizeOf")) { - TypeSourceInfo *typeInfo = expr->getArgumentTypeInfo(); - QualType QT = typeInfo->getType().getUnqualifiedType(); - const Type *type = QT.getTypePtr(); - StringRef name = type->getAsCXXRecordDecl()->getName(); - const auto found = N.cuda2hipRename.find(name); - if (found != N.cuda2hipRename.end()) { - countReps[found->second.countType]++; - StringRef repName = found->second.hipName; - TypeLoc TL = typeInfo->getTypeLoc(); - SourceLocation sl = TL.getUnqualifiedLoc().getLocStart(); - Replacement Rep(*SM, sl, name.size(), repName); - Replace->insert(Rep); - } - } +public: + Cuda2HipCallback(Replacements *Replace, ast_matchers::MatchFinder *parent, HipifyPPCallbacks *PPCallbacks) + : Replace(Replace), owner(parent), PP(PPCallbacks) { + PP->setMatch(this); + } + void run(const MatchFinder::MatchResult &Result) override { + do { + if (cudaCall(Result)) break; + if (cudaLaunchKernel(Result)) break; + if (cudaBuiltin(Result)) break; + if (cudaEnumConstantRef(Result)) break; + if (cudaEnumConstantDecl(Result)) break; + if (cudaTypedefVar(Result)) break; + if (cudaStructVar(Result)) break; + if (cudaStructVarPtr(Result)) break; + if (cudaStructSizeOf(Result)) break; + if (cudaSharedIncompleteArrayVar(Result)) break; + if (cudaParamDecl(Result)) break; + if (cudaParamDeclPtr(Result)) break; + if (stringLiteral(Result)) break; + if (unresolvedTemplateName(Result)) break; + break; + } while (false); if (PP->countReps[CONV_INCLUDE_CUDA_MAIN_H] == 0 && countReps[CONV_INCLUDE_CUDA_MAIN_H] == 0 && Replace->size() > 0) { StringRef repName = "#include \n"; + SourceManager *SM = Result.SourceManager; Replacement Rep(*SM, SM->getLocForStartOfFile(SM->getMainFileID()), 0, repName); Replace->insert(Rep); countReps[CONV_INCLUDE_CUDA_MAIN_H]++; + countApiReps[API_RUNTIME]++; } } - int64_t countReps[ConvTypes::CONV_LAST] = {0}; + int64_t countReps[CONV_LAST] = { 0 }; + int64_t countApiReps[API_LAST] = { 0 }; private: Replacements *Replace; @@ -1592,11 +1675,12 @@ private: void HipifyPPCallbacks::handleEndSource() { if (Match->countReps[CONV_INCLUDE_CUDA_MAIN_H] == 0 && - countReps[CONV_INCLUDE_CUDA_MAIN_H] == 0 && Replace->size() > 0) { + countReps[CONV_INCLUDE_CUDA_MAIN_H] == 0 && Replace->size() > 0) { StringRef repName = "#include \n"; Replacement Rep(*_sm, _sm->getLocForStartOfFile(_sm->getMainFileID()), 0, repName); Replace->insert(Rep); countReps[CONV_INCLUDE_CUDA_MAIN_H]++; + countApiReps[API_RUNTIME]++; } } @@ -1621,18 +1705,90 @@ static cl::opt static cl::opt PrintStats("print-stats", cl::desc("print the command-line, like a header"), - cl::value_desc("print-stats")); + cl::value_desc("print-stats")); + +void addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callback) { + Finder.addMatcher(callExpr(isExpansionInMainFile(), + callee(functionDecl(matchesName("cuda.*|cublas.*")))) + .bind("cudaCall"), + Callback); + Finder.addMatcher(cudaKernelCallExpr().bind("cudaLaunchKernel"), Callback); + Finder.addMatcher(memberExpr(isExpansionInMainFile(), + hasObjectExpression(hasType(cxxRecordDecl( + matchesName("__cuda_builtin_"))))) + .bind("cudaBuiltin"), + Callback); + Finder.addMatcher(declRefExpr(isExpansionInMainFile(), + to(enumConstantDecl( + matchesName("cuda.*|cublas.*|CUDA.*|CUBLAS*")))) + .bind("cudaEnumConstantRef"), + Callback); + Finder.addMatcher(varDecl(isExpansionInMainFile(), + hasType(enumDecl())) + .bind("cudaEnumConstantDecl"), + Callback); + Finder.addMatcher(varDecl(isExpansionInMainFile(), + hasType(typedefDecl(matchesName("cuda.*|cublas.*")))) + .bind("cudaTypedefVar"), + Callback); + // Array of elements of typedef type, Example: cudaStream_t streams[2]; + Finder.addMatcher(varDecl(isExpansionInMainFile(), + hasType(arrayType(hasElementType(typedefType( + hasDeclaration(typedefDecl(matchesName("cuda.*|cublas.*")))))))) + .bind("cudaTypedefVar"), + Callback); + Finder.addMatcher(varDecl(isExpansionInMainFile(), + hasType(cxxRecordDecl(matchesName("cuda.*|cublas.*")))) + .bind("cudaStructVar"), + Callback); + Finder.addMatcher(varDecl(isExpansionInMainFile(), + hasType(pointsTo(cxxRecordDecl( + matchesName("cuda.*|cublas.*"))))) + .bind("cudaStructVarPtr"), + Callback); + Finder.addMatcher(parmVarDecl(isExpansionInMainFile(), + hasType(namedDecl(matchesName("cuda.*|cublas.*")))) + .bind("cudaParamDecl"), + Callback); + Finder.addMatcher(parmVarDecl(isExpansionInMainFile(), + hasType(pointsTo(namedDecl( + matchesName("cuda.*|cublas.*"))))) + .bind("cudaParamDeclPtr"), + Callback); + Finder.addMatcher(expr(isExpansionInMainFile(), + sizeOfExpr(hasArgumentOfType(recordType(hasDeclaration( + cxxRecordDecl(matchesName("cuda.*|cublas.*"))))))) + .bind("cudaStructSizeOf"), + Callback); + Finder.addMatcher(stringLiteral(isExpansionInMainFile()).bind("stringLiteral"), + Callback); + Finder.addMatcher(varDecl(isExpansionInMainFile(), allOf( + hasAttr(attr::CUDAShared), + hasType(incompleteArrayType()))) + .bind("cudaSharedIncompleteArrayVar"), + Callback); +} + +void printStats(std::string fileSource, HipifyPPCallbacks &PPCallbacks, Cuda2HipCallback &Callback) { + int64_t sum = 0; + for (int i = 0; i < CONV_LAST; i++) { + sum += Callback.countReps[i] + PPCallbacks.countReps[i]; + } + llvm::outs() << "info: converted " << sum << " CUDA->HIP refs ( "; + for (int i = 0; i < CONV_LAST; i++) { + llvm::outs() << counterNames[i] << ':' << Callback.countReps[i] + PPCallbacks.countReps[i] << ' '; + } + llvm::outs() << "), by APIs ( "; + for (int i = 0; i < API_LAST; i++) { + llvm::outs() << apiNames[i] << ':' << Callback.countApiReps[i] + PPCallbacks.countApiReps[i] << ' '; + } + llvm::outs() << ") in \'" << fileSource << "\'\n"; +} int main(int argc, const char **argv) { - llvm::sys::PrintStackTraceOnErrorSignal(); - - int Result; - CommonOptionsParser OptionsParser(argc, argv, ToolTemplateCategory, llvm::cl::Required); - std::vector fileSources = OptionsParser.getSourcePathList(); - std::string dst = OutputFilename; if (dst.empty()) { dst = fileSources[0]; @@ -1651,7 +1807,6 @@ int main(int argc, const char **argv) { } dst += ".cu"; } - // copy source file since tooling makes changes "inplace" std::ifstream source(fileSources[0], std::ios::binary); std::ofstream dest(Inplace ? dst + ".prehip" : dst, std::ios::binary); @@ -1664,84 +1819,19 @@ int main(int argc, const char **argv) { HipifyPPCallbacks PPCallbacks(&Tool.getReplacements()); Cuda2HipCallback Callback(&Tool.getReplacements(), &Finder, &PPCallbacks); - Finder.addMatcher(callExpr(isExpansionInMainFile(), - callee(functionDecl(matchesName("cuda.*|cublas.*")))) - .bind("cudaCall"), - &Callback); - Finder.addMatcher(cudaKernelCallExpr().bind("cudaLaunchKernel"), &Callback); - Finder.addMatcher(memberExpr(isExpansionInMainFile(), - hasObjectExpression(hasType(cxxRecordDecl( - matchesName("__cuda_builtin_"))))) - .bind("cudaBuiltin"), - &Callback); - Finder.addMatcher(declRefExpr(isExpansionInMainFile(), - to(enumConstantDecl( - matchesName("cuda.*|cublas.*|CUDA.*|CUBLAS*")))) - .bind("cudaEnumConstantRef"), - &Callback); - Finder.addMatcher(varDecl(isExpansionInMainFile(), - hasType(enumDecl())) - .bind("cudaEnumConstantDecl"), - &Callback); - Finder.addMatcher(varDecl(isExpansionInMainFile(), - hasType(typedefDecl(matchesName("cuda.*|cublas.*")))) - .bind("cudaTypedefVar"), - &Callback); - // Array of elements of typedef type, Example: cudaStream_t streams[2]; - Finder.addMatcher(varDecl(isExpansionInMainFile(), - hasType(arrayType(hasElementType(typedefType( - hasDeclaration(typedefDecl(matchesName("cuda.*|cublas.*")))))))) - .bind("cudaTypedefVar"), - &Callback); - Finder.addMatcher(varDecl(isExpansionInMainFile(), - hasType(cxxRecordDecl(matchesName("cuda.*|cublas.*")))) - .bind("cudaStructVar"), - &Callback); - Finder.addMatcher(varDecl(isExpansionInMainFile(), - hasType(pointsTo(cxxRecordDecl( - matchesName("cuda.*|cublas.*"))))) - .bind("cudaStructVarPtr"), - &Callback); - Finder.addMatcher(parmVarDecl(isExpansionInMainFile(), - hasType(namedDecl(matchesName("cuda.*|cublas.*")))) - .bind("cudaParamDecl"), - &Callback); - Finder.addMatcher(parmVarDecl(isExpansionInMainFile(), - hasType(pointsTo(namedDecl( - matchesName("cuda.*|cublas.*"))))) - .bind("cudaParamDeclPtr"), - &Callback); - Finder.addMatcher(expr(isExpansionInMainFile(), - sizeOfExpr(hasArgumentOfType(recordType(hasDeclaration( - cxxRecordDecl(matchesName("cuda.*|cublas.*"))))))) - .bind("cudaStructSizeOf"), - &Callback); - Finder.addMatcher(stringLiteral(isExpansionInMainFile()).bind("stringLiteral"), - &Callback); - Finder.addMatcher(varDecl(isExpansionInMainFile(), allOf( - hasAttr(attr::CUDAShared), - hasType(incompleteArrayType()))) - .bind("cudaSharedIncompleteArrayVar"), - &Callback); + addAllMatchers(Finder, &Callback); auto action = newFrontendActionFactory(&Finder, &PPCallbacks); - - std::vector compilationStages; + std::vector compilationStages; compilationStages.push_back("--cuda-host-only"); - - for (auto Stage : compilationStages) { - Tool.appendArgumentsAdjuster( - getInsertArgumentAdjuster(Stage, ArgumentInsertPosition::BEGIN)); - Tool.appendArgumentsAdjuster(getInsertArgumentAdjuster("-std=c++11")); + Tool.appendArgumentsAdjuster(getInsertArgumentAdjuster(compilationStages[0], ArgumentInsertPosition::BEGIN)); + Tool.appendArgumentsAdjuster(getInsertArgumentAdjuster("-std=c++11")); #if defined(HIPIFY_CLANG_RES) - Tool.appendArgumentsAdjuster( - getInsertArgumentAdjuster("-resource-dir=" HIPIFY_CLANG_RES)); + Tool.appendArgumentsAdjuster(getInsertArgumentAdjuster("-resource-dir=" HIPIFY_CLANG_RES)); #endif - Tool.appendArgumentsAdjuster(getClangSyntaxOnlyAdjuster()); - Result = Tool.run(action.get()); - - Tool.clearArgumentsAdjusters(); - } + Tool.appendArgumentsAdjuster(getClangSyntaxOnlyAdjuster()); + int Result = Tool.run(action.get()); + Tool.clearArgumentsAdjusters(); LangOptions DefaultLangOptions; IntrusiveRefCntPtr DiagOpts = new DiagnosticOptions(); @@ -1749,13 +1839,13 @@ int main(int argc, const char **argv) { DiagnosticsEngine Diagnostics( IntrusiveRefCntPtr(new DiagnosticIDs()), &*DiagOpts, &DiagnosticPrinter, false); - SourceManager Sources(Diagnostics, Tool.getFiles()); DEBUG(dbgs() << "Replacements collected by the tool:\n"); for (const auto &r : Tool.getReplacements()) { DEBUG(dbgs() << r.toString() << "\n"); } + SourceManager Sources(Diagnostics, Tool.getFiles()); Rewriter Rewrite(Sources, DefaultLangOptions); if (!Tool.applyAllReplacements(Rewrite)) { @@ -1771,16 +1861,8 @@ int main(int argc, const char **argv) { } } if (PrintStats) { - int64_t sum = 0; - for (int i = 0; i < ConvTypes::CONV_LAST; i++) { - sum += Callback.countReps[i] + PPCallbacks.countReps[i]; - } - llvm::outs() << "info: converted " << sum << " CUDA->HIP refs ( "; - for (int i = 0; i < ConvTypes::CONV_LAST; i++) { - llvm::outs() << counterNames[i] << ':' - << Callback.countReps[i] + PPCallbacks.countReps[i] << ' '; - } - llvm::outs() << ") in \'" << fileSources[0] << "\'\n"; + printStats(fileSources[0], PPCallbacks, Callback); } + return Result; } diff --git a/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md b/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md index 189f172c95..2d589ec415 100644 --- a/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md +++ b/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md @@ -42,12 +42,12 @@ | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| | `cudaStreamAddCallback` | | Add a callback to a compute stream. | -| `cudaStreamAttachMemAsync` | | Attach memory to a stream asynchronously. | +| `cudaStreamAttachMemAsync` | | Attach managed memory to a stream asynchronously. | | `cudaStreamCreate` | `hipStreamCreate` | Create an asynchronous stream. | | `cudaStreamCreateWithFlags` | `hipStreamCreateWithFlags` | Create an asynchronous stream. | | `cudaStreamCreateWithPriority` | | Create an asynchronous stream with the specified priority. | | `cudaStreamDestroy` | `hipStreamDestroy` | Destroys and cleans up an asynchronous stream. | -| `cudaStreamGetFlags` | | Query the flags of a stream. | +| `cudaStreamGetFlags` | `hipStreamGetFlags` | Query the flags of a stream. | | `cudaStreamGetPriority` | | Query the priority of a stream. | | `cudaStreamQuery` | | Queries an asynchronous stream for completion status. | | `cudaStreamSynchronize` | `hipStreamSynchronize` | Waits for stream tasks to complete. | @@ -100,8 +100,8 @@ | `cudaHostAlloc` | `hipHostMalloc` | Allocates page-locked memory on the host. | | `cudaHostGetDevicePointer` | `hipHostGetDevicePointer` | Passes back device pointer of mapped host memory allocated by cudaHostAlloc or registered by cudaHostRegister. | | `cudaHostGetFlags` | `hipHostGetFlags` | Passes back flags used to allocate pinned host memory allocated by cudaHostAlloc. | -| `cudaHostRegister` | | Registers an existing host memory range for use by CUDA. | -| `cudaHostUnregister` | | Unregisters a memory range that was registered with cudaHostRegister. | +| `cudaHostRegister` | `hipHostRegister` | Registers an existing host memory range for use by CUDA. | +| `cudaHostUnregister` | `hipHostUnregister` | Unregisters a memory range that was registered with cudaHostRegister. | | `cudaMalloc` | `hipMalloc` | Allocate memory on the device. | | `cudaMalloc3D` | | Allocates logical 1D, 2D, or 3D memory objects on the device. | | `cudaMalloc3DArray` | | Allocate an array on the device. | @@ -231,7 +231,7 @@ | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| -| `cudaBindSurfaceToArra`y | | Binds an array to a surface. | +| `cudaBindSurfaceToArray` | | Binds an array to a surface. | | `cudaBindTexture` | | Binds a memory area to a texture. | | `cudaBindTexture2D` | | Binds a 2D memory area to a texture. | | `cudaBindTextureToArray` | | Binds an array to a texture. | diff --git a/docs/markdown/hip_faq.md b/docs/markdown/hip_faq.md index 8b50a9a8c7..d62f40510a 100644 --- a/docs/markdown/hip_faq.md +++ b/docs/markdown/hip_faq.md @@ -32,36 +32,52 @@ HIP provides the following: - Devices (hipSetDevice(), hipGetDeviceProperties(), etc.) - Memory management (hipMalloc(), hipMemcpy(), hipFree(), etc.) -- Streams (hipStreamCreate(), etc.) +- Streams (hipStreamCreate(),hipStreamSynchronize(), hipStreamWaitEvent(), etc.) - Events (hipEventRecord(), hipEventElapsedTime(), etc.) - Kernel launching (hipLaunchKernel is a standard C/C++ function that replaces <<< >>>) +- HIP Module API to control when adn how code is loaded. - CUDA-style kernel coordinate functions (threadIdx, blockIdx, blockDim, gridDim) +- Cross-lane instructions including shfl, ballot, any, all - Most device-side math built-ins - Error reporting (hipGetLastError(), hipGetErrorString()) The HIP API documentation describes each API and its limitations, if any, compared with the equivalent CUDA API. ### What is not supported? -#### Run-time features +#### Runtime/Driver API features +At a high-level, the following features are not supported: - Textures -- MemcpyToSymbol functions - Dynamic parallelism (CUDA 5.0) - Managed memory (CUDA 6.5) - Graphics interoperation with OpenGL or Direct3D +- CUDA Driver API (Under Development) +- CUDA IPC Functions (Under Development) + - CUDA array, mipmappedArray and pitched memory -- CUDA Driver API +- MemcpyToSymbol functions +- Queue priority controls + +See the [API Support Table](CUDA_Runtime_API_functions_supported_by_HIP.md) for more detailed information. #### Kernel language features - Device-side dynamic memory allocations (malloc, free, new, delete) (CUDA 4.0) - Virtual functions, indirect functions and try/catch (CUDA 4.0) - `__prof_trigger` -- PTX assembly (CUDA 4.0) -- Several kernel features are under development. See the [HIP Kernel Language](hip_kernel_language.md) for more information. +- PTX assembly (CUDA 4.0). HCC supports inline GCN assembly. +- Several kernel features are under development. See the [HIP Kernel Language](hip_kernel_language.md) for more information. These include: + - printf + - assert + - `__restrict__` + - `__launch_bounds__` + - `__threadfence*_`, `__syncthreads*` + - Unbounded loop unroll + + ### Is HIP a drop-in replacement for CUDA? No. HIP provides porting tools which do most of the work do convert CUDA code into portable C++ code that uses the HIP APIs. Most developers will port their code from CUDA to HIP and then maintain the HIP version. -HIP code provides the same performance as coding in native CUDA, plus the benefit that the code can also run on AMD platforms. +HIP code provides the same performance as native CUDA code, plus the benefits of running on AMD platforms. ### What specific version of CUDA does HIP support? HIP APIs and features do not map to a specific CUDA version. HIP provides a strong subset of functionality provided in CUDA, and the hipify tools can diff --git a/docs/markdown/hip_kernel_language.md b/docs/markdown/hip_kernel_language.md index 88f0237706..ba52b88433 100644 --- a/docs/markdown/hip_kernel_language.md +++ b/docs/markdown/hip_kernel_language.md @@ -156,7 +156,7 @@ The `__constant__` keyword is supported. The host writes constant memory before ### `__shared__` The `__shared__` keyword is supported. -`extern __shared__` allows the host to dynamically allocate shared memory and is specified as a launch parameter. This feature is under development. +`extern __shared__` allows the host to dynamically allocate shared memory and is specified as a launch parameter. HIP uses an alternate syntax based on the HIP_DYNAMIC_SHARED macro. ### `__managed__` Managed memory, including the `__managed__` keyword, are not supported in HIP. @@ -537,7 +537,6 @@ HIP supports the following atomic operations. ### Caveats and Features Under-Development: - HIP enables atomic operations on 32-bit integers. Additionally, it supports an atomic float add. AMD hardware, however, implements the float add using a CAS loop, so this function may not perform efficiently. -- wrapping increment and decrement are under development. ## Warp Cross-Lane Functions @@ -573,8 +572,6 @@ Applications can test whether the target platform supports the any/all instructi ### Warp Shuffle Functions -The following warp shuffle instructions are under development. - Half-float shuffles are not supported. The default width is warpSize---see [Warp Cross-Lane Functions](#warp-cross-lane-functions). Applications should not assume the warpSize is 32 or 64. ``` @@ -670,3 +667,22 @@ The following C++ features are not supported: - Run-time-type information (RTTI) - Virtual functions - Try/catch + +## Kernel Compilation +HIP now supports compiling C++/HIP kernels to binary. Eventhough HIP does not support fatbinary (yet), the user can specify the target for which the binary can be generated. The file format for binary is `.co` which means Code Object. The following command builds the binary using `hipcc`. + +`hipcc --genisa --target-isa=[TARGET GPU] [INPUT FILE] -o [OUTPUT FILE]` +```[TARGET GPU] = fiji/hawaii +[INPUT FILE] = Name of the file containing kernels +[OUTPUT FILE] = Name of the generated code object file``` + +Note that the kernel file should have `int main(){}` at the end it so that the binary is generated. This happens because HCC generates binaries at linking time rather than compilation + +You need 3 things to run kernel in binary. +1. Kernel Binary +2. Name of kernel binary +3. Name of the kernel + +We already got first two of them. In order to get name of the kernel, try `objdump -x [OUTPUT FILE]`. OUTPUT FILE is file generated by hipcc during kernel compilation. The output from objdump has symbol to the kernel whose name is mangled with `grid_launch_parm`, `__functor`, `__cxxamp_trampoline`. An example of how it looks is `ZN12_GLOBAL__N_137_Z3Cpy16grid_launch_parmPfS0__functor19__cxxamp_trampolineEiiiiiiPKfPf` where `Cpy` is the name of the kernel written in C++. + + diff --git a/docs/markdown/hip_performance.md b/docs/markdown/hip_performance.md new file mode 100644 index 0000000000..98197b3db7 --- /dev/null +++ b/docs/markdown/hip_performance.md @@ -0,0 +1,42 @@ +# HIP Performance Optimizations + +Please note that this document lists possible ways for experimenting with HIP stack to gain performance. Performance may vary from platform to platform. + +### Unpinned Memory Transfer Optimizations + +#### On Small BAR Setup + +There are two possible ways to transfer data from Host to Device (H2D) and Device to Host(D2H) + * Using Staging Buffers + * Using PinInPlace + +#### On Large BAR Setup + +There are two possible ways to transfer data from Host to Device (H2D) + * Using Staging Buffers + * Using PinInPlace + * Direct Memcpy + + And there are two possible ways to transfer data from Device to Host (D2H) + * Using Staging Buffers + * Using PinInPlace + +Some GPUs may not be able to directly access host memory, and in these cases we need to +stage the copy through an optimized pinned staging buffer, to implement H2D and D2H copies.The copy is broken into buffer-sized chunks to limit the size of the buffer and also to provide better performance by overlapping the CPU copies with the DMA copies. + +PinInPlace is another algorithm which pins the host memory "in-place", and copies it with the DMA +engine. + +By default staging buffers are used for unpinned memory transfers, however other ways can be used by enabling few environment variables (so no need to build the code again!!!) + +Following environment variables can be used: + +- HIP_PININPLACE - This environment variable forces the use of PinInPlace logic for all unpinned memory copies + +- HIP_OPTIMAL_MEM_TRANSFER- This environment variable enables a hybrid memory copy logic based on thresholds. These thresholds can be managed with following environment variables: + - HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE - Threshold in bytes for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic. + - HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING - Threshold in bytes for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. + - HIP_D2H_MEM_TRANSFER_THRESHOLD - Threshold in bytes for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic. + + + diff --git a/docs/markdown/hip_porting_driver_api.md b/docs/markdown/hip_porting_driver_api.md new file mode 100644 index 0000000000..b0ac3ecf1d --- /dev/null +++ b/docs/markdown/hip_porting_driver_api.md @@ -0,0 +1,257 @@ +# Porting CUDA Driver API + +## Introduction to the CUDA Driver and Runtime APIs +CUDA provides a separate CUDA Driver and Runtime APIs. The two APis have significant overlap in functionality: +- Both APIs support events, streams, memory management, memory copy, and error handling. +- Both APIs deliver similar performance. +- Driver APIs calls begin with the prefix `cu` while Runtime APIs begin with the prefix `cuda`. For example, the Driver API API contains `cuEventCreate` while the Runtime API contains `cudaEventCreate`, with similar functionality. +- The Driver API defines a different but largely overlapping error code code space than the Runtime API, and uses a different coding convention. For example, Driver API defines `CUDA_ERROR_INVALID_VALUE` while the Runtime API defines `cudaErrorInvalidValue` + + +The Driver API offers two additional pieces of functionality not provided by the Runtime API: cuModule and cuCtx APIs. + +### cuModule API +The Module section of the Driver API provides additional control over how and when accelerator + code objects are loaded. +For example, the driver API allows code objects to be loaded from files or memory pointers. +Symbols for kernels or global data can be extracted from the loaded code objects. +In contrast, the Runtime API automatically loads and (if necessary) compiles all of the +kernels from a executable binary when run. +In this mode, NVCC must be used to compile kernel code so the automatic loading can function correctly. + +Both Driver and Runtime APIs define a function for launching kernels (called `cuLaunchKernel` or `cudaLaunchKernel`. +The kernel arguments and the execution configuration (grid dimensions, group dimensions, dynamic shared memory, and stream) are passed as arguments to the launch function. +The Runtime additionally provides the `<<< >>>` syntax for launching kernels, which resembles +a special function call and is easier to use than explicit launch API (in particular with respect to +handling of kernel arguments). +However, this syntax is not standard C++ and is available only when NVCC is used to compile the host code. + +The Module features are useful in an environment which generate the code objects directly, such as a new +accelerator language front-end. Here, NVCC is not used. Instead, the environment may have a +different kernel language or different compilation flow. +Other environments have many kernels and do not want them to be all loaded automatically. +The Module functions can be used to load the generated code objects and launch kernels. +As we will see below, HIP defines a Module API which provides similar explict control over code +object managemenet. + +### cuCtx API +The Driver API defines "Context" and "Devices" as separate entities. +Contexts contain a single device, and a device can theoretically have multiple contexts. +Each context contains a set of streams and events specific to the context. +Historically contexts also defined a unique address space for the GPU, though this may not longer be the case in Unified Memory platforms (since the CPU and all the devices in the same process share a single unified address space). +The Context APIs also provide a mechanism to switch between devices, which allowed +a single CPU thread to send commands to different GPUs. HIP as well as a recent versions +of CUDA Runtime provide other mechanisms to accomplish this feat - for example using streams or +`cudaSetDevice`. + +The CUDA Runtime API unifies the Context API with the Device API. This simplifies the APIs +and has little loss of functionality since each Context can contain a single device, +and the benefits of multiple contexts has been replaced with other interfaces. HIP provides +a context API to facilitate easy porting from existing Driver codes. +In HIP, the Ctx functions largely provide an alternate syntax for changing the active device. +Most new applications will prefer to use `hipSetDevice` or the stream APIs. + +## HIP Module and Ctx APIs + +Rather than present two separate APIs, HIP extends the HIP API with new APIs for Modules +and Ctx control. + +### hipModule API + +Like the CUDA Driver API, the Module API provides additional control +over how code is loaded, including options to load code from files or from in-memory pointers. +NVCC and HCC target different architectures and use different code object formats : NVCC +is `cubin` or `ptx` files, while the HCC path is the `hsaco` format. The external +compilers which generate these code objects are responsible for generating and loading +the correct code object for each platform. Notably, there is not a fat binary format that +can contain code for both NVCC and HCC platforms. The following table summarizes the +formats used on each platform: + +| Format | APIs | NVCC | HCC | +| --- | --- | --- | --- | +| Code Object | hipModuleLoad, hipModuleLoadData | .cubin or PTX text | .hsaco | +| Fat Binary | hipModuleLoadFatBin | .fatbin | Under Development | + +hipcc uses NVCC and HCC to compile host codes. Both of these may embed code objects +into the final executable, and these code objects will be automatically loaded when +the application starts. +The hipModule API can be used to load additional code objects, and in this way +provides an extended capability to the automatically loaded code objects. HCC allows +both of these capabilities to be used together, if desired. Of course it is possible +to create a program with no kernels and thus no automatic loading. + + +### hipCtx API +HIP provides a `Ctx` API as a thin layer over the existing Device functions. This Ctx API +can be used to set the current context, or to query properties of the device associated with +the context. The current context is implicitly used by other APIs such as `hipStreamCreate`. + +### hipify translation of CUDA Driver API +The hipify tool will convert CUDA Driver APIs for streams, events, memory management to +the equivalent HIP driver calls. For example, `cuEventCreate` will be translated to +`hipEventCreate`. Hipify also converts error code from the Driver namespace and coding +convention to the equivalent HIP error code. Thus, HIP unifies the APis for these common functions. +[hipify support for translating driver API is Under Development] + +The memory copy APIs require additional explanation. The CUDA driver includes the memory +direction in the name of the API (ie `cuMemcpyH2D`) while the CUDA driver API provides +a single memory copy API with a parameter that specifies the direction and additionally +supports a "default" direction where the runtime determines the direction automatically. +HIP provides APis with both styles: for example, `hipMemcpyH2D` as well as `hipMemcpy`. +The first flavor may be faster in some cases since they avoid host overhead to detect the +different memory directions. + +HIP defines a single error space, and uses camel-case for all errors (i.e. `hipErrorInvalidValue`). + +### HCC Implementation Notes +#### .hsaco +The .hsaco format used by HCC is described in more detail [here](https://github.com/RadeonOpenCompute/ROCm-ComputeABI-Doc). +An example and blog that show how to use the format is [here](http://gpuopen.com/rocm-with-harmony-combining-opencl-hcc-hsa-in-a-single-program). hsaco can be generated by hcc + extractkernel tool, +cloc, the GCN assembler, or other tools. + +#### Address Spaces +HCC defines a process-wide address space where the CPU and all devices allocate +addresses from a single unified pool. Thus addresses may be shared between contexts, and +unlike the original CUDA definition a new context does not create a new address space for +the device. + +#### Using hipModuleLaunchKernel +`hipModuleLaunchKernel` is `cuLaunchKernel` in HIP world. It takes the same arguments as `cuLaunchKernel`. The argument `kernelParams` is not fully implemented for HCC. The workaround for it is, to use platform specific macros for each target. Or, `extra` argument can be used which works on both the platforms. + +#### Additional Information +- HCC allocates staging buffers (used for unpinned copies) on a per-device basis. +- HCC creates a primary context when the HIP API is called. So in a pure driver API code, HIP/HCC will create a primary context while HIP/NVCC will have empty context stack. HIP/HCC will push primary context to context stack when it is empty. This can have subtle differences on applications which mix the runtime and driver APIs. + +### NVCC Implementation Notes + +#### Interoperation between HIP and CUDA Driver +CUDA applications may want to mix CUDA driver code with HIP code (see example below). This +table shows the type equivalence to enable this interaction. + +|**HIP Type** |**CU Driver Type**| **CUDA Runtime Type** | +| ---- | ---- | ---- | +| hipModule | CUmodule | | +| hipFunction | CUfunction | | +| hipCtx_t | CUcontext | | +| hipDevice_t | CUdevice | | +| hipStream_t | CUstream | cudaStream_t | +| hipEvent_t | CUevent | cudaEvent_t | +| hipArray | CUarray | cudaArray | + + + +#### Compilation Flags +The hipModule interface does not support the hipModuleLoadEx function, which is used to control PTX compilaton options. +HCC does not use PTX and does not support the same compilation options. +In fact, HCC code objects always contain fully compiled ISA and do not require additional compilation as part of the load step. +Code which requires this functionaly should use platform-specific coding, calling `cuModuleLoadEx` +on the NVCC path and hipModuleLoad on the hcc path. For example: + +``` +hipModule module; +void *imagePtr = ... ; // Somehow populate data pointer with code object + +#ifdef __HIP_PLATFORM_NVCC__ +// Use CUDA driver API but write to hipModule since they are same type: +const int numOptions = 1; +CUJit_option options[numOptions]; +void * optionValues[numOptions]; + +options[0] = CU_JIT_MAX_REGISTERS; +unsigned maxRegs=15; +optionValues[0] = (void*) (&maxRegs); + +cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues); + +#else // __HIP_PLATFORM_HCC__ + +// HCC path does not support or require JIT options, so just load the module. +hipModuleLoadData(&module, imagePtr); + +#endif + +// Back to unified code - both paths above loaded the "module" variable. +hipFunction k; +hipModuleGetFunction(&k, module, "myKernel"); +``` + +The below sample shows how to use `hipModuleGetFunction`. + +``` +#include +#include +#include +#include +#include + +#define LEN 64 +#define SIZE LEN<<2 + +#ifdef __HIP_PLATFORM_HCC__ +#define fileName "vcpy_isa.co" +#endif + +#ifdef __HIP_PLATFORM_NVCC__ +#define fileName "vcpy_isa.ptx" +#endif + +#define kernel_name "hello_world" + +int main(){ + float *A, *B; + hipDeviceptr_t Ad, Bd; + A = new float[LEN]; + B = new float[LEN]; + + for(uint32_t i=0;iargBuffer(2); + memcpy(&argBuffer[0], &Ad, sizeof(void*)); + memcpy(&argBuffer[1], &Bd, sizeof(void*)); + + size_t size = argBuffer.size()*sizeof(void*); + + void *config[] = { + HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0], + HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, + HIP_LAUNCH_PARAM_END + }; + + hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&con fig); + + hipMemcpyDtoH(B, Bd, SIZE); + for(uint32_t i=0;iTexture support is under-development and not yet supported by HIP. diff --git a/include/hcc_detail/hipComplex.h b/include/hcc_detail/hipComplex.h new file mode 100644 index 0000000000..910cee946d --- /dev/null +++ b/include/hcc_detail/hipComplex.h @@ -0,0 +1,172 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#ifndef HIPCOMPLEX_H +#define HIPCOMPLEX_H + +typedef struct{ + float x; + float y; +}hipFloatComplex; + +__device__ static inline float hipCrealf(hipFloatComplex z){ + return z.x; +} + +__device__ static inline float hipCimagf(hipFloatComplex z){ + return z.y; +} + +__device__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){ + hipFloatComplex z; + z.x = a; + z.y = b; + return z; +} + +__device__ static inline hipFloatComplex hipConjf(hipFloatComplex z){ + hipFloatComplex ret; + ret.x = z.x; + ret.y = -z.y; + return ret; +} + +__device__ static inline float hipCsqabsf(hipFloatComplex z){ + return z.x * z.x + z.y * z.y; +} + +__device__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){ + return make_hipFloatComplex(p.x + q.x, p.y + q.y); +} + +__device__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){ + return make_hipFloatComplex(p.x - q.x, p.y - q.y); +} + +__device__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){ + return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y); +} + +__device__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){ + float sqabs = hipCsqabsf(q); + hipFloatComplex ret; + ret.x = (p.x * q.x + p.y * q.y)/sqabs; + ret.y = (p.y * q.x - p.x * q.y)/sqabs; + return ret; +} + +__device__ static inline float hipCabsf(hipFloatComplex z){ + return sqrtf(hipCsqabsf(z)); +} + + +typedef struct{ + double x; + double y; +}hipDoubleComplex; + +__device__ static inline double hipCreal(hipDoubleComplex z){ + return z.x; +} + +__device__ static inline double hipCimag(hipDoubleComplex z){ + return z.y; +} + +__device__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){ + hipDoubleComplex z; + z.x = a; + z.y = b; + return z; +} + +__device__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){ + hipDoubleComplex ret; + ret.x = z.x; + ret.y = z.y; + return ret; +} + +__device__ static inline double hipCsqabs(hipDoubleComplex z){ + return z.x * z.x + z.y * z.y; +} + +__device__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){ + return make_hipDoubleComplex(p.x + q.x, p.y + q.y); +} + +__device__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){ + return make_hipDoubleComplex(p.x - q.x, p.y - q.y); +} + +__device__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q){ + return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y); +} + +__device__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){ + double sqabs = hipCsqabs(q); + hipDoubleComplex ret; + ret.x = (p.x * q.x + p.y * q.y)/sqabs; + ret.y = (p.y * q.x - p.x * q.y)/sqabs; + return ret; +} + +__device__ static inline double hipCabs(hipDoubleComplex z){ + return sqrtf(hipCsqabs(z)); +} + +typedef hipFloatComplex hipComplex; + +__device__ static inline hipComplex make_hipComplex(float x, + float y){ + return make_hipFloatComplex(x, y); +} + +__device__ static inline hipFloatComplex hipComplexDoubleToFloat +(hipDoubleComplex z){ + return make_hipFloatComplex((float)z.x, (float)z.y); +} + +__device__ static inline hipDoubleComplex hipComplexFloatToDouble +(hipFloatComplex z){ + return make_hipDoubleComplex((double)z.x, (double)z.y); +} + +__device__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){ + float real = (p.x * q.x) + r.x; + float imag = (q.x * p.y) + r.y; + + real = -(p.y * q.y) + real; + imag = (p.x * q.y) + imag; + + return make_hipComplex(real, imag); +} + +__device__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){ + float real = (p.x * q.x) + r.x; + float imag = (q.x * p.y) + r.y; + + real = -(p.y * q.y) + real; + imag = (p.x * q.y) + imag; + + return make_hipDoubleComplex(real, imag); +} + +#endif diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h index 2f0c33a0d2..01c36afde4 100644 --- a/include/hcc_detail/hip_hcc.h +++ b/include/hcc_detail/hip_hcc.h @@ -160,12 +160,17 @@ class ihipCtx_t; #endif +// Just initialize the HIP runtime, but don't log any trace information. +#define HIP_INIT()\ + std::call_once(hip_initialized, ihipInit);\ + ihipCtxStackUpdate(); + // This macro should be called at the beginning of every HIP API. // It initialies the hip runtime (exactly once), and // generate trace string that can be output to stderr or to ATP file. #define HIP_INIT_API(...) \ - std::call_once(hip_initialized, ihipInit);\ + HIP_INIT()\ API_TRACE(__VA_ARGS__); #define ihipLogStatus(hipStatus) \ @@ -396,6 +401,22 @@ public: typedef ihipStreamCriticalBase_t ihipStreamCritical_t; typedef LockedAccessor LockedAccessor_StreamCrit_t; +class ihipModule_t{ +public: + hsa_executable_t executable; + hsa_code_object_t object; + std::string fileName; + void *ptr; + size_t size; +}; + + +class ihipFunction_t{ +public: + hsa_executable_symbol_t kernel_symbol; + uint64_t kernel; +}; + // Internal stream structure. class ihipStream_t { public: @@ -404,8 +425,9 @@ typedef uint64_t SeqNum_t ; ~ihipStream_t(); // kind is hipMemcpyKind - void copySync (LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind); - void locked_copySync (void* dst, const void* src, size_t sizeBytes, unsigned kind); + void copySync (LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn = true); + void locked_copySync (void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn = true); + void copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind); @@ -423,7 +445,7 @@ typedef uint64_t SeqNum_t ; // Use this if we already have the stream critical data mutex: void wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty=false); - + void launchModuleKernel(hsa_signal_t signal, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, uint32_t sharedMemBytes, void *kernarg, size_t kernSize, uint64_t kernel); // Non-threadsafe accessors - must be protected by high-level stream lock with accessor passed to function. SIGSEQNUM lastCopySeqId (LockedAccessor_StreamCrit_t &crit) const { return crit->_last_copy_signal ? crit->_last_copy_signal->_sigId : 0; }; @@ -443,6 +465,7 @@ public: hc::accelerator_view _av; unsigned _flags; + private: void enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal, ihipSignal_t *completionSignal); void waitCopy(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *signal); @@ -648,6 +671,8 @@ extern void ihipInit(); extern const char *ihipErrorString(hipError_t); extern ihipCtx_t *ihipGetTlsDefaultCtx(); extern void ihipSetTlsDefaultCtx(ihipCtx_t *ctx); +extern hipError_t ihipSynchronize(void); +extern hipError_t ihipCtxStackUpdate(); extern ihipDevice_t *ihipGetDevice(int); ihipCtx_t * ihipGetPrimaryCtx(unsigned deviceIndex); diff --git a/include/hcc_detail/hip_runtime.h b/include/hcc_detail/hip_runtime.h index d009bec35b..727604d8d8 100644 --- a/include/hcc_detail/hip_runtime.h +++ b/include/hcc_detail/hip_runtime.h @@ -568,16 +568,66 @@ __device__ void __threadfence_system(void); #define hipGridDim_y (hc_get_num_groups(1)) #define hipGridDim_z (hc_get_num_groups(2)) +// loop unrolling +__device__ static inline void* memcpy(void* dst, void* src, size_t size) +{ + uint64_t i = 0; + uint64_t totalLength = size/sizeof(uint32_t); + for(i=hipThreadIdx_x+hipBlockIdx_x*hipBlockDim_x; + i<(totalLength/4); + i = i + hipBlockDim_x * hipGridDim_x) + { + ((uint32_t*)dst)[4*i] = ((uint32_t*)src)[4*i]; + ((uint32_t*)dst)[4*i+1] = ((uint32_t*)src)[4*i+1]; + ((uint32_t*)dst)[4*i+2] = ((uint32_t*)src)[4*i+2]; + ((uint32_t*)dst)[4*i+3] = ((uint32_t*)src)[4*i+3]; + } + if(4*i < totalLength){ + ((uint32_t*)dst)[4*i] = ((uint32_t*)src)[4*i]; + ((uint32_t*)dst)[4*i+1] = ((uint32_t*)src)[4*i+1]; + ((uint32_t*)dst)[4*i+2] = ((uint32_t*)src)[4*i+2]; + ((uint32_t*)dst)[4*i+3] = ((uint32_t*)src)[4*i+3]; + + } + return nullptr; +} + +__device__ static inline void* memset(void* ptr, uint8_t val, size_t size) +{ + uint32_t _val = 0; + _val = (val | val << 8 | val << 16 | val << 24); + uint64_t totalLength = size/sizeof(uint32_t); + uint64_t i = 0; + for(i=hipThreadIdx_x+hipBlockIdx_x*hipBlockDim_x; + i<(totalLength/4); + i = i + hipBlockDim_x * hipGridDim_x) + { + ((uint32_t*)ptr)[4*i] = _val; + ((uint32_t*)ptr)[4*i+1] = _val; + ((uint32_t*)ptr)[4*i+2] = _val; + ((uint32_t*)ptr)[4*i+3] = _val; + } + if(4*i < totalLength){ + ((uint32_t*)ptr)[4*i] = _val; + ((uint32_t*)ptr)[4*i+1] = _val; + ((uint32_t*)ptr)[4*i+2] = _val; + ((uint32_t*)ptr)[4*i+3] = _val; + + } + return nullptr; +} + #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE) #define HIP_KERNEL_NAME(...) __VA_ARGS__ #ifdef __HCC_CPP__ -hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp); -hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp); -hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp); -hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp); -void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp); +extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp); +extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp); +extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp); +extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp); +extern void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, const hipStream_t stream); +extern void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp); // TODO - move to common header file. #define KNRM "\x1B[0m" @@ -589,10 +639,9 @@ do {\ grid_launch_parm lp;\ lp.dynamic_group_mem_bytes = _groupMemBytes; \ hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp)); \ - if (HIP_TRACE_API) {\ - fprintf(stderr, KGRN "< + +#if defined(__HIP_PLATFORM_HCC__) && !defined (__HIP_PLATFORM_NVCC__) +#include +#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__) +#include +#else +#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__"); +#endif + diff --git a/include/nvcc_detail/hipComplex.h b/include/nvcc_detail/hipComplex.h new file mode 100644 index 0000000000..b5c182bd4d --- /dev/null +++ b/include/nvcc_detail/hipComplex.h @@ -0,0 +1,108 @@ +#ifndef HIPCOMPLEX_H +#define HIPCOMPLEX_H + +#include"cuComplex.h" + +typedef cuFloatComplex hipFloatComplex; + +__device__ __host__ static inline float hipCrealf(hipFloatComplex z){ + return cuCrealf(z); +} + +__device__ __host__ static inline float hipCimagf(hipFloatComplex z){ + return cuCimagf(z); +} + +__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){ + return make_cuFloatComplex(a, b); +} + +__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z){ + return cuConjf(z); +} + +__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z){ + return cuCabsf(z) * cuCabsf(z); +} + +__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){ + return cuCaddf(p, q); +} + +__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){ + return cuCsubf(p, q); +} + +__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){ + return cuCmulf(p, q); +} + +__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){ + return cuCdivf(p, q); +} + +__device__ __host__ static inline float hipCabsf(hipFloatComplex z){ + return cuCabsf(p, q); +} + +typedef cuDoubleComplex hipDoubleComplex; + +__device__ __host__ static inline double hipCreal(hipDoubleComplex z){ + return cuCreal(z); +} + +__device__ __host__ static inline double hipCimag(hipDoubleComplex z){ + return cuCimag(z); +} + +__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){ + return make_cuDoubleComplex(a, b); +} + +__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){ + return cuConj(z); +} + +__device__ __host__ static inline hipDoubleComplex hipCsqabs(hipDoubleComplex z){ + return cuCabs(z) * cuCabs(z); +} + +__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){ + return cuCadd(p, q); +} + +__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){ + return cuCsub(p, q); +} + +__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){ + return cuCdiv(p, q); +} + +__device__ __host__ static inline double hipCabs(hipDoubleComplex z){ + return cuCabs(z); +} + +typedef cuFloatComplex hipComplex; + +__device__ __host__ static inline hipComplex make_Complex(float x, float y){ + return make_cuComplex(x, y); +} + +__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z){ + return cuComplexDoubleToFloat(z); +} + +__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z){ + return cuComplexFloatToDouble(z); +} + +__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){ + return cuCfmaf(p, q, r); +} + +__device__ __host__ static inline hipDoubleComplex hipCfma(hipComplex p, hipComplex q, hipComplex r){ + return cuCfma(p, q, r); +} + +#endif diff --git a/include/nvcc_detail/hip_runtime_api.h b/include/nvcc_detail/hip_runtime_api.h index a51bca2d02..baeb080195 100644 --- a/include/nvcc_detail/hip_runtime_api.h +++ b/include/nvcc_detail/hip_runtime_api.h @@ -22,7 +22,7 @@ THE SOFTWARE. #pragma once #include - +#include #ifdef __cplusplus extern "C" { @@ -58,8 +58,20 @@ hipMemcpyHostToHost #define hipHostRegisterPortable cudaHostRegisterPortable #define hipHostRegisterMapped cudaHostRegisterMapped +#define HIP_LAUNCH_PARAM_BUFFER_POINTER CU_LAUNCH_PARAM_BUFFER_POINTER +#define HIP_LAUNCH_PARAM_BUFFER_SIZE CU_LAUNCH_PARAM_BUFFER_SIZE +#define HIP_LAUNCH_PARAM_END CU_LAUNCH_PARAM_END + typedef cudaEvent_t hipEvent_t; typedef cudaStream_t hipStream_t; +typedef CUcontext hipCtx_t; +typedef CUsharedconfig hipSharedMemConfig; +typedef CUfunc_cache hipFuncCache; +typedef CUdevice hipDevice_t; +typedef CUmodule hipModule_t; +typedef CUfunction hipFunction_t; +typedef CUdeviceptr hipDeviceptr_t; + //typedef cudaChannelFormatDesc hipChannelFormatDesc; #define hipChannelFormatDesc cudaChannelFormatDesc @@ -85,6 +97,20 @@ switch(cuError) { }; } +inline static hipError_t hipCUResultTohipError(CUresult cuError) { //TODO Populate further +switch(cuError) { + case CUDA_SUCCESS : return hipSuccess; + case CUDA_ERROR_OUT_OF_MEMORY : return hipErrorMemoryAllocation ; + case CUDA_ERROR_INVALID_VALUE : return hipErrorInvalidValue ; + case CUDA_ERROR_INVALID_DEVICE : return hipErrorInvalidDevice ; + case CUDA_ERROR_DEINITIALIZED : return hipErrorDeinitialized ; + case CUDA_ERROR_NO_DEVICE : return hipErrorNoDevice ; + case CUDA_ERROR_INVALID_CONTEXT : return hipErrorInvalidContext ; + case CUDA_ERROR_NOT_INITIALIZED : return hipErrorNotInitialized ; + default : return hipErrorUnknown; // Note - translated error. +}; +} + // TODO match the error enum names of hip and cuda inline static cudaError_t hipErrorToCudaError(hipError_t hError) { switch(hError) { @@ -124,6 +150,11 @@ default: } } +inline static hipError_t hipInit(unsigned int flags) +{ + return hipCUResultTohipError(cuInit(flags)); +} + inline static hipError_t hipDeviceReset() { return hipCUDAErrorTohipError(cudaDeviceReset()); } @@ -182,6 +213,25 @@ inline static hipError_t hipHostFree(void* ptr) { inline static hipError_t hipSetDevice(int device) { return hipCUDAErrorTohipError(cudaSetDevice(device)); } + +inline static hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, + void* src, size_t size) +{ + return hipCUResultTohipError(cuMemcpyHtoD(dst, src, size)); +} + +inline static hipError_t hipMemcpyDtoH(void* dst, + hipDeviceptr_t src, size_t size) +{ + return hipCUResultTohipError(cuMemcpyDtoH(dst, src, size)); +} + +inline static hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, + hipDeviceptr_t src, size_t size) +{ + return hipCUResultTohipError(cuMemcpyDtoD(dst, src, size)); +} + inline static hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind copyKind) { return hipCUDAErrorTohipError(cudaMemcpy(dst, src, sizeBytes, hipMemcpyKindToCudaMemcpyKind(copyKind))); } @@ -347,20 +397,6 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att return hipCUDAErrorTohipError(cerror); } -template -inline static hipError_t hipOccupancyMaxPotentialBlockSize( - int *minGridSize, - int *blockSize, - T func, - size_t dynamicSMemSize = 0, - int blockSizeLimit = 0, - unsigned int flags = 0 - ){ - cudaError_t cerror; - cerror = cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func, dynamicSMemSize, blockSizeLimit, flags); - return hipCUDAErrorTohipError(cerror); -} - inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( int *numBlocks, const void* func, @@ -458,7 +494,6 @@ inline static hipError_t hipDriverGetVersion(int *driverVersion) return hipCUDAErrorTohipError(err); } - inline static hipError_t hipDeviceCanAccessPeer ( int* canAccessPeer, int device, int peerDevice ) { return hipCUDAErrorTohipError(cudaDeviceCanAccessPeer(canAccessPeer, device, peerDevice)); @@ -474,6 +509,16 @@ inline static hipError_t hipDeviceEnablePeerAccess ( int peerDevice, unsigned return hipCUDAErrorTohipError(cudaDeviceEnablePeerAccess ( peerDevice, flags )); } +inline static hipError_t hipCtxDisablePeerAccess ( hipCtx_t peerCtx ) +{ + return hipCUResultTohipError(cuCtxDisablePeerAccess ( peerCtx )); +} + +inline static hipError_t hipCtxEnablePeerAccess ( hipCtx_t peerCtx, unsigned int flags ) +{ + return hipCUResultTohipError(cuCtxEnablePeerAccess ( peerCtx, flags )); +} + inline static hipError_t hipMemcpyPeer ( void* dst, int dstDevice, const void* src, int srcDevice, size_t count ) { return hipCUDAErrorTohipError(cudaMemcpyPeer ( dst, dstDevice, src, srcDevice, count )); @@ -499,12 +544,145 @@ inline static hipError_t hipEventQuery(hipEvent_t event) return hipCUDAErrorTohipError(cudaEventQuery(event)); } +inline static hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device) +{ + return hipCUResultTohipError(cuCtxCreate ( ctx,flags,device )); +} + +inline static hipError_t hipCtxDestroy(hipCtx_t ctx) +{ + return hipCUResultTohipError(cuCtxDestroy ( ctx )); +} + +inline static hipError_t hipCtxPopCurrent(hipCtx_t* ctx) +{ + return hipCUResultTohipError(cuCtxPopCurrent ( ctx )); +} + +inline static hipError_t hipCtxPushCurrent(hipCtx_t ctx) +{ + return hipCUResultTohipError(cuCtxPushCurrent ( ctx )); +} + +inline static hipError_t hipCtxSetCurrent(hipCtx_t ctx) +{ + return hipCUResultTohipError(cuCtxSetCurrent ( ctx )); +} + +inline static hipError_t hipCtxGetCurrent(hipCtx_t* ctx) +{ + return hipCUResultTohipError(cuCtxGetCurrent ( ctx )); +} + +inline static hipError_t hipCtxGetDevice(hipDevice_t *device) +{ + return hipCUResultTohipError(cuCtxGetDevice ( device )); +} + +inline static hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion) +{ + return hipCUResultTohipError(cuCtxGetApiVersion ( ctx,(unsigned int*)apiVersion )); +} + +inline static hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig ) +{ + return hipCUResultTohipError(cuCtxGetCacheConfig ( cacheConfig )); +} + +inline static hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig ) +{ + return hipCUResultTohipError(cuCtxSetCacheConfig ( cacheConfig )); +} + +inline static hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config ) +{ + return hipCUResultTohipError(cuCtxSetSharedMemConfig ( config )); +} + +inline static hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig ) +{ + return hipCUResultTohipError(cuCtxGetSharedMemConfig ( pConfig )); +} + +inline static hipError_t hipCtxSynchronize ( void ) +{ + return hipCUResultTohipError(cuCtxSynchronize ( )); +} + +inline static hipError_t hipCtxGetFlags ( unsigned int* flags ) +{ + return hipCUResultTohipError(cuCtxGetFlags ( flags )); +} + +inline static hipError_t hipCtxDetach(hipCtx_t ctx) +{ + return hipCUResultTohipError(cuCtxDetach(ctx)); +} + +inline static hipError_t hipDeviceGet(hipDevice_t *device, int ordinal) +{ + return hipCUResultTohipError(cuDeviceGet(device, ordinal)); +} + +inline static hipError_t hipModuleLoad(hipModule_t *module, const char* fname) +{ + return hipCUResultTohipError(cuModuleLoad(module, fname)); +} + +inline static hipError_t hipModuleUnload(hipModule_t hmod) +{ + return hipCUResultTohipError(cuModuleUnload(hmod)); +} + +inline static hipError_t hipModuleGetFunction(hipFunction_t *function, + hipModule_t module, const char *kname) +{ + return hipCUResultTohipError(cuModuleGetFunction(function, module, kname)); +} + +inline static hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, + hipModule_t hmod, const char* name) +{ + return hipCUResultTohipError(cuModuleGetGlobal(dptr, bytes, hmod, name)); +} + +inline static hipError_t hipModuleLoadData(hipModule_t *module, const void *image) +{ + return hipCUResultTohipError(cuModuleLoadData(module, image)); +} + +inline static hipError_t hipModuleLaunchKernel(hipFunction_t f, + unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, + unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, + unsigned int sharedMemBytes, hipStream_t stream, + void **kernelParams, void **extra) +{ + return hipCUResultTohipError(cuLaunchKernel(f, + gridDimX, gridDimY, gridDimZ, + blockDimX, blockDimY, blockDimZ, + sharedMemBytes, stream, kernelParams, extra)); +} + #ifdef __cplusplus } #endif #ifdef __CUDACC__ +template +inline static hipError_t hipOccupancyMaxPotentialBlockSize( + int *minGridSize, + int *blockSize, + T func, + size_t dynamicSMemSize = 0, + int blockSizeLimit = 0, + unsigned int flags = 0 + ){ + cudaError_t cerror; + cerror = cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func, dynamicSMemSize, blockSizeLimit, flags); + return hipCUDAErrorTohipError(cerror); +} + template inline static hipError_t hipBindTexture(size_t *offset, const struct texture &tex, diff --git a/samples/0_Intro/module_api/Makefile b/samples/0_Intro/module_api/Makefile new file mode 100644 index 0000000000..25ae7b8411 --- /dev/null +++ b/samples/0_Intro/module_api/Makefile @@ -0,0 +1,21 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif +HIPCC=$(HIP_PATH)/bin/hipcc +HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler) + +ifeq (${HIP_PLATFORM}, hcc) + GENCODEOBJECT_FLAGS=--target-isa-fiji +endif + +all: vcpy_isa.compile runKernel.hip.out + +vcpy_isa.compile: vcpy_isa.cpp + $(HIPCC) --gencodeobject $(GENCODEOBJECT_FLAGS) vcpy_isa.cpp -o vcpy_isa.co + +runKernel.hip.out: runKernel.cpp + $(HIPCC) runKernel.cpp -o runKernel.hip.out + +clean: + rm -f *.co *.out diff --git a/samples/0_Intro/module_api/runKernel.cpp b/samples/0_Intro/module_api/runKernel.cpp new file mode 100644 index 0000000000..e4fa1b6d93 --- /dev/null +++ b/samples/0_Intro/module_api/runKernel.cpp @@ -0,0 +1,105 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include +#include +#include +#include + +#define LEN 64 +#define SIZE LEN<<2 + +#ifdef __HIP_PLATFORM_HCC__ +#define fileName "vcpy_isa.co" +#define kernel_name "ZN12_GLOBAL__N_146_Z11hello_world16grid_launch_parmPfS0__functor19__cxxamp_trampolineEiiiiiiPKfPf" +#endif + +#ifdef __HIP_PLATFORM_NVCC__ +#define fileName "vcpy_isa.ptx" +#define kernel_name "hello_world" +#endif + +int main(){ + float *A, *B; + hipDeviceptr_t Ad, Bd; + A = new float[LEN]; + B = new float[LEN]; + + for(uint32_t i=0;iargBuffer(5); + uint32_t *ptr32_t = (uint32_t*)&argBuffer[0]; + memcpy(ptr32_t + 0, &one, sizeof(uint32_t)); + memcpy(ptr32_t + 1, &one, sizeof(uint32_t)); + memcpy(ptr32_t + 2, &one, sizeof(uint32_t)); + memcpy(ptr32_t + 3, &len, sizeof(uint32_t)); + memcpy(ptr32_t + 4, &one, sizeof(uint32_t)); + memcpy(ptr32_t + 5, &one, sizeof(uint32_t)); + memcpy(&argBuffer[3], &Ad, sizeof(void*)); + memcpy(&argBuffer[4], &Bd, sizeof(void*)); + + + + size_t size = argBuffer.size()*sizeof(void*); + + void *config[] = { + HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0], + HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, + HIP_LAUNCH_PARAM_END + }; + + hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config); + + hipMemcpyDtoH(B, Bd, SIZE); + for(uint32_t i=LEN-4;i + +__global__ void hello_world(hipLaunchParm lp, float *a, float *b) +{ + int tx = hipThreadIdx_x; + b[tx] = a[tx]; +} + +int main(){} diff --git a/samples/0_Intro/module_api/vcpy_isa.cu b/samples/0_Intro/module_api/vcpy_isa.cu new file mode 100644 index 0000000000..d2a0838604 --- /dev/null +++ b/samples/0_Intro/module_api/vcpy_isa.cu @@ -0,0 +1,6 @@ + +extern "C" __global__ void hello_world(float *a, float *b) +{ + int tx = threadIdx.x; + b[tx] = a[tx]; +} diff --git a/samples/0_Intro/module_api/vcpy_isa.ptx b/samples/0_Intro/module_api/vcpy_isa.ptx new file mode 100644 index 0000000000..62eb3f63df --- /dev/null +++ b/samples/0_Intro/module_api/vcpy_isa.ptx @@ -0,0 +1,38 @@ +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-19856038 +// Cuda compilation tools, release 7.5, V7.5.17 +// Based on LLVM 3.4svn +// + +.version 4.3 +.target sm_20 +.address_size 64 + + // .globl hello_world + +.visible .entry hello_world( + .param .u64 hello_world_param_0, + .param .u64 hello_world_param_1 +) +{ + .reg .f32 %f<2>; + .reg .b32 %r<2>; + .reg .b64 %rd<8>; + + + ld.param.u64 %rd1, [hello_world_param_0]; + ld.param.u64 %rd2, [hello_world_param_1]; + cvta.to.global.u64 %rd3, %rd2; + cvta.to.global.u64 %rd4, %rd1; + mov.u32 %r1, %tid.x; + mul.wide.s32 %rd5, %r1, 4; + add.s64 %rd6, %rd4, %rd5; + ld.global.f32 %f1, [%rd6]; + add.s64 %rd7, %rd3, %rd5; + st.global.f32 [%rd7], %f1; + ret; +} + + diff --git a/src/hip_context.cpp b/src/hip_context.cpp index ee9e37a1a1..e19c45d2c3 100644 --- a/src/hip_context.cpp +++ b/src/hip_context.cpp @@ -29,7 +29,22 @@ THE SOFTWARE. // Stack of contexts thread_local std::stack tls_ctxStack; +hipError_t ihipCtxStackUpdate() +{ + //HIP_INIT_API(); + hipError_t e = hipSuccess; + if(tls_ctxStack.empty()) { + tls_ctxStack.push(ihipGetTlsDefaultCtx()); + } + + return ihipLogStatus(e); +} + +/** + * @return #hipSuccess, #hipErrorInvalidValue + */ +//--- hipError_t hipInit(unsigned int flags) { HIP_INIT_API(flags); @@ -44,7 +59,10 @@ hipError_t hipInit(unsigned int flags) return ihipLogStatus(e); } - +/** + * @return #hipSuccess + */ +//--- hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device) { HIP_INIT_API(ctx, flags, device); // FIXME - review if we want to init @@ -57,7 +75,10 @@ hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device) return ihipLogStatus(e); } - +/** + * @return #hipSuccess, #hipErrorInvalidDevice + */ +//--- hipError_t hipDeviceGet(hipDevice_t *device, int deviceId) { HIP_INIT_API(device, deviceId); // FIXME - review if we want to init @@ -88,20 +109,39 @@ hipError_t hipDriverGetVersion(int *driverVersion) return ihipLogStatus(hipSuccess); } +/** + * @return #hipSuccess, #hipErrorInvalidValue + */ +//--- hipError_t hipCtxDestroy(hipCtx_t ctx) { + HIP_INIT_API(ctx); hipError_t e = hipSuccess; ihipCtx_t* currentCtx= ihipGetTlsDefaultCtx(); - if(currentCtx == ctx) { - //need to destroy the ctx associated with calling thread - tls_ctxStack.pop(); + ihipCtx_t* primaryCtx= ((ihipDevice_t*)ctx->getDevice())->_primaryCtx; + if(primaryCtx== ctx) + { + e = hipErrorInvalidValue; } - delete ctx; //As per CUDA docs , attempting to access ctx from those threads which has this ctx as current, will result in the error HIP_ERROR_CONTEXT_IS_DESTROYED. + else + { + if(currentCtx == ctx) { + //need to destroy the ctx associated with calling thread + tls_ctxStack.pop(); + } + delete ctx; //As per CUDA docs , attempting to access ctx from those threads which has this ctx as current, will result in the error HIP_ERROR_CONTEXT_IS_DESTROYED. + } + return ihipLogStatus(e); } +/** + * @return #hipSuccess + */ +//--- hipError_t hipCtxPopCurrent(hipCtx_t* ctx) { + HIP_INIT_API(ctx); hipError_t e = hipSuccess; ihipCtx_t* tempCtx; *ctx = ihipGetTlsDefaultCtx(); @@ -119,8 +159,13 @@ hipError_t hipCtxPopCurrent(hipCtx_t* ctx) return ihipLogStatus(e); } +/** + * @return #hipSuccess, #hipErrorInvalidContext + */ +//--- hipError_t hipCtxPushCurrent(hipCtx_t ctx) { + HIP_INIT_API(ctx); hipError_t e = hipSuccess; if(ctx != NULL) { //TODO- is this check needed? ihipSetTlsDefaultCtx(ctx); @@ -132,19 +177,30 @@ hipError_t hipCtxPushCurrent(hipCtx_t ctx) return ihipLogStatus(e); } +/** + * @return #hipSuccess + */ +//--- hipError_t hipCtxGetCurrent(hipCtx_t* ctx) { + HIP_INIT_API(ctx); hipError_t e = hipSuccess; - - *ctx = ihipGetTlsDefaultCtx(); - if(*ctx == nullptr) { - *ctx = NULL; //TODO - is it required? Can return nullptr? + if(!tls_ctxStack.empty()) { + *ctx= tls_ctxStack.top(); + } + else { + *ctx = NULL; } return ihipLogStatus(e); } +/** + * @return #hipSuccess + */ +//--- hipError_t hipCtxSetCurrent(hipCtx_t ctx) { + HIP_INIT_API(ctx); hipError_t e = hipSuccess; if(ctx == NULL) { tls_ctxStack.pop(); @@ -156,8 +212,13 @@ hipError_t hipCtxSetCurrent(hipCtx_t ctx) return ihipLogStatus(e); } +/** + * @return #hipSuccess, #hipErrorInvalidContext + */ +//--- hipError_t hipCtxGetDevice(hipDevice_t *device) { + HIP_INIT_API(device); hipError_t e = hipSuccess; ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); @@ -171,6 +232,10 @@ hipError_t hipCtxGetDevice(hipDevice_t *device) return ihipLogStatus(e); } +/** + * @return #hipSuccess + */ +//--- hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion) { HIP_INIT_API(apiVersion); @@ -182,6 +247,10 @@ hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion) return ihipLogStatus(hipSuccess); } +/** + * @return #hipSuccess + */ +//--- hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig ) { HIP_INIT_API(cacheConfig); @@ -191,6 +260,10 @@ hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig ) return ihipLogStatus(hipSuccess); } +/** + * @return #hipSuccess + */ +//--- hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig ) { HIP_INIT_API(cacheConfig); @@ -200,6 +273,10 @@ hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig ) return ihipLogStatus(hipSuccess); } +/** + * @return #hipSuccess + */ +//--- hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config ) { HIP_INIT_API(config); @@ -209,6 +286,10 @@ hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config ) return ihipLogStatus(hipSuccess); } +/** + * @return #hipSuccess + */ +//--- hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig ) { HIP_INIT_API(pConfig); @@ -216,4 +297,28 @@ hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig ) *pConfig = hipSharedMemBankSizeFourByte; return ihipLogStatus(hipSuccess); -} \ No newline at end of file +} + +/** + * @return #hipSuccess + */ +//--- +hipError_t hipCtxSynchronize ( void ) +{ + HIP_INIT_API(1); + return ihipSynchronize(); //TODP Shall check validity of ctx? +} + +/** + * @return #hipSuccess + */ +//--- +hipError_t hipCtxGetFlags ( unsigned int* flags ) +{ + HIP_INIT_API(flags); + hipError_t e = hipSuccess; + ihipCtx_t* tempCtx; + tempCtx = ihipGetTlsDefaultCtx(); + *flags = tempCtx->_ctxFlags; + return ihipLogStatus(e); +} diff --git a/src/hip_device.cpp b/src/hip_device.cpp index e3d7fefa91..14338c11de 100644 --- a/src/hip_device.cpp +++ b/src/hip_device.cpp @@ -160,11 +160,8 @@ hipError_t hipSetDevice(int deviceId) */ hipError_t hipDeviceSynchronize(void) { - HIP_INIT_API(); - - ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish. - - return ihipLogStatus(hipSuccess); + HIP_INIT_API(1); + return ihipSynchronize(); } diff --git a/src/hip_event.cpp b/src/hip_event.cpp index ca30c3c62b..4324583690 100644 --- a/src/hip_event.cpp +++ b/src/hip_event.cpp @@ -33,13 +33,14 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags) // TODO - support hipEventDefault, hipEventBlockingSync, hipEventDisableTiming if (flags == 0) { - ihipEvent_t *eh = event->_handle = new ihipEvent_t(); + ihipEvent_t *eh = new ihipEvent_t(); eh->_state = hipEventStatusCreated; eh->_stream = NULL; eh->_flags = flags; eh->_timestamp = 0; eh->_copySeqId = 0; + *event = eh; } else { e = hipErrorInvalidValue; } @@ -71,7 +72,7 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) { HIP_INIT_API(event, stream); - ihipEvent_t *eh = event._handle; + ihipEvent_t *eh = event; if (eh && eh->_state != hipEventStatusUnitialized) { eh->_stream = stream; @@ -106,10 +107,10 @@ hipError_t hipEventDestroy(hipEvent_t event) { HIP_INIT_API(event); - event._handle->_state = hipEventStatusUnitialized; + event->_state = hipEventStatusUnitialized; - delete event._handle; - event._handle = NULL; + delete event; + event = NULL; // TODO - examine return additional error codes return ihipLogStatus(hipSuccess); @@ -121,7 +122,7 @@ hipError_t hipEventSynchronize(hipEvent_t event) { HIP_INIT_API(event); - ihipEvent_t *eh = event._handle; + ihipEvent_t *eh = event; if (eh) { if (eh->_state == hipEventStatusUnitialized) { @@ -150,8 +151,8 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop) { HIP_INIT_API(ms, start, stop); - ihipEvent_t *start_eh = start._handle; - ihipEvent_t *stop_eh = stop._handle; + ihipEvent_t *start_eh = start; + ihipEvent_t *stop_eh = stop; ihipSetTs(start); ihipSetTs(stop); @@ -195,7 +196,7 @@ hipError_t hipEventQuery(hipEvent_t event) { HIP_INIT_API(event); - ihipEvent_t *eh = event._handle; + ihipEvent_t *eh = event; // TODO-stream - need to read state of signal here: The event may have become ready after recording.. // TODO-HCC - use get_hsa_signal here. diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index f0d123b64a..ea8604b7c4 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -158,7 +158,12 @@ ihipCtx_t *ihipGetTlsDefaultCtx() return tls_defaultCtx; } +hipError_t ihipSynchronize(void) +{ + ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish. + return ihipLogStatus(hipSuccess); +} //================================================================================================= // ihipSignal_t: @@ -482,6 +487,53 @@ int ihipStream_t::preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t } +void ihipStream_t::launchModuleKernel(hsa_signal_t signal, + uint32_t blockDimX, + uint32_t blockDimY, + uint32_t blockDimZ, + uint32_t gridDimX, + uint32_t gridDimY, + uint32_t gridDimZ, + uint32_t sharedMemBytes, + void *kernarg, + size_t kernSize, + uint64_t kernel){ + hsa_status_t status; + void *kern; + hsa_amd_memory_pool_t *pool = reinterpret_cast(_av.get_hsa_kernarg_region()); + status = hsa_amd_memory_pool_allocate(*pool, kernSize, 0, &kern); + status = hsa_amd_agents_allow_access(1, (hsa_agent_t*)_av.get_hsa_agent(), 0, kern); + memcpy(kern, kernarg, kernSize); + hsa_queue_t *Queue = (hsa_queue_t*)_av.get_hsa_queue(); + const uint32_t queue_mask = Queue->size-1; + uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue); + hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]); + + dispatch_packet->completion_signal = signal; + dispatch_packet->workgroup_size_x = blockDimX; + dispatch_packet->workgroup_size_y = blockDimY; + dispatch_packet->workgroup_size_z = blockDimZ; + dispatch_packet->grid_size_x = blockDimX * gridDimX; + dispatch_packet->grid_size_y = blockDimY * gridDimY; + dispatch_packet->grid_size_z = blockDimZ * gridDimZ; + dispatch_packet->group_segment_size = 0; + dispatch_packet->private_segment_size = sharedMemBytes; + dispatch_packet->kernarg_address = kern; + dispatch_packet->kernel_object = kernel; + uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + + uint16_t setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + uint32_t header32 = header | (setup << 16); + + __atomic_store_n((uint32_t*)(dispatch_packet), header32, __ATOMIC_RELEASE); + + hsa_queue_store_write_index_relaxed(Queue, packet_index + 1); + hsa_signal_store_relaxed(Queue->doorbell_signal, packet_index); +} + //============================================================================= // Recompute the peercnt and the packed _peerAgents whenever a peer is added or deleted. @@ -1260,13 +1312,20 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream) } } +void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, const hipStream_t stream) +{ + std::string streamString = ToString(stream); + fprintf(stderr, KGRN "<grid_dim.x, lp->grid_dim.y, lp->grid_dim.z, lp->group_dim.x, lp->group_dim.y, lp->group_dim.z, + lp->dynamic_group_mem_bytes, streamString.c_str());\ +} // TODO - data-up to data-down: // Called just before a kernel is launched from hipLaunchKernel. // Allows runtime to track some information about the stream. hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp) { - HIP_INIT_API(stream, grid, block, lp); + HIP_INIT(); stream = ihipSyncAndResolveStream(stream); #if USE_GRID_LAUNCH_20 lp->grid_dim.x = grid.x; @@ -1439,7 +1498,7 @@ const char *ihipErrorString(hipError_t hip_error) void ihipSetTs(hipEvent_t e) { - ihipEvent_t *eh = e._handle; + ihipEvent_t *eh = e; if (eh->_state == hipEventStatusRecorded) { // already recorded, done: return; @@ -1509,7 +1568,7 @@ void ihipStream_t::setAsyncCopyAgents(unsigned kind, ihipCommand_t *commandType, } -void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind) +void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn) { ihipCtx_t *ctx = this->getCtx(); const ihipDevice_t *device = ctx->getDevice(); @@ -1528,7 +1587,7 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const bool dstInDeviceMem = dstPtrInfo._isInDeviceMem; // Resolve default to a specific Kind so we know which algorithm to use: - if (kind == hipMemcpyDefault) { + if (kind == hipMemcpyDefault && resolveOn) { kind = resolveMemcpyDirection(srcTracked, dstTracked, srcInDeviceMem, dstInDeviceMem); }; @@ -1699,14 +1758,12 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const // Sync copy that acquires lock: -void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind) +void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn) { LockedAccessor_StreamCrit_t crit (_criticalData); - copySync(crit, dst, src, sizeBytes, kind); + copySync(crit, dst, src, sizeBytes, kind, resolveOn); } - - void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind) { LockedAccessor_StreamCrit_t crit(_criticalData); diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index c0282dc372..a64ad94e81 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -446,12 +446,96 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind e = ex._code; } + return ihipLogStatus(e); +} +hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes) +{ + HIP_INIT_API(dst, src, sizeBytes); + + hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); + + hc::completion_future marker; + + hipError_t e = hipSuccess; + + try { + + stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyHostToDevice, false); + } + catch (ihipException ex) { + e = ex._code; + } return ihipLogStatus(e); } +hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes) +{ + HIP_INIT_API(dst, src, sizeBytes); + + hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); + + hc::completion_future marker; + + hipError_t e = hipSuccess; + + try { + + stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyDeviceToHost, false); + } + catch (ihipException ex) { + e = ex._code; + } + + return ihipLogStatus(e); +} + +hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes) +{ + HIP_INIT_API(dst, src, sizeBytes); + + hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); + + hc::completion_future marker; + + hipError_t e = hipSuccess; + + try { + + stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyDeviceToDevice, false); + } + catch (ihipException ex) { + e = ex._code; + } + + return ihipLogStatus(e); +} + +hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes) +{ + HIP_INIT_API(dst, src, sizeBytes); + + hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); + + hc::completion_future marker; + + hipError_t e = hipSuccess; + + try { + + stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyHostToHost, false); + } + catch (ihipException ex) { + e = ex._code; + } + + return ihipLogStatus(e); +} + + + /** * @result #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidMemcpyDirection, * @result #hipErrorInvalidValue : If dst==NULL or src==NULL, or other bad argument. @@ -786,7 +870,6 @@ hipError_t hipMemGetInfo (size_t *free, size_t *total) // TODO - replace with kernel-level for reporting free memory: size_t deviceMemSize, hostMemSize, userMemSize; hc::am_memtracker_sizeinfo(device->_acc, &deviceMemSize, &hostMemSize, &userMemSize); - printf ("deviceMemSize=%zu\n", deviceMemSize); *free = device->_props.totalGlobalMem - deviceMemSize; } diff --git a/src/hip_module.cpp b/src/hip_module.cpp index 1421eb0329..e43cc62829 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -20,137 +20,188 @@ THE SOFTWARE. #include "hip_runtime.h" #include "hsa/hsa.h" #include "hsa/hsa_ext_amd.h" +#include "hsa/amd_hsa_kernel_code.h" #include "hcc_detail/hip_hcc.h" #include "hcc_detail/trace_helper.h" #include +#include +#include +#include //TODO Use Pool APIs from HCC to get memory regions. namespace hipdrv{ -hsa_status_t findSystemRegions(hsa_region_t region, void *data){ - hsa_region_segment_t segment_id; - hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id); - if(segment_id != HSA_REGION_SEGMENT_GLOBAL){ - return HSA_STATUS_SUCCESS; - } + hsa_status_t findSystemRegions(hsa_region_t region, void *data){ + hsa_region_segment_t segment_id; + hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id); - hsa_region_global_flag_t flags; - hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags); + if(segment_id != HSA_REGION_SEGMENT_GLOBAL){ + return HSA_STATUS_SUCCESS; + } - hsa_region_t *reg = (hsa_region_t*)data; + hsa_region_global_flag_t flags; + hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags); - if(flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED){ - *reg = region; - } + hsa_region_t *reg = (hsa_region_t*)data; - return HSA_STATUS_SUCCESS; + if(flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED){ + *reg = region; + } + + return HSA_STATUS_SUCCESS; + } + +} // End namespace hipdrv + +uint64_t PrintSymbolSizes(const void *emi, const char *name){ + const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi; + if(NULL == ehdr || EV_CURRENT != ehdr->e_version){} + const Elf64_Shdr * shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff); + for(uint16_t i=0;ie_shnum;++i){ + if(shdr[i].sh_type == SHT_SYMTAB){ + const Elf64_Sym *syms = (const Elf64_Sym*)((char*)emi + shdr[i].sh_offset); + assert(syms); + uint64_t numSyms = shdr[i].sh_size/shdr[i].sh_entsize; + const char* strtab = (const char*)((char*)emi + shdr[shdr[i].sh_link].sh_offset); + assert(strtab); + for(uint64_t i=0;ie_shoff); - if(segment_id != HSA_REGION_SEGMENT_GLOBAL){ - return HSA_STATUS_SUCCESS; - } + uint64_t max_offset = ehdr->e_shoff; + uint64_t total_size = max_offset + ehdr->e_shentsize * ehdr->e_shnum; - hsa_region_global_flag_t flags; - hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags); - - hsa_region_t *reg = (hsa_region_t*)data; - - if(flags & HSA_REGION_GLOBAL_FLAG_KERNARG){ - *reg = region; - } - - return HSA_STATUS_SUCCESS; + for(uint16_t i=0;i < ehdr->e_shnum;++i){ + uint64_t cur_offset = static_cast(shdr[i].sh_offset); + if(max_offset < cur_offset){ + max_offset = cur_offset; + total_size = max_offset; + if(SHT_NOBITS != shdr[i].sh_type){ + total_size += static_cast(shdr[i].sh_size); + } + } + } + return total_size; } - -} - - - -hipError_t hipModuleLoad(hipModule *module, const char *fname){ +hipError_t hipModuleLoad(hipModule_t *module, const char *fname){ HIP_INIT_API(fname); hipError_t ret = hipSuccess; + *module = new ihipModule_t; + if(module == NULL){ - ret = hipErrorInvalidValue; + return hipErrorInvalidValue; } + auto ctx = ihipGetTlsDefaultCtx(); if(ctx == nullptr){ ret = hipErrorInvalidContext; + }else{ int deviceId = ctx->getDevice()->_deviceId; ihipDevice_t *currentDevice = ihipGetDevice(deviceId); std::ifstream in(fname, std::ios::binary | std::ios::ate); + if(!in){ return hipErrorFileNotFound; + }else{ + + *module = new ihipModule_t; size_t size = std::string::size_type(in.tellg()); void *p = NULL; hsa_agent_t agent = currentDevice->_hsaAgent; hsa_region_t sysRegion; hsa_status_t status = hsa_agent_iterate_regions(agent, hipdrv::findSystemRegions, &sysRegion); status = hsa_memory_allocate(sysRegion, size, (void**)&p); + if(status != HSA_STATUS_SUCCESS){ return hipErrorOutOfMemory; } + char *ptr = (char*)p; if(!ptr){ return hipErrorOutOfMemory; - std::cout<<"Error: failed to allocate memory for code object"<ptr = p; + (*module)->size = size; in.seekg(0, std::ios::beg); std::copy(std::istreambuf_iterator(in), std::istreambuf_iterator(), ptr); - status = hsa_code_object_deserialize(ptr, size, NULL, &obj); + + status = hsa_code_object_deserialize(ptr, size, NULL, &(*module)->object); + if(status != HSA_STATUS_SUCCESS){ return hipErrorSharedObjectInitFailed; } - *module = obj.handle; + + status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &(*module)->executable); + if(status != HSA_STATUS_SUCCESS){ + return hipErrorNotInitialized; + } } } + return ret; } -hipError_t hipModuleGetFunction(hipFunction *func, hipModule hmod, const char *name){ - HIP_INIT_API(name); +hipError_t hipModuleUnload(hipModule_t hmod){ + hipError_t ret = hipSuccess; + hsa_status_t status = hsa_executable_destroy(hmod->executable); + if(status != HSA_STATUS_SUCCESS){ret = hipErrorInvalidValue; } + status = hsa_code_object_destroy(hmod->object); + if(status != HSA_STATUS_SUCCESS){ret = hipErrorInvalidValue; } + delete hmod; + return ret; +} + +hipError_t ihipModuleGetFunction(hipFunction_t *func, hipModule_t hmod, const char *name){ auto ctx = ihipGetTlsDefaultCtx(); hipError_t ret = hipSuccess; - if(name == nullptr || hmod == 0){ + + if(name == nullptr){ return hipErrorInvalidValue; } + if(ctx == nullptr){ ret = hipErrorInvalidContext; + }else{ + *func = new ihipFunction_t; int deviceId = ctx->getDevice()->_deviceId; ihipDevice_t *currentDevice = ihipGetDevice(deviceId); hsa_agent_t gpuAgent = (hsa_agent_t)currentDevice->_hsaAgent; hsa_status_t status; - hsa_executable_symbol_t kernel_symbol; - hsa_executable_t executable; - status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &executable); + status = hsa_executable_load_code_object(hmod->executable, gpuAgent, hmod->object, NULL); if(status != HSA_STATUS_SUCCESS){ return hipErrorNotInitialized; } - hsa_code_object_t obj; - obj.handle = hmod; - status = hsa_executable_load_code_object(executable, gpuAgent, obj, NULL); - if(status != HSA_STATUS_SUCCESS){ - return hipErrorNotInitialized; - } - status = hsa_executable_freeze(executable, NULL); - status = hsa_executable_get_symbol(executable, NULL, name, gpuAgent, 0, &kernel_symbol); + + status = hsa_executable_freeze(hmod->executable, NULL); + status = hsa_executable_get_symbol(hmod->executable, NULL, name, gpuAgent, 0, &(*func)->kernel_symbol); if(status != HSA_STATUS_SUCCESS){ return hipErrorNotFound; } - status = hsa_executable_symbol_get_info(kernel_symbol, + + status = hsa_executable_symbol_get_info((*func)->kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, - func); + &(*func)->kernel); + if(status != HSA_STATUS_SUCCESS){ return hipErrorNotFound; } @@ -158,16 +209,24 @@ hipError_t hipModuleGetFunction(hipFunction *func, hipModule hmod, const char *n return ret; } -hipError_t hipLaunchModuleKernel(hipFunction f, +hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod, + const char *name){ + HIP_INIT_API(name); + return ihipModuleGetFunction(hfunc, hmod, name); +} + +hipError_t hipModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra){ - HIP_INIT_API(f); + HIP_INIT_API(f->kernel); auto ctx = ihipGetTlsDefaultCtx(); hipError_t ret = hipSuccess; + if(ctx == nullptr){ ret = hipErrorInvalidDevice; + }else{ int deviceId = ctx->getDevice()->_deviceId; ihipDevice_t *currentDevice = ihipGetDevice(deviceId); @@ -189,66 +248,100 @@ hipError_t hipLaunchModuleKernel(hipFunction f, /* Kernel argument preparation. */ - - hsa_region_t kernArg; - hsa_status_t status = hsa_agent_iterate_regions(gpuAgent, hipdrv::findKernArgRegions, &kernArg); - void *kern; - status = hsa_memory_allocate(kernArg, kernSize, &kern); - if(status != HSA_STATUS_SUCCESS){ - return hipErrorLaunchOutOfResources; - } - memcpy(kern, config[1], kernSize); - + hsa_status_t status; + grid_launch_parm lp; + hStream = ihipPreLaunchKernel(hStream, 0, 0, &lp); /* -Pre kernel launch - - stream = ihipSyncAndResolveStream(stream); - stream->lockopen_preKernelCommand(); - hc::accelerator_view av = &stream->_av; - hc::completion_future cf = new hc::completion_future; + Create signal */ - hStream = ihipSyncAndResolveStream(hStream); - hc::accelerator_view *av = &hStream->_av; - hsa_queue_t *Queue = (hsa_queue_t*)av->get_hsa_queue(); hsa_signal_t signal; status = hsa_signal_create(1, 0, NULL, &signal); /* -Creating the packets + Launch AQL packet +*/ + hStream->launchModuleKernel(signal, blockDimX, blockDimY, blockDimZ, + gridDimX, gridDimY, gridDimZ, sharedMemBytes, config[1], kernSize, f->kernel); + +/* + Wait for signal */ - const uint32_t queue_mask = Queue->size-1; - uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue); - hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]); - - dispatch_packet->completion_signal = signal; - dispatch_packet->workgroup_size_x = blockDimX; - dispatch_packet->workgroup_size_y = blockDimY; - dispatch_packet->workgroup_size_z = blockDimZ; - dispatch_packet->grid_size_x = blockDimX * gridDimX; - dispatch_packet->grid_size_y = blockDimY * gridDimY; - dispatch_packet->grid_size_z = blockDimZ * gridDimZ; - - dispatch_packet->group_segment_size = 0; - dispatch_packet->private_segment_size = sharedMemBytes; - dispatch_packet->kernarg_address = kern; - dispatch_packet->kernel_object = f; - uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); - - - uint16_t setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - uint32_t header32 = header | (setup << 16); - - __atomic_store_n((uint32_t*)(dispatch_packet), header32, __ATOMIC_RELEASE); - - hsa_queue_store_write_index_relaxed(Queue, packet_index+1); - hsa_signal_store_relaxed(Queue->doorbell_signal, packet_index); hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + + + ihipPostLaunchKernel(hStream, lp); + + } + + return ret; +} + + +hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, + hipModule_t hmod, const char* name){ + HIP_INIT_API(name); + hipError_t ret = hipSuccess; + if(dptr == NULL || bytes == NULL){ + return hipErrorInvalidValue; + } + if(name == NULL || hmod == NULL){ + return hipErrorNotInitialized; + } + else{ + hipFunction_t func; + ihipModuleGetFunction(&func, hmod, name); + *bytes = PrintSymbolSizes(hmod->ptr, name) + sizeof(amd_kernel_code_t); + *dptr = reinterpret_cast(func->kernel); + return ret; + } +} + +hipError_t hipModuleLoadData(hipModule_t *module, const void *image){ + HIP_INIT_API(image); + hipError_t ret = hipSuccess; + if(image == NULL || module == NULL){ + return hipErrorNotInitialized; + }else{ + auto ctx = ihipGetTlsDefaultCtx(); + *module = new ihipModule_t; + int deviceId = ctx->getDevice()->_deviceId; + ihipDevice_t *currentDevice = ihipGetDevice(deviceId); + + void *p; + uint64_t size = ElfSize(image); + hsa_agent_t agent = currentDevice->_hsaAgent; + hsa_region_t sysRegion; + hsa_status_t status = hsa_agent_iterate_regions(agent, hipdrv::findSystemRegions, &sysRegion); + status = hsa_memory_allocate(sysRegion, size, (void**)&p); + + if(status != HSA_STATUS_SUCCESS){ + return hipErrorOutOfMemory; + } + + char *ptr = (char*)p; + if(!ptr){ + return hipErrorOutOfMemory; + } + (*module)->ptr = p; + (*module)->size = size; + + memcpy(ptr, image, size); + + status = hsa_code_object_deserialize(ptr, size, NULL, &(*module)->object); + + if(status != HSA_STATUS_SUCCESS){ + return hipErrorSharedObjectInitFailed; + } + + status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &(*module)->executable); + if(status != HSA_STATUS_SUCCESS){ + return hipErrorNotInitialized; + } } return ret; } + + diff --git a/src/hip_peer.cpp b/src/hip_peer.cpp index fd51599815..8f75d99918 100644 --- a/src/hip_peer.cpp +++ b/src/hip_peer.cpp @@ -67,7 +67,7 @@ hipError_t hipDeviceCanAccessPeer (int* canAccessPeer, hipCtx_t thisCtx, hipCtx_ //--- // Disable visibility of this device into memory allocated on peer device. // Remove this device from peer device peerlist. -hipError_t hipDeviceDisablePeerAccess (hipCtx_t peerCtx) +hipError_t ihipDisablePeerAccess (hipCtx_t peerCtx) { HIP_INIT_API(peerCtx); @@ -109,7 +109,7 @@ hipError_t hipDeviceDisablePeerAccess (hipCtx_t peerCtx) //--- // Allow the current device to see all memory allocated on peerDevice. // This should add this device to the peer-device peer list. -hipError_t hipDeviceEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags) +hipError_t ihipEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags) { HIP_INIT_API(peerCtx, flags); @@ -175,7 +175,7 @@ hipError_t hipDeviceDisablePeerAccess (int peerDeviceId) { HIP_INIT_API(peerDeviceId); - return hipDeviceDisablePeerAccess(ihipGetPrimaryCtx(peerDeviceId)); + return ihipDisablePeerAccess(ihipGetPrimaryCtx(peerDeviceId)); } @@ -183,7 +183,7 @@ hipError_t hipDeviceEnablePeerAccess (int peerDeviceId, unsigned int flags) { HIP_INIT_API(peerDeviceId, flags); - return hipDeviceEnablePeerAccess(ihipGetPrimaryCtx(peerDeviceId), flags); + return ihipEnablePeerAccess(ihipGetPrimaryCtx(peerDeviceId), flags); } @@ -200,6 +200,16 @@ hipError_t hipMemcpyPeerAsync (void* dst, int dstDevice, const void* src, int return hipMemcpyPeerAsync(dst, ihipGetPrimaryCtx(dstDevice), src, ihipGetPrimaryCtx(srcDevice), sizeBytes, stream); } +hipError_t hipCtxEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags) +{ + HIP_INIT_API(peerCtx, flags); + return ihipEnablePeerAccess(peerCtx, flags); +} +hipError_t hipCtxDisablePeerAccess (hipCtx_t peerCtx) +{ + HIP_INIT_API(peerCtx); + return ihipDisablePeerAccess(peerCtx); +} diff --git a/tests/src/hipComplex.cpp b/tests/src/hipComplex.cpp new file mode 100644 index 0000000000..2e9c1884a1 --- /dev/null +++ b/tests/src/hipComplex.cpp @@ -0,0 +1,53 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include +#include +#include +#include + +#define LEN 64 +#define SIZE 64<<2 + +__global__ void getSqAbs(hipLaunchParm lp, float *A, float *B, float *C){ + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + C[tx] = hipCsqabsf(make_hipFloatComplex(A[tx], B[tx])); +} + +int main(){ + float *A, *Ad, *B, *Bd, *C, *Cd; + A = new float[LEN]; + B = new float[LEN]; + C = new float[LEN]; + for(uint32_t i=0;i +#include +#include + +#define LEN 1030 +#define SIZE LEN << 2 + +__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In, uint32_t *Vald) +{ + memcpy(Out, In, SIZE, Vald); +} + +__global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size) +{ + memset(ptr, val, size); +} + +int main() +{ + uint32_t *A, *Ad, *B, *Bd; + uint32_t *Val, *Vald; + A = new uint32_t[LEN]; + B = new uint32_t[LEN]; + Val = new uint32_t; + *Val = 0; + for(int i=0;i +#include +#include + +#define LEN 1024 +#define SIZE LEN<<2 + +int main(){ + int *A, *B, *C; + hipDeviceptr Ad, Bd; + A = new int[LEN]; + B = new int[LEN]; + C = new int[LEN]; + for(int i=0;i +#include +#include +#include +#include + +#define LEN 64 +#define SIZE LEN<<2 + +#define fileName "vcpy_isa.co" +#define kernel_name "hello_world" + +__global__ void Cpy(hipLaunchParm lp, float *Ad, float* Bd){ + int tx = hipThreadIdx_x; + Bd[tx] = Ad[tx]; +} + +int main(){ + float *A, *B, *Ad, *Bd; + A = new float[LEN]; + B = new float[LEN]; + + for(uint32_t i=0;iargBuffer(2); + memcpy(&argBuffer[0], &Ad, sizeof(void*)); + memcpy(&argBuffer[1], &Bd, sizeof(void*)); + + size_t size = argBuffer.size()*sizeof(void*); + + void *config[] = { + HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0], + HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, + HIP_LAUNCH_PARAM_END + }; + + hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, stream, NULL, (void**)&config); + + hipStreamDestroy(stream); + + hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost); + + for(uint32_t i=0;i +#include +#include + +#define fileName "vcpy_isa.co" + +int main(){ + hipModule_t module; + hipModuleLoad(&module, fileName); + hipModuleUnload(module); +} + diff --git a/tests/src/sampleModule.cpp b/tests/src/sampleModule.cpp new file mode 100644 index 0000000000..606b19717d --- /dev/null +++ b/tests/src/sampleModule.cpp @@ -0,0 +1,94 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include +#include +#include +#include + +#define LEN 64 +#define SIZE LEN<<2 + +#ifdef __HIP_PLATFORM_HCC__ +#define fileName "vcpy_isa.co" +#endif + +#ifdef __HIP_PLATFORM_NVCC__ +#define fileName "vcpy_isa.ptx" +#endif + +#define kernel_name "hello_world" + +int main(){ + float *A, *B; + hipDeviceptr_t Ad, Bd; + A = new float[LEN]; + B = new float[LEN]; + + for(uint32_t i=0;iargBuffer(2); + memcpy(&argBuffer[0], &Ad, sizeof(void*)); + memcpy(&argBuffer[1], &Bd, sizeof(void*)); + + size_t size = argBuffer.size()*sizeof(void*); + + void *config[] = { + HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0], + HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, + HIP_LAUNCH_PARAM_END + }; + + hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config); + + hipMemcpyDtoH(B, Bd, SIZE); + for(uint32_t i=0;i; + .reg .b32 %r<2>; + .reg .b64 %rd<8>; + + + ld.param.u64 %rd1, [hello_world_param_0]; + ld.param.u64 %rd2, [hello_world_param_1]; + cvta.to.global.u64 %rd3, %rd2; + cvta.to.global.u64 %rd4, %rd1; + mov.u32 %r1, %tid.x; + mul.wide.s32 %rd5, %r1, 4; + add.s64 %rd6, %rd4, %rd5; + ld.global.f32 %f1, [%rd6]; + add.s64 %rd7, %rd3, %rd5; + st.global.f32 [%rd7], %f1; + ret; +} + +