diff --git a/hipamd/.clang-format b/hipamd/.clang-format index 1793af2ba2..5572a72cdd 100644 --- a/hipamd/.clang-format +++ b/hipamd/.clang-format @@ -1,20 +1,10 @@ ---- Language: Cpp BasedOnStyle: Google AlignEscapedNewlinesLeft: false +AlignOperands: false ColumnLimit: 100 +AlwaysBreakTemplateDeclarations: false DerivePointerAlignment: false -IndentWrappedFunctionNames: false +IndentFunctionDeclarationAfterType: false MaxEmptyLinesToKeep: 2 SortIncludes: false -IndentWidth: 4 ---- -Language: ObjC -BasedOnStyle: Google -AlignEscapedNewlinesLeft: false -ColumnLimit: 100 -DerivePointerAlignment: false -IndentWrappedFunctionNames: false -MaxEmptyLinesToKeep: 2 -SortIncludes: false -IndentWidth: 4 diff --git a/hipamd/CMakeLists.txt b/hipamd/CMakeLists.txt index 565ce37519..055543a245 100755 --- a/hipamd/CMakeLists.txt +++ b/hipamd/CMakeLists.txt @@ -477,9 +477,6 @@ if(HIP_PLATFORM STREQUAL "hcc" OR HIP_PLATFORM STREQUAL "rocclr") install(FILES ${PROJECT_BINARY_DIR}/.hipInfo DESTINATION lib) endif() -# Install .hipInfo -install(FILES ${PROJECT_BINARY_DIR}/.hipInfo DESTINATION lib) - # Install .hipVersion install(FILES ${PROJECT_BINARY_DIR}/.hipVersion DESTINATION bin) diff --git a/hipamd/bin/hipcc b/hipamd/bin/hipcc index 9c386d34b6..4ae49b6c12 100755 --- a/hipamd/bin/hipcc +++ b/hipamd/bin/hipcc @@ -426,6 +426,7 @@ foreach $arg (@ARGV) # TODO: why are we removing it here? $trimarg =~ s/^\s+|\s+$//g; # Remive whitespace my $swallowArg = 0; + my $escapeArg = 1; if ($arg eq '-c' or $arg eq '--genco' or $arg eq '-E') { $compileOnly = 1; $needLDFLAGS = 0; @@ -578,6 +579,7 @@ foreach $arg (@ARGV) close $in; close $out; $arg = "$new_arg -Wl,\@$new_file"; + $escapeArg = 0; } elsif (($arg =~ m/\.a$/ || $arg =~ m/\.lo$/) && $HIP_PLATFORM eq 'hcc' and $HIP_COMPILER eq 'clang') { ## process static library for hip-clang @@ -624,6 +626,7 @@ foreach $arg (@ARGV) $new_arg .= " $tmpdir/$libBaseName"; } $arg = "$new_arg"; + $escapeArg = 0; if ($toolArgs =~ m/-Xlinker$/) { $toolArgs = substr $toolArgs, 0, -8; chomp $toolArgs; @@ -703,7 +706,7 @@ foreach $arg (@ARGV) # common characters such as alphanumerics. # Do the quoting here because sometimes the $arg is changed in the loop # Important to have all of '-Xlinker' in the set of unquoted characters. - if (not $isWindows) { # Windows needs different quoting, ignore for now + if (not $isWindows and $escapeArg) { # Windows needs different quoting, ignore for now $arg =~ s/[^-a-zA-Z0-9_=+,.\/]/\\$&/g; } $toolArgs .= " $arg" unless $swallowArg; @@ -798,9 +801,9 @@ if ($setStdLib eq 0 and $HIP_PLATFORM eq 'hcc' and $HIP_COMPILER eq 'hcc') if ($needHipHcc) { if ($linkType eq 0) { - substr($HIPLDFLAGS,0,0) = " $HIP_LIB_PATH/libhip_hcc_static.a " ; + substr($HIPLDFLAGS,0,0) = " $HIP_LIB_PATH/libamdhip64.a " ; } else { - substr($HIPLDFLAGS,0,0) = " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib $HIP_LIB_PATH/libhip_hcc.so "; + substr($HIPLDFLAGS,0,0) = " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib $HIP_LIB_PATH/libamdhip64.so "; } } @@ -839,7 +842,7 @@ if ($HIP_PLATFORM eq "hcc" and $HIP_COMPILER eq "clang") { if ($linkType eq 0) { $toolArgs .= " -L$HIP_LIB_PATH -lamdhip64 -L$ROCM_PATH/lib -lhsa-runtime64 -ldl -lnuma "; } else { - $toolArgs .= " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib -lhip_hcc -lnuma "; + $toolArgs .= " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib -lamdhip64 "; } # To support __fp16 and _Float16, explicitly link with compiler-rt $toolArgs .= " -L$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/lib/linux -lclang_rt.builtins-x86_64 " diff --git a/hipamd/bin/hipconfig b/hipamd/bin/hipconfig index b975a5b08a..8659669f5f 100755 --- a/hipamd/bin/hipconfig +++ b/hipamd/bin/hipconfig @@ -1,7 +1,7 @@ #!/usr/bin/perl -w $HIP_BASE_VERSION_MAJOR = "3"; -$HIP_BASE_VERSION_MINOR = "7"; +$HIP_BASE_VERSION_MINOR = "8"; # Need perl > 5.10 to use logic-defined or use 5.006; use v5.10.1; diff --git a/hipamd/cmake/FindHIP.cmake b/hipamd/cmake/FindHIP.cmake index 498b5e4570..b4a5cb239e 100644 --- a/hipamd/cmake/FindHIP.cmake +++ b/hipamd/cmake/FindHIP.cmake @@ -638,7 +638,11 @@ macro(HIP_ADD_EXECUTABLE hip_target) endif() set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} -o ") endif() - add_executable(${hip_target} ${_cmake_options} ${_generated_files} ${_sources}) + if ("${_sources}" STREQUAL "") + add_executable(${hip_target} ${_cmake_options} ${_generated_files} "") + else() + add_executable(${hip_target} ${_cmake_options} ${_generated_files} ${_sources}) + endif() set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE HIP) endmacro() @@ -652,7 +656,11 @@ macro(HIP_ADD_LIBRARY hip_target) if(_source_files) list(REMOVE_ITEM _sources ${_source_files}) endif() - add_library(${hip_target} ${_cmake_options} ${_generated_files} ${_sources}) + if ("${_sources}" STREQUAL "") + add_library(${hip_target} ${_cmake_options} ${_generated_files} "") + else() + add_library(${hip_target} ${_cmake_options} ${_generated_files} ${_sources}) + endif() set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE ${HIP_C_OR_CXX}) endmacro() diff --git a/hipamd/docs/markdown/hip_deprecated_api_list.md b/hipamd/docs/markdown/hip_deprecated_api_list.md index b9e895d402..783ea4a390 100644 --- a/hipamd/docs/markdown/hip_deprecated_api_list.md +++ b/hipamd/docs/markdown/hip_deprecated_api_list.md @@ -4,8 +4,6 @@ CUDA supports cuCtx API, the Driver API that defines "Context" and "Devices" as separate entities. Contexts contain a single device, and a device can theoretically have multiple contexts. HIP initially added limited support for these API to facilitate easy porting from existing driver codes. These API are marked as deprecated now since there are better alternate interface (such as hipSetDevice or the stream API) to achieve the required functions. -### hipCtxCreate -### hipCtxDestroy ### hipCtxPopCurrent ### hipCtxPushCurrent ### hipCtxSetCurrent @@ -21,7 +19,7 @@ CUDA supports cuCtx API, the Driver API that defines "Context" and "Devices" as ### hipCtxEnablePeerAccess ### hipCtxDisablePeerAccess -## HIP Management APIs +## HIP Memory Management APIs ### hipMallocHost Should use "hipHostMalloc" instead. diff --git a/hipamd/docs/markdown/hip_porting_guide.md b/hipamd/docs/markdown/hip_porting_guide.md index 9806d841a5..23e5058061 100644 --- a/hipamd/docs/markdown/hip_porting_guide.md +++ b/hipamd/docs/markdown/hip_porting_guide.md @@ -14,7 +14,7 @@ and provides practical suggestions on how to port CUDA code and work through com * [CUDA to HIP Math Library Equivalents](#library-equivalents) - [Distinguishing Compiler Modes](#distinguishing-compiler-modes) * [Identifying HIP Target Platform](#identifying-hip-target-platform) - * [Identifying the Compiler: hcc, hip-clang, or nvcc](#identifying-the-compiler-hcc-hip-clang-or-nvcc) + * [Identifying the Compiler: hip-clang, or nvcc](#identifying-the-compiler-hip-clang-or-nvcc) * [Identifying Current Compilation Pass: Host or Device](#identifying-current-compilation-pass-host-or-device) * [Compiler Defines: Summary](#compiler-defines-summary) - [Identifying Architecture Features](#identifying-architecture-features) @@ -41,12 +41,10 @@ and provides practical suggestions on how to port CUDA code and work through com - [threadfence_system](#threadfence_system) * [Textures and Cache Control](#textures-and-cache-control) - [More Tips](#more-tips) - * [HIPTRACE Mode](#hiptrace-mode) - * [Environment Variables](#environment-variables) + * [HIP Logging](#hip-logging) * [Debugging hipcc](#debugging-hipcc) * [What Does This Error Mean?](#what-does-this-error-mean) + [/usr/include/c++/v1/memory:5172:15: error: call to implicitly deleted default constructor of 'std::__1::bad_weak_ptr' throw bad_weak_ptr();](#usrincludecv1memory517215-error-call-to-implicitly-deleted-default-constructor-of-std__1bad_weak_ptr-throw-bad_weak_ptr) - * [HIP Environment Variables](#hip-environment-variables) * [Editor Highlighting](#editor-highlighting) @@ -163,17 +161,19 @@ Many projects use a mixture of an accelerator compiler (AMD or NVIDIA) and a sta -### Identifying the Compiler: hcc, hip-clang or nvcc -Often, it's useful to know whether the underlying compiler is hcc, HIP-Clang or nvcc. This knowledge can guard platform-specific code or aid in platform-specific performance tuning. - +### Identifying the Compiler: hip-clang or nvcc +Often, it's useful to know whether the underlying compiler is HIP-Clang or nvcc. This knowledge can guard platform-specific code or aid in platform-specific performance tuning. ``` -#ifdef __HCC__ -// Compiled with hcc +#ifdef __HIP_PLATFORM_HCC__ +// Compiled with HIP-Clang ``` + ``` -#ifdef __HIP__ +#if defined(__HCC__) || (defined(__clang__) && defined(__HIP__)) +#define __HIP_PLATFORM_HCC__ +#endif // Compiled with HIP-Clang ``` @@ -198,7 +198,7 @@ Compiler directly generates the host code (using the Clang x86 target) and passe nvcc makes two passes over the code: one for host code and one for device code. HIP-Clang will have multiple passes over the code: one for the host code, and one for each architecture on the device code. -`__HIP_DEVICE_COMPILE__` is set to a nonzero value when the compiler (hcc, HIP-Clang or nvcc) is compiling code for a device inside a `__global__` kernel or for a device function. `__HIP_DEVICE_COMPILE__` can replace #ifdef checks on the `__CUDA_ARCH__` define. +`__HIP_DEVICE_COMPILE__` is set to a nonzero value when the compiler (HIP-Clang or nvcc) is compiling code for a device inside a `__global__` kernel or for a device function. `__HIP_DEVICE_COMPILE__` can replace #ifdef checks on the `__CUDA_ARCH__` define. ``` // #ifdef __CUDA_ARCH__ @@ -209,24 +209,21 @@ Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 1 or undefined, an ### Compiler Defines: Summary -|Define | hcc | HIP-Clang | nvcc | Other (GCC, ICC, Clang, etc.) -|--- | --- | --- | --- |---| +|Define | HIP-Clang | nvcc | Other (GCC, ICC, Clang, etc.) +|--- | --- | --- |---| |HIP-related defines:| -|`__HIP_PLATFORM_HCC__`| Defined | Defined | Undefined | Defined if targeting hcc platform; undefined otherwise | -|`__HIP_PLATFORM_NVCC__`| Undefined | Undefined | Defined | Defined if targeting nvcc platform; undefined otherwise | -|`__HIP_DEVICE_COMPILE__` | 1 if compiling for device; undefined if compiling for host | 1 if compiling for device; undefined if compiling for host |1 if compiling for device; undefined if compiling for host | Undefined -|`__HIPCC__` | Defined | Defined | Defined | Undefined -|`__HIP_ARCH_*` | 0 or 1 depending on feature support (see below) |0 or 1 depending on feature support (see below) | 0 or 1 depending on feature support (see below) | 0 +|`__HIP_PLATFORM_HCC__`| Defined | Undefined | Defined if targeting AMD platform; undefined otherwise | +|`__HIP_PLATFORM_NVCC__`| Undefined | Defined | Defined if targeting nvcc platform; undefined otherwise | +|`__HIP_DEVICE_COMPILE__` | 1 if compiling for device; undefined if compiling for host |1 if compiling for device; undefined if compiling for host | Undefined +|`__HIPCC__` | Defined | Defined | Undefined +|`__HIP_ARCH_*` |0 or 1 depending on feature support (see below) | 0 or 1 depending on feature support (see below) | 0 |nvcc-related defines:| -|`__CUDACC__` | Undefined | Undefined | Defined if source code is compiled by nvcc; undefined otherwise | Undefined -|`__NVCC__` | Undefined | Undefined | Defined | Undefined -|`__CUDA_ARCH__` | Undefined | Undefined | Unsigned representing compute capability (e.g., "130") if in device code; 0 if in host code | Undefined -|hcc-related defines:| -|`__HCC__` | Defined | Undefined | Undefined | Undefined -|`__HCC_ACCELERATOR__` | Nonzero if in device code; otherwise undefined | Undefined | Undefined | Undefined +|`__CUDACC__` | Defined if source code is compiled by nvcc; undefined otherwise | Undefined +|`__NVCC__` | Undefined | Defined | Undefined +|`__CUDA_ARCH__` | Undefined | Unsigned representing compute capability (e.g., "130") if in device code; 0 if in host code | Undefined |hip-clang-related defines:| -|`__HIP__` | Undefined | Defined | Undefined | Undefined -|hcc/HIP-Clang common defines:| +|`__HIP__` | Defined | Undefined | Undefined +|HIP-Clang common defines:| |`__clang__` | Defined | Defined | Undefined | Defined if using Clang; otherwise undefined ## Identifying Architecture Features @@ -274,23 +271,23 @@ The table below shows the full set of architectural properties that HIP supports |`__HIP_ARCH_HAS_SHARED_INT32_ATOMICS__` | hasSharedInt32Atomics |32-bit integer atomics for shared memory |`__HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__` | hasSharedFloatAtomicExch |32-bit float atomic exchange for shared memory |`__HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__` | hasFloatAtomicAdd |32-bit float atomic add in global and shared memory -|64-bit atomics: | | -|`__HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__` | hasGlobalInt64Atomics |64-bit integer atomics for global memory +|64-bit atomics: | | +|`__HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__` | hasGlobalInt64Atomics |64-bit integer atomics for global memory |`__HIP_ARCH_HAS_SHARED_INT64_ATOMICS__` | hasSharedInt64Atomics |64-bit integer atomics for shared memory |Doubles: | | -|`__HIP_ARCH_HAS_DOUBLES__` | hasDoubles |Double-precision floating point -|Warp cross-lane operations: | | -|`__HIP_ARCH_HAS_WARP_VOTE__` | hasWarpVote |Warp vote instructions (any, all) -|`__HIP_ARCH_HAS_WARP_BALLOT__` | hasWarpBallot |Warp ballot instructions -|`__HIP_ARCH_HAS_WARP_SHUFFLE__` | hasWarpShuffle |Warp shuffle operations (shfl\_\*) +|`__HIP_ARCH_HAS_DOUBLES__` | hasDoubles |Double-precision floating point +|Warp cross-lane operations: | | +|`__HIP_ARCH_HAS_WARP_VOTE__` | hasWarpVote |Warp vote instructions (any, all) +|`__HIP_ARCH_HAS_WARP_BALLOT__` | hasWarpBallot |Warp ballot instructions +|`__HIP_ARCH_HAS_WARP_SHUFFLE__` | hasWarpShuffle |Warp shuffle operations (shfl\_\*) |`__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__` | hasFunnelShift |Funnel shift two input words into one |Sync: | | |`__HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__` | hasThreadFenceSystem |threadfence\_system -|`__HIP_ARCH_HAS_SYNC_THREAD_EXT__` | hasSyncThreadsExt |syncthreads\_count, syncthreads\_and, syncthreads\_or -|Miscellaneous: | | -|`__HIP_ARCH_HAS_SURFACE_FUNCS__` | hasSurfaceFuncs | -|`__HIP_ARCH_HAS_3DGRID__` | has3dGrid | Grids and groups are 3D -|`__HIP_ARCH_HAS_DYNAMIC_PARALLEL__` | hasDynamicParallelism | +|`__HIP_ARCH_HAS_SYNC_THREAD_EXT__` | hasSyncThreadsExt |syncthreads\_count, syncthreads\_and, syncthreads\_or +|Miscellaneous: | | +|`__HIP_ARCH_HAS_SURFACE_FUNCS__` | hasSurfaceFuncs | +|`__HIP_ARCH_HAS_3DGRID__` | has3dGrid | Grids and groups are 3D +|`__HIP_ARCH_HAS_DYNAMIC_PARALLEL__` | hasDynamicParallelism | ## Finding HIP @@ -498,19 +495,15 @@ int main() std::cout<<"Passed"< +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTexture(size_t* offset, struct texture& tex, const void* devPtr, const struct hipChannelFormatDesc& desc, size_t size = UINT_MAX) { return ihipBindTextureImpl(nullptr, dim, readMode, offset, devPtr, &desc, size, &tex); @@ -3863,6 +3892,7 @@ hipError_t hipBindTexture(size_t* offset, struct texture& tex, **/ #if !__HIP_ROCclr__ template +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTexture(size_t* offset, struct texture& tex, const void* devPtr, size_t size = UINT_MAX) { return ihipBindTextureImpl(nullptr, dim, readMode, offset, devPtr, &(tex.channelDesc), size, &tex); @@ -3871,6 +3901,7 @@ hipError_t hipBindTexture(size_t* offset, struct texture& tex, // C API #if !__HIP_ROCclr__ +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTexture2D(size_t* offset, textureReference* tex, const void* devPtr, const hipChannelFormatDesc* desc, size_t width, size_t height, size_t pitch); @@ -3884,6 +3915,7 @@ hipError_t ihipBindTexture2DImpl(int dim, enum hipTextureReadMode readMode, size #if !__HIP_ROCclr__ template +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTexture2D(size_t* offset, struct texture& tex, const void* devPtr, size_t width, size_t height, size_t pitch) { return ihipBindTexture2DImpl(dim, readMode, offset, devPtr, &(tex.channelDesc), width, height, @@ -3893,6 +3925,7 @@ hipError_t hipBindTexture2D(size_t* offset, struct texture& te #if !__HIP_ROCclr__ template +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTexture2D(size_t* offset, struct texture& tex, const void* devPtr, const struct hipChannelFormatDesc& desc, size_t width, size_t height, size_t pitch) { @@ -3902,6 +3935,7 @@ hipError_t hipBindTexture2D(size_t* offset, struct texture& te // C API #if !__HIP_ROCclr__ +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTextureToArray(textureReference* tex, hipArray_const_t array, const hipChannelFormatDesc* desc); #endif @@ -3915,6 +3949,7 @@ hipError_t ihipBindTextureToArrayImpl(TlsData *tls, int dim, enum hipTextureRead #if !__HIP_ROCclr__ template +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTextureToArray(struct texture& tex, hipArray_const_t array) { return ihipBindTextureToArrayImpl(nullptr, dim, readMode, array, tex.channelDesc, &tex); } @@ -3922,6 +3957,7 @@ hipError_t hipBindTextureToArray(struct texture& tex, hipArray #if !__HIP_ROCclr__ template +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTextureToArray(struct texture& tex, hipArray_const_t array, const struct hipChannelFormatDesc& desc) { return ihipBindTextureToArrayImpl(nullptr, dim, readMode, array, desc, &tex); @@ -3930,6 +3966,7 @@ hipError_t hipBindTextureToArray(struct texture& tex, hipArray #if !__HIP_ROCclr__ template +DEPRECATED(DEPRECATED_MSG) inline static hipError_t hipBindTextureToArray(struct texture *tex, hipArray_const_t array, const struct hipChannelFormatDesc* desc) { @@ -3999,6 +4036,7 @@ inline hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchPara * @return #hipSuccess **/ #if !__HIP_ROCclr__ +DEPRECATED(DEPRECATED_MSG) hipError_t hipUnbindTexture(const textureReference* tex); #endif @@ -4008,6 +4046,7 @@ extern hipError_t ihipUnbindTextureImpl(const hipTextureObject_t& textureObject) #if !__HIP_ROCclr__ template +DEPRECATED(DEPRECATED_MSG) hipError_t hipUnbindTexture(struct texture& tex) { return ihipUnbindTextureImpl(tex.textureObject); } @@ -4015,7 +4054,10 @@ hipError_t hipUnbindTexture(struct texture& tex) { #if !__HIP_ROCclr__ hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array); + +DEPRECATED(DEPRECATED_MSG) hipError_t hipGetTextureAlignmentOffset(size_t* offset, const textureReference* texref); + hipError_t hipGetTextureReference(const textureReference** texref, const void* symbol); hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject, const hipResourceDesc* pResDesc, @@ -4058,28 +4100,23 @@ hipError_t hipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject, const hipReso hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject); #if __HIP_ROCclr__ -template -static inline hipError_t hipBindTexture( - size_t *offset, - const struct texture &tex, - const void *devPtr, - size_t size = UINT_MAX) -{ +template +DEPRECATED(DEPRECATED_MSG) +static inline hipError_t hipBindTexture(size_t* offset, const struct texture& tex, + const void* devPtr, size_t size = UINT_MAX) { return hipBindTexture(offset, &tex, devPtr, &tex.channelDesc, size); } -template -static inline hipError_t hipBindTexture( - size_t *offset, - const struct texture &tex, - const void *devPtr, - const struct hipChannelFormatDesc &desc, - size_t size = UINT_MAX) -{ +template +DEPRECATED(DEPRECATED_MSG) +static inline hipError_t + hipBindTexture(size_t* offset, const struct texture& tex, const void* devPtr, + const struct hipChannelFormatDesc& desc, size_t size = UINT_MAX) { return hipBindTexture(offset, &tex, devPtr, &desc, size); } template +DEPRECATED(DEPRECATED_MSG) static inline hipError_t hipBindTexture2D( size_t *offset, const struct texture &tex, @@ -4092,6 +4129,7 @@ static inline hipError_t hipBindTexture2D( } template +DEPRECATED(DEPRECATED_MSG) static inline hipError_t hipBindTexture2D( size_t *offset, const struct texture &tex, @@ -4105,6 +4143,7 @@ static inline hipError_t hipBindTexture2D( } template +DEPRECATED(DEPRECATED_MSG) static inline hipError_t hipBindTextureToArray( const struct texture &tex, hipArray_const_t array) @@ -4115,6 +4154,7 @@ static inline hipError_t hipBindTextureToArray( } template +DEPRECATED(DEPRECATED_MSG) static inline hipError_t hipBindTextureToArray( const struct texture &tex, hipArray_const_t array, @@ -4148,6 +4188,7 @@ static inline hipError_t hipBindTextureToMipmappedArray( } template +DEPRECATED(DEPRECATED_MSG) static inline hipError_t hipUnbindTexture( const struct texture &tex) { diff --git a/hipamd/include/hip/nvcc_detail/hip_runtime_api.h b/hipamd/include/hip/nvcc_detail/hip_runtime_api.h old mode 100644 new mode 100755 index fe72f33d65..f9a2992cd1 --- a/hipamd/include/hip/nvcc_detail/hip_runtime_api.h +++ b/hipamd/include/hip/nvcc_detail/hip_runtime_api.h @@ -37,6 +37,18 @@ extern "C" { #define __dparm(x) #endif +// Add Deprecated Support for CUDA Mapped HIP APIs +#if defined(__DOXYGEN_ONLY__) || defined(HIP_ENABLE_DEPRECATED) +#define __HIP_DEPRECATED +#elif defined(_MSC_VER) +#define __HIP_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __HIP_DEPRECATED __attribute__((deprecated)) +#else +#define __HIP_DEPRECATED +#endif + + // TODO -move to include/hip_runtime_api.h as a common implementation. /** * Memory copy types @@ -179,6 +191,7 @@ typedef enum cudaSharedMemConfig hipSharedMemConfig; typedef CUfunc_cache hipFuncCache; typedef CUjit_option hipJitOption; typedef CUdevice hipDevice_t; +typedef enum cudaDeviceP2PAttr hipDeviceP2PAttr; typedef CUmodule hipModule_t; typedef CUfunction hipFunction_t; typedef CUdeviceptr hipDeviceptr_t; @@ -962,14 +975,16 @@ inline static hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_ height, hipMemcpyKindToCudaMemcpyKind(kind))); } -inline static hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, - const void* src, size_t count, hipMemcpyKind kind) { +__HIP_DEPRECATED inline static hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, + size_t hOffset, const void* src, + size_t count, hipMemcpyKind kind) { return hipCUDAErrorTohipError( cudaMemcpyToArray(dst, wOffset, hOffset, src, count, hipMemcpyKindToCudaMemcpyKind(kind))); } -inline static hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset, - size_t hOffset, size_t count, hipMemcpyKind kind) { +__HIP_DEPRECATED inline static hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, + size_t wOffset, size_t hOffset, + size_t count, hipMemcpyKind kind) { return hipCUDAErrorTohipError(cudaMemcpyFromArray(dst, srcArray, wOffset, hOffset, count, hipMemcpyKindToCudaMemcpyKind(kind))); } @@ -1352,7 +1367,12 @@ inline static hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attribut struct cudaPointerAttributes cPA; hipError_t err = hipCUDAErrorTohipError(cudaPointerGetAttributes(&cPA, ptr)); if (err == hipSuccess) { - switch (cPA.memoryType) { +#if (CUDART_VERSION >= 11000) + auto memType = cPA.type; +#else + unsigned memType = cPA.memoryType; // No auto because cuda 10.2 doesnt force c++11 +#endif + switch (memType) { case cudaMemoryTypeDevice: attributes->memoryType = hipMemoryTypeDevice; break; @@ -1606,6 +1626,11 @@ inline static hipError_t hipDeviceGetName(char* name, int len, hipDevice_t devic return hipCUResultTohipError(cuDeviceGetName(name, len, device)); } +inline static hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr, + int srcDevice, int dstDevice) { + return hipCUDAErrorTohipError(cudaDeviceGetP2PAttribute(value, attr, srcDevice, dstDevice)); +} + inline static hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, hipDevice_t device) { return hipCUDAErrorTohipError(cudaDeviceGetPCIBusId(pciBusId, len, device)); } @@ -1689,14 +1714,17 @@ inline static hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t return hipCUDAErrorTohipError(cudaFuncSetCacheConfig(func, cacheConfig)); } -inline static hipError_t hipBindTexture(size_t* offset, struct textureReference* tex, const void* devPtr, - const hipChannelFormatDesc* desc, size_t size __dparm(UINT_MAX)){ +__HIP_DEPRECATED inline static hipError_t hipBindTexture(size_t* offset, + struct textureReference* tex, + const void* devPtr, + const hipChannelFormatDesc* desc, + size_t size __dparm(UINT_MAX)) { return hipCUDAErrorTohipError(cudaBindTexture(offset, tex, devPtr, desc, size)); } -inline static hipError_t hipBindTexture2D(size_t* offset, struct textureReference* tex, const void* devPtr, - const hipChannelFormatDesc* desc, size_t width, size_t height, - size_t pitch) { +__HIP_DEPRECATED inline static hipError_t hipBindTexture2D( + size_t* offset, struct textureReference* tex, const void* devPtr, + const hipChannelFormatDesc* desc, size_t width, size_t height, size_t pitch) { return hipCUDAErrorTohipError(cudaBindTexture2D(offset, tex, devPtr, desc, width, height, pitch)); } @@ -1731,8 +1759,8 @@ inline static hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDe return hipCUDAErrorTohipError(cudaGetTextureObjectResourceDesc( pResDesc, textureObject)); } -inline static hipError_t hipGetTextureAlignmentOffset(size_t* offset, const struct textureReference* texref) -{ +__HIP_DEPRECATED inline static hipError_t hipGetTextureAlignmentOffset( + size_t* offset, const struct textureReference* texref) { return hipCUDAErrorTohipError(cudaGetTextureAlignmentOffset(offset,texref)); } @@ -1805,32 +1833,32 @@ inline static hipError_t hipBindTexture(size_t* offset, struct texture -inline static hipError_t hipUnbindTexture(struct texture* tex) { +__HIP_DEPRECATED inline static hipError_t hipUnbindTexture(struct texture* tex) { return hipCUDAErrorTohipError(cudaUnbindTexture(tex)); } template -inline static hipError_t hipUnbindTexture(struct texture &tex) { +__HIP_DEPRECATED inline static hipError_t hipUnbindTexture(struct texture& tex) { return hipCUDAErrorTohipError(cudaUnbindTexture(tex)); } template -inline static hipError_t hipBindTextureToArray(struct texture& tex, - hipArray_const_t array, - const hipChannelFormatDesc& desc) { +__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray( + struct texture& tex, hipArray_const_t array, + const hipChannelFormatDesc& desc) { return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array, desc)); } template -inline static hipError_t hipBindTextureToArray(struct texture *tex, - hipArray_const_t array, - const hipChannelFormatDesc* desc) { +__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray( + struct texture* tex, hipArray_const_t array, + const hipChannelFormatDesc* desc) { return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array, desc)); } template -inline static hipError_t hipBindTextureToArray(struct texture& tex, - hipArray_const_t array) { +__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray( + struct texture& tex, hipArray_const_t array) { return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array)); } diff --git a/hipamd/lpl_ca/pstreams/pstream.h b/hipamd/lpl_ca/pstreams/pstream.h index 363df0cf45..5b3afd7d35 100644 --- a/hipamd/lpl_ca/pstreams/pstream.h +++ b/hipamd/lpl_ca/pstreams/pstream.h @@ -70,12 +70,12 @@ struct pstreams { /// Type used for file descriptors. typedef int fd_type; - static const pmode pstdin = std::ios_base::out; ///< Write to stdin - static const pmode pstdout = std::ios_base::in; ///< Read from stdout - static const pmode pstderr = std::ios_base::app; ///< Read from stderr + static constexpr pmode pstdin = std::ios_base::out; ///< Write to stdin + static constexpr pmode pstdout = std::ios_base::in; ///< Read from stdout + static constexpr pmode pstderr = std::ios_base::app; ///< Read from stderr /// Create a new process group for the child process. - static const pmode newpg = std::ios_base::trunc; + static constexpr pmode newpg = std::ios_base::trunc; protected: enum { bufsz = 32 }; ///< Size of pstreambuf buffers. diff --git a/hipamd/packaging/hip-rocclr.txt b/hipamd/packaging/hip-rocclr.txt index 9cf89f042b..6f5c16bb96 100644 --- a/hipamd/packaging/hip-rocclr.txt +++ b/hipamd/packaging/hip-rocclr.txt @@ -5,8 +5,6 @@ if(@BUILD_SHARED_LIBS@) install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64.so DESTINATION lib) install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64.so.@HIP_LIB_VERSION_MAJOR@ DESTINATION lib) install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64.so.@HIP_LIB_VERSION_STRING@ DESTINATION lib) - install(FILES @PROJECT_BINARY_DIR@/lib/libhip_hcc.so DESTINATION lib) - install(FILES @PROJECT_BINARY_DIR@/lib/libhiprtc.so DESTINATION lib) else() install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64.a DESTINATION lib) endif() @@ -45,7 +43,7 @@ set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb) set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm") -set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocm-utils, hip-base (= ${CPACK_PACKAGE_VERSION}), comgr (>= 1.1), llvm-amdgpu") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocminfo, hip-base (= ${CPACK_PACKAGE_VERSION}), comgr (>= 1.1), llvm-amdgpu") set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc (= ${CPACK_PACKAGE_VERSION})") set(CPACK_BINARY_RPM "ON") @@ -55,7 +53,7 @@ set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION}) -set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocm-utils, hip-base = ${HIP_BASE_VERSION}, comgr >= 1.1, llvm-amdgpu") +set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocminfo, hip-base = ${HIP_BASE_VERSION}, comgr >= 1.1, llvm-amdgpu") set(CPACK_RPM_PACKAGE_PROVIDES "hip-hcc = ${HIP_BASE_VERSION}") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") diff --git a/hipamd/rocclr/CMakeLists.txt b/hipamd/rocclr/CMakeLists.txt index 8cb79a5496..e6c4225984 100755 --- a/hipamd/rocclr/CMakeLists.txt +++ b/hipamd/rocclr/CMakeLists.txt @@ -66,6 +66,14 @@ if(NOT DEFINED ROCclr_DIR OR NOT DEFINED LIBOCL_STATIC_DIR OR NOT DEFINED LIBROC # message(FATAL_ERROR "define ROCclr_DIR, LIBOCL_STATIC_DIR\n") endif() + +#APPEND default path for CMAKE_PREFIX_PATH +#User provided will be searched first since defualt path is at end. +#Custom install path can be provided at compile time as cmake parameter(-DCMAKE_PREFIX_PATH="") +#/opt/rocm:default:For amd_comgr,hsa-runtime64 +#/opt/rocm/llvm/:default:For llvm/clang pulled in as dependency from hsa/comgr +list( APPEND CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} "/opt/rocm" "/opt/rocm/llvm") + list ( APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules" ) set(CMAKE_MODULE_PATH${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") @@ -194,6 +202,7 @@ else() endif() +set_target_properties(amdhip64 PROPERTIES LINK_FLAGS_RELEASE -s) set_target_properties(amdhip64 PROPERTIES PUBLIC_HEADER ${PROF_API_STR}) add_library(host INTERFACE) target_link_libraries(host INTERFACE hip::amdhip64) @@ -206,20 +215,12 @@ target_link_libraries(device INTERFACE host) # FIXME: Linux convention is to create static library with same base # filename. - if(${BUILD_SHARED_LIBS}) - target_link_libraries(amdhip64 PRIVATE amdrocclr_static Threads::Threads dl numa hsa-runtime64::hsa-runtime64) + target_link_libraries(amdhip64 PRIVATE amdrocclr_static Threads::Threads dl hsa-runtime64::hsa-runtime64) INSTALL(PROGRAMS $ DESTINATION lib COMPONENT MAIN) - INSTALL(CODE "execute_process( COMMAND ${CMAKE_COMMAND} -E create_symlink libamdhip64.so lib/libhip_hcc.so )" DESTINATION lib COMPONENT MAIN) - - INSTALL(CODE "execute_process( COMMAND ${CMAKE_COMMAND} -E create_symlink libamdhip64.so lib/libhiprtc.so )" DESTINATION lib COMPONENT MAIN) - INSTALL(FILES ${CMAKE_BINARY_DIR}/lib/libhip_hcc.so DESTINATION lib COMPONENT MAIN) - - INSTALL(FILES ${CMAKE_BINARY_DIR}/lib/libhiprtc.so DESTINATION lib COMPONENT MAIN) - else() - target_link_libraries(amdhip64 PRIVATE Threads::Threads dl numa hsa-runtime64::hsa-runtime64 amd_comgr) + target_link_libraries(amdhip64 PRIVATE Threads::Threads dl hsa-runtime64::hsa-runtime64 amd_comgr) # combine objects of vid and hip into amdhip64_static add_custom_target( amdhip64_static_combiner diff --git a/hipamd/rocclr/hip_device.cpp b/hipamd/rocclr/hip_device.cpp index 2d8900cded..b116819846 100644 --- a/hipamd/rocclr/hip_device.cpp +++ b/hipamd/rocclr/hip_device.cpp @@ -92,8 +92,8 @@ hipError_t hipDeviceComputeCapability(int *major, int *minor, hipDevice_t device auto* deviceHandle = g_devices[device]->devices()[0]; const auto& info = deviceHandle->info(); - *major = info.gfxipVersion_ / 100; - *minor = info.gfxipVersion_ % 100; + *major = info.gfxipMajor_; + *minor = info.gfxipMinor_; HIP_RETURN(hipSuccess); } @@ -175,10 +175,10 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device ) deviceProps.maxGridSize[2] = INT32_MAX; deviceProps.clockRate = info.maxEngineClockFrequency_ * 1000; deviceProps.memoryClockRate = info.maxMemoryClockFrequency_ * 1000; - deviceProps.memoryBusWidth = info.globalMemChannels_ * 32; + deviceProps.memoryBusWidth = info.globalMemChannels_; deviceProps.totalConstMem = info.maxConstantBufferSize_; - deviceProps.major = info.gfxipVersion_ / 100; - deviceProps.minor = info.gfxipVersion_ % 100; + deviceProps.major = info.gfxipMajor_; + deviceProps.minor = info.gfxipMinor_; deviceProps.multiProcessorCount = info.maxComputeUnits_; deviceProps.l2CacheSize = info.l2CacheSize_; deviceProps.maxThreadsPerMultiProcessor = info.maxThreadsPerCU_; @@ -208,7 +208,7 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device ) deviceProps.maxSharedMemoryPerMultiProcessor = info.localMemSizePerCU_; //deviceProps.isMultiGpuBoard = info.; deviceProps.canMapHostMemory = 1; - deviceProps.gcnArch = info.gfxipVersion_; + deviceProps.gcnArch = info.gfxipMajor_ * 100 + info.gfxipMinor_ * 10 + info.gfxipStepping_; sprintf(deviceProps.gcnArchName, "gfx%d%d%x", info.gfxipMajor_, info.gfxipMinor_, info.gfxipStepping_); deviceProps.cooperativeLaunch = info.cooperativeGroups_; deviceProps.cooperativeMultiDeviceLaunch = info.cooperativeMultiDeviceGroups_; diff --git a/hipamd/rocclr/hip_device_runtime.cpp b/hipamd/rocclr/hip_device_runtime.cpp old mode 100644 new mode 100755 index be979dab9e..470b088f02 --- a/hipamd/rocclr/hip_device_runtime.cpp +++ b/hipamd/rocclr/hip_device_runtime.cpp @@ -367,13 +367,6 @@ hipError_t hipDeviceGetLimit ( size_t* pValue, hipLimit_t limit ) { } } -/** -hipError_t hipDeviceGetP2PAttribute ( int* value, hipDeviceP2PAttr attr, int srcDevice, int dstDevice ) { - assert(0); - HIP_RETURN(hipSuccess); -} -**/ - hipError_t hipDeviceGetPCIBusId ( char* pciBusId, int len, int device ) { HIP_INIT_API(hipDeviceGetPCIBusId, (void*)pciBusId, len, device); diff --git a/hipamd/rocclr/hip_event.cpp b/hipamd/rocclr/hip_event.cpp index a9ea30e15c..858309d638 100644 --- a/hipamd/rocclr/hip_event.cpp +++ b/hipamd/rocclr/hip_event.cpp @@ -140,6 +140,19 @@ hipError_t Event::streamWait(amd::HostQueue* hostQueue, uint flags) { void Event::addMarker(amd::HostQueue* queue, amd::Command* command, bool record) { amd::ScopedLock lock(lock_); + if (queue->properties().test(CL_QUEUE_PROFILING_ENABLE)) { + if (command == nullptr) { + command = queue->getLastQueuedCommand(true); + if (command == nullptr) { + command = new amd::Marker(*queue, kMarkerDisableFlush); + command->enqueue(); + } + } + } else if (command == nullptr) { + command = new hip::ProfileMarker(*queue, false); + command->enqueue(); + } + if (event_ == &command->event()) return; if (event_ != nullptr) { @@ -239,16 +252,9 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) { } hip::Event* e = reinterpret_cast(event); - amd::ScopedLock lock(e->lock()); - amd::HostQueue* queue = hip::getQueue(stream); - amd::Command* command = queue->getLastQueuedCommand(true); - if (command == nullptr) { - command = new amd::Marker(*queue, kMarkerDisableFlush); - command->enqueue(); - } - e->addMarker(queue, command, true); + e->addMarker(queue, nullptr, true); HIP_RETURN(hipSuccess); } diff --git a/hipamd/rocclr/hip_event.hpp b/hipamd/rocclr/hip_event.hpp index dbd43b1a5c..dccd4e884a 100644 --- a/hipamd/rocclr/hip_event.hpp +++ b/hipamd/rocclr/hip_event.hpp @@ -26,12 +26,13 @@ namespace hip { -class TimerMarker: public amd::Marker { +class ProfileMarker: public amd::Marker { public: - TimerMarker(amd::HostQueue& queue) : amd::Marker(queue, false) { + ProfileMarker(amd::HostQueue& queue, bool disableFlush) + : amd::Marker(queue, disableFlush) { profilingInfo_.enabled_ = true; profilingInfo_.callback_ = nullptr; - profilingInfo_.start_ = profilingInfo_.end_ = 0; + profilingInfo_.clear(); } }; diff --git a/hipamd/rocclr/hip_global.cpp b/hipamd/rocclr/hip_global.cpp index 2a395fee96..680d16b03f 100755 --- a/hipamd/rocclr/hip_global.cpp +++ b/hipamd/rocclr/hip_global.cpp @@ -23,16 +23,18 @@ DeviceVar::DeviceVar(std::string name, hipModule_t hmod) : shadowVptr(nullptr), guarantee(false); } - if (amd_mem_obj_ == nullptr || device_ptr_ == nullptr) { - DevLogPrintfError("Cannot get memory for creating device Var: %s", name.c_str()); - guarantee(false); + // Handle size 0 symbols + if (size_ != 0) { + if (amd_mem_obj_ == nullptr || device_ptr_ == nullptr) { + DevLogPrintfError("Cannot get memory for creating device Var: %s", name.c_str()); + guarantee(false); + } + amd::MemObjMap::AddMemObj(device_ptr_, amd_mem_obj_); } - - amd::MemObjMap::AddMemObj(device_ptr_, amd_mem_obj_); } DeviceVar::~DeviceVar() { - if (device_ptr_ != nullptr) { + if (amd_mem_obj_ != nullptr) { amd::MemObjMap::RemoveMemObj(device_ptr_); amd_mem_obj_->release(); } diff --git a/hipamd/rocclr/hip_hcc.def.in b/hipamd/rocclr/hip_hcc.def.in index da43f5f51d..253352f2cb 100755 --- a/hipamd/rocclr/hip_hcc.def.in +++ b/hipamd/rocclr/hip_hcc.def.in @@ -29,6 +29,7 @@ hipDeviceGetLimit hipDeviceGetName hipDeviceGetPCIBusId hipDeviceGetSharedMemConfig +hipDeviceGetP2PAttribute hipDevicePrimaryCtxGetState hipDevicePrimaryCtxRelease hipDevicePrimaryCtxReset @@ -175,8 +176,6 @@ __hipRegisterVar __hipRegisterSurface __hipRegisterTexture __hipUnregisterFatBinary -__gnu_h2f_ieee -__gnu_f2h_ieee hipConfigureCall hipSetupArgument hipLaunchByPtr diff --git a/hipamd/rocclr/hip_hcc.map.in b/hipamd/rocclr/hip_hcc.map.in index e66d4be92d..be83e6d134 100755 --- a/hipamd/rocclr/hip_hcc.map.in +++ b/hipamd/rocclr/hip_hcc.map.in @@ -30,6 +30,7 @@ global: hipDeviceGetName; hipDeviceGetPCIBusId; hipDeviceGetSharedMemConfig; + hipDeviceGetP2PAttribute; hipDevicePrimaryCtxGetState; hipDevicePrimaryCtxRelease; hipDevicePrimaryCtxReset; @@ -175,8 +176,6 @@ global: __hipRegisterSurface; __hipRegisterTexture; __hipUnregisterFatBinary; - __gnu_h2f_ieee; - __gnu_f2h_ieee; hipConfigureCall; hipSetupArgument; hipLaunchByPtr; diff --git a/hipamd/rocclr/hip_hmm.cpp b/hipamd/rocclr/hip_hmm.cpp index ab787e195e..82d16b3562 100644 --- a/hipamd/rocclr/hip_hmm.cpp +++ b/hipamd/rocclr/hip_hmm.cpp @@ -59,7 +59,7 @@ static_assert(static_cast(hipMemRangeAttributeLastPrefetchLocation) == hipError_t hipMallocManaged(void** dev_ptr, size_t size, unsigned int flags) { HIP_INIT_API(hipMallocManaged, dev_ptr, size, flags); - if ((dev_ptr == nullptr) || (flags != hipMemAttachGlobal)) { + if ((dev_ptr == nullptr) || (size == 0) || (flags != hipMemAttachGlobal)) { HIP_RETURN(hipErrorInvalidValue); } @@ -71,7 +71,7 @@ hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device, hipStream_t stream) { HIP_INIT_API(hipMemPrefetchAsync, dev_ptr, count, device, stream); - if ((dev_ptr == nullptr) || (count == 0) || (stream == nullptr)) { + if ((dev_ptr == nullptr) || (count == 0)) { HIP_RETURN(hipErrorInvalidValue); } amd::HostQueue* queue = nullptr; @@ -213,4 +213,4 @@ static hipError_t ihipMallocManaged(void** ptr, size_t size) { ClPrint(amd::LOG_INFO, amd::LOG_API, "%-5d: [%zx] ihipMallocManaged ptr=0x%zx", getpid(), std::this_thread::get_id(), *ptr); return hipSuccess; -} \ No newline at end of file +} diff --git a/hipamd/rocclr/hip_internal.hpp b/hipamd/rocclr/hip_internal.hpp index 6dbf77baed..a950961ea7 100755 --- a/hipamd/rocclr/hip_internal.hpp +++ b/hipamd/rocclr/hip_internal.hpp @@ -37,6 +37,15 @@ #include #endif +#define KNRM "\x1B[0m" +#define KRED "\x1B[31m" +#define KGRN "\x1B[32m" +#define KYEL "\x1B[33m" +#define KBLU "\x1B[34m" +#define KMAG "\x1B[35m" +#define KCYN "\x1B[36m" +#define KWHT "\x1B[37m" + /*! IHIP IPC MEMORY Structure */ #define IHIP_IPC_MEM_HANDLE_SIZE 32 #define IHIP_IPC_MEM_RESERVED_SIZE LP64_SWITCH(28,24) @@ -58,8 +67,8 @@ typedef struct ihipIpcMemHandle_st { } #define HIP_API_PRINT(...) \ - ClPrint(amd::LOG_INFO, amd::LOG_API, "%-5d: [%zx] %s ( %s )", getpid(), std::this_thread::get_id(), \ - __func__, ToString( __VA_ARGS__ ).c_str()); + uint64_t startTimeUs=0 ; HIPPrintDuration(amd::LOG_INFO, amd::LOG_API, &startTimeUs, "%-5d: [%zx] %s%s ( %s )%s", getpid(), std::this_thread::get_id(), KGRN, \ + __func__, ToString( __VA_ARGS__ ).c_str(),KNRM); #define HIP_ERROR_PRINT(err, ...) \ ClPrint(amd::LOG_INFO, amd::LOG_API, "%-5d: [%zx] %s: Returned %s : %s", getpid(), std::this_thread::get_id(), \ @@ -75,6 +84,12 @@ typedef struct ihipIpcMemHandle_st { HIP_INIT() \ HIP_CB_SPAWNER_OBJECT(cid); +#define HIP_RETURN_DURATION(ret, ...) \ + hip::g_lastError = ret; \ + HIPPrintDuration(amd::LOG_INFO, amd::LOG_API, &startTimeUs, "%-5d: [%zx] %s: Returned %s : %s", getpid(), std::this_thread::get_id(), \ + __func__, hipGetErrorName(hip::g_lastError), ToString( __VA_ARGS__ ).c_str()); \ + return hip::g_lastError; + #define HIP_RETURN(ret, ...) \ hip::g_lastError = ret; \ HIP_ERROR_PRINT(hip::g_lastError, __VA_ARGS__) \ diff --git a/hipamd/rocclr/hip_memory.cpp b/hipamd/rocclr/hip_memory.cpp index 553b59a77c..a24c8f93df 100755 --- a/hipamd/rocclr/hip_memory.cpp +++ b/hipamd/rocclr/hip_memory.cpp @@ -122,6 +122,9 @@ hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags) *ptr = amd::SvmBuffer::malloc(*amdContext, flags, sizeBytes, amdContext->devices()[0]->info().memBaseAddrAlign_, useHostDevice ? curDevContext->svmDevices()[0] : nullptr); if (*ptr == nullptr) { + size_t free = 0, total =0; + hipMemGetInfo(&free, &total); + LogPrintfError("Allocation failed : Device memory : required :%u | free :%u | total :%u \n", sizeBytes, free, total); return hipErrorOutOfMemory; } @@ -178,9 +181,12 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin *srcMemory->asBuffer(), sOffset, sizeBytes, dst); isAsync = false; } else if ((srcMemory != nullptr) && (dstMemory != nullptr)) { - // Check if the queue device doesn't match the device on any memory object. Hence - // it's a P2P transfer, because the app has requested access to another GPU - if (srcMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) { + // Check if the queue device doesn't match the device on any memory object. + // And any of them are not host allocation. + // Hence it's a P2P transfer, because the app has requested access to another GPU + if ((srcMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) && + ((srcMemory->getContext().devices().size() == 1) && + (dstMemory->getContext().devices().size() == 1))) { command = new amd::CopyMemoryP2PCommand(queue, CL_COMMAND_COPY_BUFFER, waitList, *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes); if (command == nullptr) { @@ -193,7 +199,16 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin return hipErrorInvalidValue; } } else { - command = new amd::CopyMemoryCommand(queue, CL_COMMAND_COPY_BUFFER, waitList, + amd::HostQueue* pQueue = &queue; + if (queueDevice != srcMemory->getContext().devices()[0]) { + pQueue = hip::getNullStream(srcMemory->getContext()); + amd::Command* cmd = queue.getLastQueuedCommand(true); + if (cmd != nullptr) { + waitList.push_back(cmd); + } + } + + command = new amd::CopyMemoryCommand(*pQueue, CL_COMMAND_COPY_BUFFER, waitList, *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes); } } @@ -228,7 +243,7 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag hipError_t hipMalloc(void** ptr, size_t sizeBytes) { HIP_INIT_API(hipMalloc, ptr, sizeBytes); - HIP_RETURN(ihipMalloc(ptr, sizeBytes, 0), *ptr); + HIP_RETURN_DURATION(ihipMalloc(ptr, sizeBytes, 0), *ptr); } hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { @@ -260,7 +275,7 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { ihipFlags |= CL_MEM_FOLLOW_USER_NUMA_POLICY; } - HIP_RETURN(ihipMalloc(ptr, sizeBytes, ihipFlags), *ptr); + HIP_RETURN_DURATION(ihipMalloc(ptr, sizeBytes, ihipFlags), *ptr); } hipError_t hipFree(void* ptr) { @@ -273,7 +288,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind HIP_INIT_API(hipMemcpy, dst, src, sizeBytes, kind); amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(dst, src, sizeBytes, kind, *queue)); + HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *queue)); } hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes, @@ -282,7 +297,7 @@ hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes, amd::HostQueue* queue = hip::getQueue(stream); - HIP_RETURN(ihipMemcpy(dst, src, sizeBytes, kind, *queue, false)); + HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *queue, false)); } hipError_t hipMemPtrGetInfo(void *ptr, size_t *size) { @@ -706,7 +721,7 @@ hipError_t hipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags) amd::MemObjMap::AddMemObj(hostPtr, mem); HIP_RETURN(hipSuccess); } else { - HIP_RETURN(ihipMalloc(&hostPtr, sizeBytes, flags), hostPtr); + HIP_RETURN_DURATION(ihipMalloc(&hostPtr, sizeBytes, flags), hostPtr); } } @@ -725,9 +740,14 @@ hipError_t hipHostUnregister(void* hostPtr) { amd::Memory* mem = getMemoryObject(hostPtr, offset); if(mem) { - for (const auto& device: hip::getCurrentDevice()->devices()) { - const device::Memory* devMem = mem->getDeviceMemory(*device); - amd::MemObjMap::RemoveMemObj(reinterpret_cast(devMem->virtualAddress())); + for (const auto& device: g_devices) { + const device::Memory* devMem = mem->getDeviceMemory(*device->devices()[0]); + if (devMem != nullptr) { + void* vAddr = reinterpret_cast(devMem->virtualAddress()); + if (amd::MemObjMap::FindMemObj(vAddr)) { + amd::MemObjMap::RemoveMemObj(vAddr); + } + } } amd::MemObjMap::RemoveMemObj(hostPtr); mem->release(); @@ -764,7 +784,7 @@ hipError_t hipMemcpyToSymbol(const void* symbol, const void* src, size_t sizeByt device_ptr = reinterpret_cast
(device_ptr) + offset; /* Copy memory from source to destination address */ - HIP_RETURN(hipMemcpy(device_ptr, src, sizeBytes, kind)); + HIP_RETURN_DURATION(hipMemcpy(device_ptr, src, sizeBytes, kind)); } hipError_t hipMemcpyFromSymbol(void* dst, const void* symbol, size_t sizeBytes, @@ -786,7 +806,7 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* symbol, size_t sizeBytes, device_ptr = reinterpret_cast
(device_ptr) + offset; /* Copy memory from source to destination address */ - HIP_RETURN(hipMemcpy(dst, device_ptr, sizeBytes, kind)); + HIP_RETURN_DURATION(hipMemcpy(dst, device_ptr, sizeBytes, kind)); } hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src, size_t sizeBytes, @@ -808,7 +828,7 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src, size_t si device_ptr = reinterpret_cast
(device_ptr) + offset; /* Copy memory from source to destination address */ - HIP_RETURN(hipMemcpyAsync(device_ptr, src, sizeBytes, kind, stream)); + HIP_RETURN_DURATION(hipMemcpyAsync(device_ptr, src, sizeBytes, kind, stream)); } hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbol, size_t sizeBytes, @@ -830,7 +850,7 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbol, size_t sizeBy device_ptr = reinterpret_cast
(device_ptr) + offset; /* Copy memory from source to destination address */ - HIP_RETURN(hipMemcpyAsync(dst, device_ptr, sizeBytes, kind, stream)); + HIP_RETURN_DURATION(hipMemcpyAsync(dst, device_ptr, sizeBytes, kind, stream)); } hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice, @@ -838,7 +858,7 @@ hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice, size_t ByteCount) { HIP_INIT_API(hipMemcpyHtoD, dstDevice, srcHost, ByteCount); - HIP_RETURN(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *hip::getQueue(nullptr))); + HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *hip::getQueue(nullptr))); } hipError_t hipMemcpyDtoH(void* dstHost, @@ -846,7 +866,7 @@ hipError_t hipMemcpyDtoH(void* dstHost, size_t ByteCount) { HIP_INIT_API(hipMemcpyDtoH, dstHost, srcDevice, ByteCount); - HIP_RETURN(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *hip::getQueue(nullptr))); + HIP_RETURN_DURATION(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *hip::getQueue(nullptr))); } hipError_t hipMemcpyDtoD(hipDeviceptr_t dstDevice, @@ -854,7 +874,7 @@ hipError_t hipMemcpyDtoD(hipDeviceptr_t dstDevice, size_t ByteCount) { HIP_INIT_API(hipMemcpyDtoD, dstDevice, srcDevice, ByteCount); - HIP_RETURN(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *hip::getQueue(nullptr))); + HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *hip::getQueue(nullptr))); } hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, @@ -863,7 +883,7 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, amd::HostQueue* queue = hip::getQueue(stream); - HIP_RETURN(ihipMemcpy(dst, src, sizeBytes, kind, *queue, true)); + HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *queue, true)); } hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice, @@ -872,7 +892,7 @@ hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice, hipStream_t stream) { HIP_INIT_API(hipMemcpyHtoDAsync, dstDevice, srcHost, ByteCount, stream); - HIP_RETURN(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *hip::getQueue(stream), true)); + HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *hip::getQueue(stream), true)); } hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dstDevice, @@ -881,7 +901,7 @@ hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dstDevice, hipStream_t stream) { HIP_INIT_API(hipMemcpyDtoDAsync, dstDevice, srcDevice, ByteCount, stream); - HIP_RETURN(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *hip::getQueue(stream), true)); + HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *hip::getQueue(stream), true)); } hipError_t hipMemcpyDtoHAsync(void* dstHost, @@ -890,7 +910,7 @@ hipError_t hipMemcpyDtoHAsync(void* dstHost, hipStream_t stream) { HIP_INIT_API(hipMemcpyDtoHAsync, dstHost, srcDevice, ByteCount, stream); - HIP_RETURN(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *hip::getQueue(stream), true)); + HIP_RETURN_DURATION(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *hip::getQueue(stream), true)); } hipError_t ihipMemcpyAtoD(hipArray* srcArray, @@ -1399,10 +1419,34 @@ hipError_t ihipMemcpyParam3D(const HIP_MEMCPY3D* pCopy, hipMemoryType srcMemoryType = pCopy->srcMemoryType; if (srcMemoryType == hipMemoryTypeUnified) { srcMemoryType = amd::MemObjMap::FindMemObj(pCopy->srcDevice) ? hipMemoryTypeDevice : hipMemoryTypeHost; + if (srcMemoryType == hipMemoryTypeHost) { + // {src/dst}Host may be unitialized. Copy over {src/dst}Device into it if we detect system memory. + const_cast(pCopy)->srcHost = pCopy->srcDevice; + } } hipMemoryType dstMemoryType = pCopy->dstMemoryType; if (dstMemoryType == hipMemoryTypeUnified) { dstMemoryType = amd::MemObjMap::FindMemObj(pCopy->dstDevice) ? hipMemoryTypeDevice : hipMemoryTypeHost; + if (srcMemoryType == hipMemoryTypeHost) { + const_cast(pCopy)->dstHost = pCopy->dstDevice; + } + } + + // If {src/dst}MemoryType is hipMemoryTypeHost, check if the memory was prepinned. + // In that case upgrade the copy type to hipMemoryTypeDevice to avoid extra pinning. + if (srcMemoryType == hipMemoryTypeHost) { + amd::Memory* mem = amd::MemObjMap::FindMemObj(pCopy->srcHost); + srcMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost; + if (srcMemoryType == hipMemoryTypeDevice) { + const_cast(pCopy)->srcDevice = const_cast(pCopy->srcHost); + } + } + if (dstMemoryType == hipMemoryTypeHost) { + amd::Memory* mem = amd::MemObjMap::FindMemObj(pCopy->dstHost); + dstMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost; + if (dstMemoryType == hipMemoryTypeDevice) { + const_cast(pCopy)->dstDevice = const_cast(pCopy->dstDevice); + } } amd::Coord3D srcOrigin = {pCopy->srcXInBytes, pCopy->srcY, pCopy->srcZ}; @@ -1480,21 +1524,21 @@ hipError_t ihipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) { HIP_INIT_API(hipMemcpyParam2D, pCopy); - HIP_RETURN(ihipMemcpyParam2D(pCopy, nullptr)); + HIP_RETURN_DURATION(ihipMemcpyParam2D(pCopy, nullptr)); } hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind) { HIP_INIT_API(hipMemcpy2D, dst, dpitch, src, spitch, width, height, kind); - HIP_RETURN(ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind, nullptr)); + HIP_RETURN_DURATION(ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind, nullptr)); } hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) { HIP_INIT_API(hipMemcpy2DAsync, dst, dpitch, src, spitch, width, height, kind, stream); - HIP_RETURN(ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind, stream, true)); + HIP_RETURN_DURATION(ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind, stream, true)); } hipError_t ihipMemcpy2DToArray(hipArray_t dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream, bool isAsync = false) { @@ -1525,7 +1569,7 @@ hipError_t ihipMemcpy2DToArray(hipArray_t dst, size_t wOffset, size_t hOffset, c hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind) { HIP_INIT_API(hipMemcpy2DToArray, dst, wOffset, hOffset, src, spitch, width, height, kind); - HIP_RETURN(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, kind, nullptr)); + HIP_RETURN_DURATION(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, kind, nullptr)); } hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t count, hipMemcpyKind kind) { @@ -1540,7 +1584,7 @@ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const const size_t height = (count / dst->width) / hip::getElementSize(dst); - HIP_RETURN(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, 0 /* spitch */, witdthInBytes, height, kind, nullptr)); + HIP_RETURN_DURATION(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, 0 /* spitch */, witdthInBytes, height, kind, nullptr)); } hipError_t ihipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream, bool isAsync = false) { @@ -1580,7 +1624,7 @@ hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t src, size_t wOffsetSrc const size_t height = (count / src->width) / hip::getElementSize(src); - HIP_RETURN(ihipMemcpy2DFromArray(dst, 0 /* dpitch */, src, wOffsetSrc, hOffset, witdthInBytes, height, kind, nullptr)); + HIP_RETURN_DURATION(ihipMemcpy2DFromArray(dst, 0 /* dpitch */, src, wOffsetSrc, hOffset, witdthInBytes, height, kind, nullptr)); } hipError_t hipMemcpyHtoA(hipArray* dstArray, @@ -1589,7 +1633,7 @@ hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t ByteCount) { HIP_INIT_API(hipMemcpyHtoA, dstArray, dstOffset, srcHost, ByteCount); - HIP_RETURN(ihipMemcpyHtoA(srcHost, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); + HIP_RETURN_DURATION(ihipMemcpyHtoA(srcHost, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); } hipError_t hipMemcpyAtoH(void* dstHost, @@ -1598,7 +1642,7 @@ hipError_t hipMemcpyAtoH(void* dstHost, size_t ByteCount) { HIP_INIT_API(hipMemcpyAtoH, dstHost, srcArray, srcOffset, ByteCount); - HIP_RETURN(ihipMemcpyAtoH(srcArray, dstHost, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); + HIP_RETURN_DURATION(ihipMemcpyAtoH(srcArray, dstHost, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); } hipError_t ihipMemcpy3D(const hipMemcpy3DParms* p, @@ -1625,25 +1669,25 @@ hipError_t ihipMemcpy3D(const hipMemcpy3DParms* p, hipError_t hipMemcpy3D(const hipMemcpy3DParms* p) { HIP_INIT_API(hipMemcpy3D, p); - HIP_RETURN(ihipMemcpy3D(p, nullptr)); + HIP_RETURN_DURATION(ihipMemcpy3D(p, nullptr)); } hipError_t hipMemcpy3DAsync(const hipMemcpy3DParms* p, hipStream_t stream) { HIP_INIT_API(hipMemcpy3DAsync, p, stream); - HIP_RETURN(ihipMemcpy3D(p, stream, true)); + HIP_RETURN_DURATION(ihipMemcpy3D(p, stream, true)); } hipError_t hipDrvMemcpy3D(const HIP_MEMCPY3D* pCopy) { HIP_INIT_API(hipDrvMemcpy3D, pCopy); - HIP_RETURN(ihipMemcpyParam3D(pCopy, nullptr)); + HIP_RETURN_DURATION(ihipMemcpyParam3D(pCopy, nullptr)); } hipError_t hipDrvMemcpy3DAsync(const HIP_MEMCPY3D* pCopy, hipStream_t stream) { HIP_INIT_API(hipDrvMemcpy3DAsync, pCopy, stream); - HIP_RETURN(ihipMemcpyParam3D(pCopy, stream, true)); + HIP_RETURN_DURATION(ihipMemcpyParam3D(pCopy, stream, true)); } hipError_t packFillMemoryCommand(amd::Memory* memory, size_t offset, int64_t value, size_t valueSize, @@ -1892,7 +1936,7 @@ hipError_t hipMemAllocPitch(hipDeviceptr_t* dptr, size_t* pitch, size_t widthInB hipError_t hipMemAllocHost(void** ptr, size_t size) { HIP_INIT_API(hipMemAllocHost, ptr, size); - HIP_RETURN(hipHostMalloc(ptr, size, 0)); + HIP_RETURN_DURATION(hipHostMalloc(ptr, size, 0)); } hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* dev_ptr) { @@ -2074,25 +2118,25 @@ hipError_t ihipMemcpy2DArrayToArray(hipArray_t dst, size_t wOffsetDst, size_t hO hipError_t hipMemcpy2DArrayToArray(hipArray_t dst, size_t wOffsetDst, size_t hOffsetDst, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, hipMemcpyKind kind) { HIP_INIT_API(hipMemcpy2DArrayToArray, dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind); - HIP_RETURN(ihipMemcpy2DArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind, nullptr)); + HIP_RETURN_DURATION(ihipMemcpy2DArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind, nullptr)); } hipError_t hipMemcpyArrayToArray(hipArray_t dst, size_t wOffsetDst, size_t hOffsetDst, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, hipMemcpyKind kind) { HIP_INIT_API(hipMemcpyArrayToArray, dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind); - HIP_RETURN(ihipMemcpy2DArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind, nullptr)); + HIP_RETURN_DURATION(ihipMemcpy2DArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind, nullptr)); } hipError_t hipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffsetSrc, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind) { HIP_INIT_API(hipMemcpy2DFromArray, dst, dpitch, src, wOffsetSrc, hOffset, width, height, kind); - HIP_RETURN(ihipMemcpy2DFromArray(dst, dpitch, src, wOffsetSrc, hOffset, width, height, kind, nullptr)); + HIP_RETURN_DURATION(ihipMemcpy2DFromArray(dst, dpitch, src, wOffsetSrc, hOffset, width, height, kind, nullptr)); } hipError_t hipMemcpy2DFromArrayAsync(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) { HIP_INIT_API(hipMemcpy2DFromArrayAsync, dst, dpitch, src, wOffsetSrc, hOffsetSrc, width, height, kind, stream); - HIP_RETURN(ihipMemcpy2DFromArray(dst, dpitch, src, wOffsetSrc, hOffsetSrc, width, height, kind, stream, true)); + HIP_RETURN_DURATION(ihipMemcpy2DFromArray(dst, dpitch, src, wOffsetSrc, hOffsetSrc, width, height, kind, stream, true)); } hipError_t hipMemcpyFromArrayAsync(void* dst, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, hipMemcpyKind kind, hipStream_t stream) { @@ -2107,13 +2151,13 @@ hipError_t hipMemcpyFromArrayAsync(void* dst, hipArray_const_t src, size_t wOffs const size_t height = (count / src->width) / hip::getElementSize(src); - HIP_RETURN(ihipMemcpy2DFromArray(dst, 0 /* dpitch */, src, wOffsetSrc, hOffsetSrc, widthInBytes, height, kind, stream, true)); + HIP_RETURN_DURATION(ihipMemcpy2DFromArray(dst, 0 /* dpitch */, src, wOffsetSrc, hOffsetSrc, widthInBytes, height, kind, stream, true)); } hipError_t hipMemcpy2DToArrayAsync(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) { HIP_INIT_API(hipMemcpy2DToArrayAsync, dst, wOffset, hOffset, src, spitch, width, height, kind); - HIP_RETURN(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, kind, stream, true)); + HIP_RETURN_DURATION(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, kind, stream, true)); } hipError_t hipMemcpyToArrayAsync(hipArray_t dst, size_t wOffset, size_t hOffset, const void* src, size_t count, hipMemcpyKind kind, hipStream_t stream) { @@ -2128,7 +2172,7 @@ hipError_t hipMemcpyToArrayAsync(hipArray_t dst, size_t wOffset, size_t hOffset, const size_t height = (count / dst->width) / hip::getElementSize(dst); - HIP_RETURN(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, 0 /* spitch */, widthInBytes, height, kind, stream, true)); + HIP_RETURN_DURATION(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, 0 /* spitch */, widthInBytes, height, kind, stream, true)); } hipError_t hipMemcpyAtoA(hipArray* dstArray, @@ -2138,7 +2182,7 @@ hipError_t hipMemcpyAtoA(hipArray* dstArray, size_t ByteCount) { HIP_INIT_API(hipMemcpyAtoA, dstArray, dstOffset, srcArray, srcOffset, ByteCount); - HIP_RETURN(ihipMemcpyAtoA(srcArray, dstArray, {srcOffset, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, nullptr)); + HIP_RETURN_DURATION(ihipMemcpyAtoA(srcArray, dstArray, {srcOffset, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, nullptr)); } hipError_t hipMemcpyAtoD(hipDeviceptr_t dstDevice, @@ -2147,7 +2191,7 @@ hipError_t hipMemcpyAtoD(hipDeviceptr_t dstDevice, size_t ByteCount) { HIP_INIT_API(hipMemcpyAtoD, dstDevice, srcArray, srcOffset, ByteCount); - HIP_RETURN(ihipMemcpyAtoD(srcArray, dstDevice, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); + HIP_RETURN_DURATION(ihipMemcpyAtoD(srcArray, dstDevice, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); } hipError_t hipMemcpyAtoHAsync(void* dstHost, @@ -2157,7 +2201,7 @@ hipError_t hipMemcpyAtoHAsync(void* dstHost, hipStream_t stream) { HIP_INIT_API(hipMemcpyAtoHAsync, dstHost, srcArray, srcOffset, ByteCount, stream); - HIP_RETURN(ihipMemcpyAtoH(srcArray, dstHost, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, stream, true)); + HIP_RETURN_DURATION(ihipMemcpyAtoH(srcArray, dstHost, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, stream, true)); } hipError_t hipMemcpyDtoA(hipArray* dstArray, @@ -2166,7 +2210,7 @@ hipError_t hipMemcpyDtoA(hipArray* dstArray, size_t ByteCount) { HIP_INIT_API(hipMemcpyDtoA, dstArray, dstOffset, srcDevice, ByteCount); - HIP_RETURN(ihipMemcpyDtoA(srcDevice, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); + HIP_RETURN_DURATION(ihipMemcpyDtoA(srcDevice, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); } hipError_t hipMemcpyHtoAAsync(hipArray* dstArray, @@ -2176,7 +2220,7 @@ hipError_t hipMemcpyHtoAAsync(hipArray* dstArray, hipStream_t stream) { HIP_INIT_API(hipMemcpyHtoAAsync, dstArray, dstOffset, srcHost, ByteCount, stream); - HIP_RETURN(ihipMemcpyHtoA(srcHost, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, stream, true)); + HIP_RETURN_DURATION(ihipMemcpyHtoA(srcHost, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, stream, true)); } hipError_t hipMipmappedArrayCreate(hipMipmappedArray_t* pHandle, @@ -2233,7 +2277,7 @@ hipError_t hipMallocHost(void** ptr, HIP_RETURN(hipErrorInvalidValue); } - HIP_RETURN(ihipMalloc(ptr, size, CL_MEM_SVM_FINE_GRAIN_BUFFER), *ptr); + HIP_RETURN_DURATION(ihipMalloc(ptr, size, CL_MEM_SVM_FINE_GRAIN_BUFFER), *ptr); } hipError_t hipFreeHost(void *ptr) { diff --git a/hipamd/rocclr/hip_module.cpp b/hipamd/rocclr/hip_module.cpp index 70dbc02f3a..07eeb55a84 100755 --- a/hipamd/rocclr/hip_module.cpp +++ b/hipamd/rocclr/hip_module.cpp @@ -193,16 +193,17 @@ hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func) HIP_RETURN(hipSuccess); } -hipError_t ihipModuleLaunchKernel(hipFunction_t f, - uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, +hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, + uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra, hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0, uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0) { - HIP_INIT_API(ihipModuleLaunchKernel, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, - sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags, params); + HIP_INIT_API(ihipModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, + blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, + stopEvent, flags, params); hip::DeviceFunc* function = hip::DeviceFunc::asFunction(f); amd::Kernel* kernel = function->kernel(); @@ -229,7 +230,8 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, int block_size = blockDimX * blockDimY * blockDimZ; hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor( &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, block_size, sharedMemBytes, true); - if (((gridDimX * gridDimY * gridDimZ) / block_size) > unsigned(max_blocks_per_grid)) { + if (((globalWorkSizeX * globalWorkSizeY * globalWorkSizeZ) / block_size) > + unsigned(max_blocks_per_grid)) { return hipErrorCooperativeLaunchTooLarge; } } @@ -243,11 +245,11 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, } size_t globalWorkOffset[3] = {0}; - size_t globalWorkSize[3] = { gridDimX, gridDimY, gridDimZ }; + size_t globalWorkSize[3] = { globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ }; size_t localWorkSize[3] = { blockDimX, blockDimY, blockDimZ }; amd::NDRangeContainer ndrange(3, globalWorkOffset, globalWorkSize, localWorkSize); amd::Command::EventWaitList waitList; - + bool profileNDRange = false; address kernargs = nullptr; // 'extra' is a struct that contains the following info: { @@ -271,13 +273,16 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, desc.type_ == T_POINTER/*svmBound*/); } else { assert(extra == nullptr); - kernel->parameters().set(i, desc.size_, kernelParams[i], desc.type_ == T_POINTER/*svmBound*/); + kernel->parameters().set(i, desc.size_, kernelParams[i], + desc.type_ == T_POINTER/*svmBound*/); } } + profileNDRange = (startEvent != nullptr && stopEvent != nullptr); + amd::NDRangeKernelCommand* command = new amd::NDRangeKernelCommand( *queue, waitList, *kernel, ndrange, sharedMemBytes, - params, gridId, numGrids, prevGridSum, allGridSum, firstDevice); + params, gridId, numGrids, prevGridSum, allGridSum, firstDevice, profileNDRange); if (!command) { return hipErrorOutOfMemory; } @@ -290,11 +295,11 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, command->enqueue(); - if(startEvent != nullptr) { + if (startEvent != nullptr) { eStart->addMarker(queue, command, false); command->retain(); } - if(stopEvent != nullptr) { + if (stopEvent != nullptr) { eStop->addMarker(queue, command, false); command->retain(); } @@ -313,8 +318,17 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra); - - HIP_RETURN(ihipModuleLaunchKernel(f, gridDimX * blockDimX, gridDimY * blockDimY, gridDimZ * blockDimZ, + size_t globalWorkSizeX = gridDimX * blockDimX; + size_t globalWorkSizeY = gridDimY * blockDimY; + size_t globalWorkSizeZ = gridDimZ * blockDimZ; + if (globalWorkSizeX > std::numeric_limits::max() || + globalWorkSizeY > std::numeric_limits::max() || + globalWorkSizeZ > std::numeric_limits::max()) { + HIP_RETURN(hipErrorInvalidConfiguration); + } + HIP_RETURN(ihipModuleLaunchKernel(f, static_cast(globalWorkSizeX), + static_cast(globalWorkSizeY), + static_cast(globalWorkSizeZ), blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, nullptr, nullptr)); } @@ -337,37 +351,37 @@ hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, -hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, - uint32_t gridDimY, uint32_t gridDimZ, +hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, + uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, size_t sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra, hipEvent_t startEvent, hipEvent_t stopEvent) { - HIP_INIT_API(hipHccModuleLaunchKernel, f, gridDimX, gridDimY, gridDimZ, + HIP_INIT_API(hipHccModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent); - HIP_RETURN(ihipModuleLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, + HIP_RETURN(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent)); } -hipError_t hipModuleLaunchKernelExt(hipFunction_t f, uint32_t gridDimX, - uint32_t gridDimY, uint32_t gridDimZ, +hipError_t hipModuleLaunchKernelExt(hipFunction_t f, uint32_t globalWorkSizeX, + uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, size_t sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra, hipEvent_t startEvent, hipEvent_t stopEvent) { - HIP_INIT_API(hipModuleLaunchKernelExt, f, gridDimX, gridDimY, gridDimZ, + HIP_INIT_API(hipModuleLaunchKernelExt, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent); - HIP_RETURN(ihipModuleLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, + HIP_RETURN(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent)); } @@ -406,8 +420,17 @@ hipError_t hipLaunchCooperativeKernel(const void* f, int deviceId = ihipGetDevice(); hipFunction_t func = nullptr; HIP_RETURN_ONFAIL(PlatformState::instance().getStatFunc(&func, f, deviceId)); - - HIP_RETURN(ihipModuleLaunchKernel(func, gridDim.x * blockDim.x, gridDim.y * blockDim.y, gridDim.z * blockDim.z, + size_t globalWorkSizeX = gridDim.x * blockDim.x; + size_t globalWorkSizeY = gridDim.y * blockDim.y; + size_t globalWorkSizeZ = gridDim.z * blockDim.z; + if (globalWorkSizeX > std::numeric_limits::max() || + globalWorkSizeY > std::numeric_limits::max() || + globalWorkSizeZ > std::numeric_limits::max()) { + HIP_RETURN(hipErrorInvalidConfiguration); + } + HIP_RETURN(ihipModuleLaunchKernel(func, static_cast(globalWorkSizeX), + static_cast(globalWorkSizeY), + static_cast(globalWorkSizeZ), blockDim.x, blockDim.y, blockDim.z, sharedMemBytes, hStream, kernelParams, nullptr, nullptr, nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups)); @@ -452,7 +475,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL uint64_t prevGridSize = 0; uint32_t firstDevice = 0; - // Sync the execution streams on all devices + // Sync the execution streams on all devices if ((flags & hipCooperativeLaunchMultiDeviceNoPreSync) == 0) { for (int i = 0; i < numDevices; ++i) { amd::HostQueue* queue = @@ -481,11 +504,16 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL result = hipErrorInvalidDeviceFunction; HIP_RETURN(result); } - - result = ihipModuleLaunchKernel(func, - launch.gridDim.x * launch.blockDim.x, - launch.gridDim.y * launch.blockDim.y, - launch.gridDim.z * launch.blockDim.z, + size_t globalWorkSizeX = launch.gridDim.x * launch.blockDim.x; + size_t globalWorkSizeY = launch.gridDim.y * launch.blockDim.y; + size_t globalWorkSizeZ = launch.gridDim.z * launch.blockDim.z; + if (globalWorkSizeX > std::numeric_limits::max() || + globalWorkSizeY > std::numeric_limits::max() || + globalWorkSizeZ > std::numeric_limits::max()) { + HIP_RETURN(hipErrorInvalidConfiguration); + } + result = ihipModuleLaunchKernel(func, static_cast(globalWorkSizeX), + static_cast(globalWorkSizeY), static_cast(globalWorkSizeZ), launch.blockDim.x, launch.blockDim.y, launch.blockDim.z, launch.sharedMem, launch.stream, launch.args, nullptr, nullptr, nullptr, flags, extFlags, i, numDevices, prevGridSize, allGridSize, firstDevice); @@ -495,7 +523,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL prevGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z; } - // Sync the execution streams on all devices + // Sync the execution streams on all devices if ((flags & hipCooperativeLaunchMultiDeviceNoPostSync) == 0) { for (int i = 0; i < numDevices; ++i) { amd::HostQueue* queue = diff --git a/hipamd/rocclr/hip_peer.cpp b/hipamd/rocclr/hip_peer.cpp index 225361d525..24207b52c6 100755 --- a/hipamd/rocclr/hip_peer.cpp +++ b/hipamd/rocclr/hip_peer.cpp @@ -52,24 +52,61 @@ hipError_t canAccessPeer(int* canAccessPeer, int deviceId, int peerDeviceId){ amd::Device* device = nullptr; amd::Device* peer_device = nullptr; if (canAccessPeer == nullptr) { - HIP_RETURN(hipErrorInvalidValue); + return hipErrorInvalidValue; } /* Peer cannot be self */ if (deviceId == peerDeviceId) { *canAccessPeer = 0; - HIP_RETURN(hipSuccess); + return hipSuccess; } /* Cannot exceed the max number of devices */ if (static_cast(deviceId) >= g_devices.size() || static_cast(peerDeviceId) >= g_devices.size()) { - HIP_RETURN(hipErrorInvalidDevice); + return hipErrorInvalidDevice; } device = g_devices[deviceId]->devices()[0]; peer_device = g_devices[peerDeviceId]->devices()[0]; *canAccessPeer = static_cast(std::find(device->p2pDevices_.begin(), device->p2pDevices_.end(), as_cl(peer_device)) != device->p2pDevices_.end()); - HIP_RETURN(hipSuccess); + return hipSuccess; +} + +hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr, + int srcDevice, int dstDevice) { + HIP_INIT_API(hipDeviceGetP2PAttribute, value, attr, srcDevice, dstDevice); + + hipError_t hip_error = hipSuccess; + + if (value == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + if (srcDevice >= static_cast(g_devices.size()) + || dstDevice >= static_cast(g_devices.size())) { + HIP_RETURN(hipErrorInvalidDevice); + } + + switch (attr) { + case hipDevP2PAttrPerformanceRank : + assert(0 && "Unimplemented"); + break; + case hipDevP2PAttrAccessSupported : + hip_error = canAccessPeer(value, srcDevice, dstDevice); + break; + case hipDevP2PAttrNativeAtomicSupported : + assert(0 && "Unimplemented"); + break; + case hipDevP2PAttrHipArrayAccessSupported : + assert(0 && "Unimplemented"); + break; + default : + DevLogPrintfError("Invalid attribute attr: %d ", attr); + hip_error = hipErrorInvalidValue; + break; + } + + HIP_RETURN(hip_error); } hipError_t hipDeviceCanAccessPeer(int* canAccess, int deviceId, int peerDeviceId) { diff --git a/hipamd/rocclr/hip_platform.cpp b/hipamd/rocclr/hip_platform.cpp index 4d8e28dce7..3935e4d5f8 100755 --- a/hipamd/rocclr/hip_platform.cpp +++ b/hipamd/rocclr/hip_platform.cpp @@ -271,7 +271,7 @@ hipError_t hipGetSymbolAddress(void** devPtr, const void* symbol) { HIP_RETURN_ONFAIL(PlatformState::instance().getStatGlobalVar(symbol, ihipGetDevice(), devPtr, &sym_size)); - HIP_RETURN(hipSuccess); + HIP_RETURN(hipSuccess, *devPtr); } hipError_t hipGetSymbolSize(size_t* sizePtr, const void* symbol) { @@ -280,7 +280,7 @@ hipError_t hipGetSymbolSize(size_t* sizePtr, const void* symbol) { hipDeviceptr_t device_ptr = nullptr; HIP_RETURN_ONFAIL(PlatformState::instance().getStatGlobalVar(symbol, ihipGetDevice(), &device_ptr, sizePtr)); - HIP_RETURN(hipSuccess); + HIP_RETURN(hipSuccess, *sizePtr); } hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memory** amd_mem_obj, @@ -351,10 +351,10 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( size_t GprWaves = VgprWaves; if (wrkGrpInfo->usedSGPRs_ > 0) { size_t maxSGPRs; - if (device.info().gfxipVersion_ < 800) { + if (device.info().gfxipMajor_ < 8) { maxSGPRs = 512; } - else if (device.info().gfxipVersion_ < 1000) { + else if (device.info().gfxipMajor_ < 10) { maxSGPRs = 800; } else { @@ -467,7 +467,7 @@ hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* HIP_RETURN(ret); } -hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, +hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk) { HIP_INIT_API(hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, f, blockSize, dynSharedMemPerBlk); @@ -486,7 +486,7 @@ hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, } hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, - hipFunction_t f, int blockSize, + hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags) { HIP_INIT_API(hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, f, blockSize, dynSharedMemPerBlk, flags); @@ -561,202 +561,6 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, namespace hip_impl { -struct dl_phdr_info { - ELFIO::Elf64_Addr dlpi_addr; - const char *dlpi_name; - const ELFIO::Elf64_Phdr *dlpi_phdr; - ELFIO::Elf64_Half dlpi_phnum; -}; - -extern "C" int dl_iterate_phdr( - int (*callback) (struct dl_phdr_info *info, size_t size, void *data), void *data -); - -struct Symbol { - std::string name; - ELFIO::Elf64_Addr value = 0; - ELFIO::Elf_Xword size = 0; - ELFIO::Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; -}; - -inline Symbol read_symbol(const ELFIO::symbol_section_accessor& section, unsigned int idx) { - assert(idx < section.get_symbols_num()); - - Symbol r; - section.get_symbol(idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other); - - return r; -} - -template -inline ELFIO::section* find_section_if(ELFIO::elfio& reader, P p) { - const auto it = find_if(reader.sections.begin(), reader.sections.end(), std::move(p)); - - return it != reader.sections.end() ? *it : nullptr; -} - -std::vector> function_names_for(const ELFIO::elfio& reader, - ELFIO::section* symtab) { - std::vector> r; - ELFIO::symbol_section_accessor symbols{reader, symtab}; - - for (auto i = 0u; i != symbols.get_symbols_num(); ++i) { - auto tmp = read_symbol(symbols, i); - - if (tmp.type == STT_FUNC && tmp.sect_idx != SHN_UNDEF && !tmp.name.empty()) { - r.emplace_back(tmp.value, tmp.name); - } - } - - return r; -} - -const std::vector>& function_names_for_process() { - static constexpr const char self[] = "/proc/self/exe"; - - static std::vector> r; - static std::once_flag f; - - std::call_once(f, []() { - ELFIO::elfio reader; - - if (reader.load(self)) { - const auto it = find_section_if( - reader, [](const ELFIO::section* x) { return x->get_type() == SHT_SYMTAB; }); - - if (it) r = function_names_for(reader, it); - } - }); - - return r; -} - - -const std::unordered_map& function_names() -{ - static std::unordered_map r{ - function_names_for_process().cbegin(), - function_names_for_process().cend()}; - static std::once_flag f; - - std::call_once(f, []() { - dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) { - ELFIO::elfio reader; - - if (reader.load(info->dlpi_name)) { - const auto it = find_section_if( - reader, [](const ELFIO::section* x) { return x->get_type() == SHT_SYMTAB; }); - - if (it) { - auto n = function_names_for(reader, it); - - for (auto&& f : n) f.first += info->dlpi_addr; - - r.insert(make_move_iterator(n.begin()), make_move_iterator(n.end())); - } - } - return 0; - }, - nullptr); - }); - - return r; -} - -std::vector bundles_for_process() { - static constexpr const char self[] = "/proc/self/exe"; - static constexpr const char kernel_section[] = ".kernel"; - std::vector r; - - ELFIO::elfio reader; - - if (reader.load(self)) { - auto it = find_section_if( - reader, [](const ELFIO::section* x) { return x->get_name() == kernel_section; }); - - if (it) r.insert(r.end(), it->get_data(), it->get_data() + it->get_size()); - } - - return r; -} - -const std::vector& modules() { - static std::vector r; - static std::once_flag f; - - std::call_once(f, []() { - static std::vector> bundles{bundles_for_process()}; - - dl_iterate_phdr( - [](dl_phdr_info* info, std::size_t, void*) { - ELFIO::elfio tmp; - if (tmp.load(info->dlpi_name)) { - const auto it = find_section_if( - tmp, [](const ELFIO::section* x) { return x->get_name() == ".kernel"; }); - - if (it) bundles.emplace_back(it->get_data(), it->get_data() + it->get_size()); - } - return 0; - }, - nullptr); - - for (auto&& bundle : bundles) { - if (bundle.empty()) { - continue; - } - std::string magic(&bundle[0], sizeof(CLANG_OFFLOAD_BUNDLER_MAGIC_STR) - 1); - if (magic.compare(CLANG_OFFLOAD_BUNDLER_MAGIC_STR)) - continue; - - const auto obheader = reinterpret_cast(&bundle[0]); - const auto* desc = &obheader->desc[0]; - for (uint64_t i = 0; i < obheader->numBundles; ++i, - desc = reinterpret_cast( - reinterpret_cast(&desc->triple[0]) + desc->tripleSize)) { - - std::string triple(desc->triple, sizeof(HCC_AMDGCN_AMDHSA_TRIPLE) - 1); - if (triple.compare(HCC_AMDGCN_AMDHSA_TRIPLE)) - continue; - - std::string target(desc->triple + sizeof(HCC_AMDGCN_AMDHSA_TRIPLE), - desc->tripleSize - sizeof(HCC_AMDGCN_AMDHSA_TRIPLE)); - - if (isCompatibleCodeObject(target, hip::getCurrentDevice()->devices()[0]->info().name_)) { - hipModule_t module; - if (hipSuccess == hipModuleLoadData(&module, reinterpret_cast( - reinterpret_cast(obheader) + desc->offset))) - r.push_back(module); - break; - } - } - } - }); - - return r; -} - -const std::unordered_map& functions() -{ - static std::unordered_map r; - static std::once_flag f; - - std::call_once(f, []() { - for (auto&& function : function_names()) { - for (auto&& module : modules()) { - hipFunction_t f; - if (hipSuccess == hipModuleGetFunction(&f, module, function.second.c_str())) { - r[function.first] = f; - } - } - } - }); - - return r; -} - void hipLaunchKernelGGLImpl( uintptr_t function_address, const dim3& numBlocks, @@ -767,11 +571,19 @@ void hipLaunchKernelGGLImpl( { HIP_INIT(); - const auto it = functions().find(function_address); - if (it == functions().cend()) - assert(0); + hip::Stream* s = reinterpret_cast(stream); + int deviceId = (s != nullptr)? s->DeviceId() : ihipGetDevice(); + if (deviceId == -1) { + DevLogPrintfError("Wrong Device Id: %d \n", deviceId); + } - hipModuleLaunchKernel(it->second, + hipFunction_t func = nullptr; + hipError_t hip_error = PlatformState::instance().getStatFunc(&func, reinterpret_cast(function_address), deviceId); + if ((hip_error != hipSuccess) || (func == nullptr)) { + DevLogPrintfError("Cannot find the static function: 0x%x", function_address); + } + + hipModuleLaunchKernel(func, numBlocks.x, numBlocks.y, numBlocks.z, dimBlocks.x, dimBlocks.y, dimBlocks.z, sharedMemBytes, stream, nullptr, kernarg); @@ -815,63 +627,24 @@ hipError_t ihipLaunchKernel(const void* hostFunction, hipFunction_t func = nullptr; hipError_t hip_error = PlatformState::instance().getStatFunc(&func, hostFunction, deviceId); if ((hip_error != hipSuccess) || (func == nullptr)) { -#ifdef ATI_OS_LINUX - const auto it = hip_impl::functions().find(reinterpret_cast(hostFunction)); - if (it == hip_impl::functions().cend()) { - DevLogPrintfError("Cannot find function: 0x%x \n", hostFunction); - HIP_RETURN(hipErrorInvalidDeviceFunction); - } - func = it->second; -#else HIP_RETURN(hipErrorInvalidDeviceFunction); -#endif } - HIP_RETURN(ihipModuleLaunchKernel(func, (gridDim.x * blockDim.x), (gridDim.y * blockDim.y), - (gridDim.z * blockDim.z), blockDim.x, blockDim.y, blockDim.z, + size_t globalWorkSizeX = gridDim.x * blockDim.x; + size_t globalWorkSizeY = gridDim.y * blockDim.y; + size_t globalWorkSizeZ = gridDim.z * blockDim.z; + if (globalWorkSizeX > std::numeric_limits::max() || + globalWorkSizeY > std::numeric_limits::max() || + globalWorkSizeZ > std::numeric_limits::max()) { + HIP_RETURN(hipErrorInvalidConfiguration); + } + HIP_RETURN(ihipModuleLaunchKernel(func, static_cast(globalWorkSizeX), + static_cast(globalWorkSizeY), + static_cast(globalWorkSizeZ), + blockDim.x, blockDim.y, blockDim.z, sharedMemBytes, stream, args, nullptr, startEvent, stopEvent, flags)); } -// conversion routines between float and half precision -static inline std::uint32_t f32_as_u32(float f) { union { float f; std::uint32_t u; } v; v.f = f; return v.u; } -static inline float u32_as_f32(std::uint32_t u) { union { float f; std::uint32_t u; } v; v.u = u; return v.f; } -static inline int clamp_int(int i, int l, int h) { return std::min(std::max(i, l), h); } - -// half float, the f16 is in the low 16 bits of the input argument -static inline float __convert_half_to_float(std::uint32_t a) noexcept { - std::uint32_t u = ((a << 13) + 0x70000000U) & 0x8fffe000U; - std::uint32_t v = f32_as_u32(u32_as_f32(u) * u32_as_f32(0x77800000U)/*0x1.0p+112f*/) + 0x38000000U; - u = (a & 0x7fff) != 0 ? v : u; - return u32_as_f32(u) * u32_as_f32(0x07800000U)/*0x1.0p-112f*/; -} - -// float half with nearest even rounding -// The lower 16 bits of the result is the bit pattern for the f16 -static inline std::uint32_t __convert_float_to_half(float a) noexcept { - std::uint32_t u = f32_as_u32(a); - int e = static_cast((u >> 23) & 0xff) - 127 + 15; - std::uint32_t m = ((u >> 11) & 0xffe) | ((u & 0xfff) != 0); - std::uint32_t i = 0x7c00 | (m != 0 ? 0x0200 : 0); - std::uint32_t n = ((std::uint32_t)e << 12) | m; - std::uint32_t s = (u >> 16) & 0x8000; - int b = clamp_int(1-e, 0, 13); - std::uint32_t d = (0x1000 | m) >> b; - d |= (d << b) != (0x1000 | m); - std::uint32_t v = e < 1 ? d : n; - v = (v >> 2) + (((v & 0x7) == 3) | ((v & 0x7) > 5)); - v = e > 30 ? 0x7c00 : v; - v = e == 143 ? i : v; - return s | v; -} - -extern "C" float __gnu_h2f_ieee(unsigned short h){ - return __convert_half_to_float((std::uint32_t) h); -} - -extern "C" unsigned short __gnu_f2h_ieee(float f){ - return (unsigned short)__convert_float_to_half(f); -} - void PlatformState::init() { amd::ScopedLock lock(lock_); @@ -1074,4 +847,3 @@ void PlatformState::popExec(ihipExec_t& exec) { exec = std::move(execStack_.top()); execStack_.pop(); } - diff --git a/hipamd/rocclr/hip_stream.cpp b/hipamd/rocclr/hip_stream.cpp index 8d3d3e4340..379954ef5c 100755 --- a/hipamd/rocclr/hip_stream.cpp +++ b/hipamd/rocclr/hip_stream.cpp @@ -22,6 +22,9 @@ #include "hip_internal.hpp" #include "hip_event.hpp" #include "thread/monitor.hpp" +#include "hip_prof_api.h" + +extern api_callbacks_table_t callbacks_table; static amd::Monitor streamSetLock{"Guards global stream set"}; static std::unordered_set streamSet; @@ -50,7 +53,12 @@ Stream::Stream(hip::Device* dev, Priority p, // ================================================================================================ bool Stream::Create() { - cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE; + // Enable queue profiling if a profiler is attached which sets the callback_table flag + // or if we force it with env var. This would enable time stamp collection for every + // command submitted to the stream(queue). + cl_command_queue_properties properties = (callbacks_table.is_enabled() || + HIP_FORCE_QUEUE_PROFILING) ? + CL_QUEUE_PROFILING_ENABLE : 0; amd::CommandQueue::Priority p; switch (priority_) { case Priority::High: @@ -64,8 +72,9 @@ bool Stream::Create() { p = amd::CommandQueue::Priority::Normal; break; } - amd::HostQueue* queue = new amd::HostQueue(*device_->asContext(), *device_->devices()[0], properties, - amd::CommandQueue::RealTimeDisabled, p, cuMask_); + amd::HostQueue* queue = new amd::HostQueue(*device_->asContext(), *device_->devices()[0], + properties, amd::CommandQueue::RealTimeDisabled, + p, cuMask_); // Create a host queue bool result = (queue != nullptr) ? queue->create() : false; @@ -202,6 +211,10 @@ static hipError_t ihipStreamCreate(hipStream_t* stream, hipError_t hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags) { HIP_INIT_API(hipStreamCreateWithFlags, stream, flags); + if (stream == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + HIP_RETURN(ihipStreamCreate(stream, flags, hip::Stream::Priority::Normal), *stream); } @@ -209,6 +222,10 @@ hipError_t hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags) { hipError_t hipStreamCreate(hipStream_t *stream) { HIP_INIT_API(hipStreamCreate, stream); + if (stream == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + HIP_RETURN(ihipStreamCreate(stream, hipStreamDefault, hip::Stream::Priority::Normal), *stream); } @@ -216,6 +233,10 @@ hipError_t hipStreamCreate(hipStream_t *stream) { hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, int priority) { HIP_INIT_API(hipStreamCreateWithPriority, stream, flags, priority); + if (stream == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + hip::Stream::Priority streamPriority; if (priority <= hip::Stream::Priority::High) { streamPriority = hip::Stream::Priority::High; diff --git a/hipamd/rocclr/hip_texture.cpp b/hipamd/rocclr/hip_texture.cpp index 4980f87eaf..8217dfb6f8 100755 --- a/hipamd/rocclr/hip_texture.cpp +++ b/hipamd/rocclr/hip_texture.cpp @@ -269,8 +269,10 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject, case hipResourceTypePitch2D: { const cl_channel_order channelOrder = hip::getCLChannelOrder(hip::getNumChannels(pResDesc->res.pitch2D.desc), pTexDesc->sRGB); const cl_channel_type channelType = hip::getCLChannelType(hip::getArrayFormat(pResDesc->res.pitch2D.desc), pTexDesc->readMode); + const amd::Image::Format imageFormat({channelOrder, channelType}); const cl_mem_object_type imageType = hip::getCLMemObjectType(pResDesc->resType); - const size_t imageSizeInBytes = pResDesc->res.pitch2D.pitchInBytes * pResDesc->res.pitch2D.height; + const size_t imageSizeInBytes = pResDesc->res.pitch2D.width * imageFormat.getElementSize() + + pResDesc->res.pitch2D.pitchInBytes * (pResDesc->res.pitch2D.height - 1); amd::Memory* buffer = getMemoryObjectWithOffset(pResDesc->res.pitch2D.devPtr, imageSizeInBytes); image = ihipImageCreate(channelOrder, channelType, diff --git a/hipamd/tests/performance/perfDispatch/hipPerfDispatchSpeed.cpp b/hipamd/tests/performance/dispatch/hipPerfDispatchSpeed.cpp similarity index 98% rename from hipamd/tests/performance/perfDispatch/hipPerfDispatchSpeed.cpp rename to hipamd/tests/performance/dispatch/hipPerfDispatchSpeed.cpp index 84ba73c3aa..12999ebc33 100644 --- a/hipamd/tests/performance/perfDispatch/hipPerfDispatchSpeed.cpp +++ b/hipamd/tests/performance/dispatch/hipPerfDispatchSpeed.cpp @@ -7,7 +7,7 @@ #include "test_common.h" /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp timer.cpp EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ diff --git a/hipamd/tests/performance/perfDispatch/hipPerfBufferCopyRectSpeed.cpp b/hipamd/tests/performance/memory/hipPerfBufferCopyRectSpeed.cpp similarity index 98% rename from hipamd/tests/performance/perfDispatch/hipPerfBufferCopyRectSpeed.cpp rename to hipamd/tests/performance/memory/hipPerfBufferCopyRectSpeed.cpp index 5000904af9..3cb3243e80 100644 --- a/hipamd/tests/performance/perfDispatch/hipPerfBufferCopyRectSpeed.cpp +++ b/hipamd/tests/performance/memory/hipPerfBufferCopyRectSpeed.cpp @@ -7,7 +7,7 @@ #include "test_common.h" /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp timer.cpp EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ diff --git a/hipamd/tests/performance/perfDispatch/hipPerfBufferCopySpeed.cpp b/hipamd/tests/performance/memory/hipPerfBufferCopySpeed.cpp similarity index 98% rename from hipamd/tests/performance/perfDispatch/hipPerfBufferCopySpeed.cpp rename to hipamd/tests/performance/memory/hipPerfBufferCopySpeed.cpp index 6f284ae7fb..d9a2d443a2 100644 --- a/hipamd/tests/performance/perfDispatch/hipPerfBufferCopySpeed.cpp +++ b/hipamd/tests/performance/memory/hipPerfBufferCopySpeed.cpp @@ -7,7 +7,7 @@ #include "test_common.h" /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp timer.cpp EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ diff --git a/hipamd/tests/performance/memory/hipPerfDevMemReadSpeed.cpp b/hipamd/tests/performance/memory/hipPerfDevMemReadSpeed.cpp new file mode 100644 index 0000000000..181cd37f24 --- /dev/null +++ b/hipamd/tests/performance/memory/hipPerfDevMemReadSpeed.cpp @@ -0,0 +1,136 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * TEST: %t + * HIT_END + */ + +#include +#include +#include "test_common.h" + +using namespace std; + +#define arraySize 16 + +typedef struct d_uint16 { + uint data[arraySize]; +} d_uint16; + +__global__ void read_kernel(d_uint16 *src, ulong N, uint *dst) { + + size_t idx = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x ; + + uint tmp = 0; + for (size_t i = idx; i < N; i += stride) { + for (size_t j = 0; j < arraySize; j++) { + tmp += src[i].data[j]; + } + } + + atomicAdd(dst, tmp); +} + +int main(int argc, char* argv[]) { + d_uint16 *dSrc; + d_uint16 *hSrc; + uint *dDst; + uint *hDst; + hipStream_t stream; + ulong N = 4 * 1024 * 1024; + uint nBytes = N * sizeof(d_uint16); + + int nGpu = 0; + HIPCHECK(hipGetDeviceCount(&nGpu)); + if (nGpu < 1) { + cout << "info: didn't find any GPU! skipping the test!\n"; + passed(); + return 0; + } + + static int device = 0; + HIPCHECK(hipSetDevice(device)); + hipDeviceProp_t props; + HIPCHECK(hipGetDeviceProperties(&props, device)); + cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name << + " with " << props.multiProcessorCount << " CUs" << endl; + + const unsigned threadsPerBlock = 64; + const unsigned blocks = props.multiProcessorCount * 4; + + uint inputData = 0x1; + int nIter = 1000; + + hSrc = new d_uint16[nBytes]; + HIPCHECK(hSrc == 0 ? hipErrorOutOfMemory : hipSuccess); + hDst = new uint; + hDst[0] = 0; + HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess); + for (size_t i = 0; i < N; i++) { + for (int j = 0; j < arraySize; j++) { + hSrc[i].data[j] = inputData; + } + } + + HIPCHECK(hipMalloc(&dSrc, nBytes)); + HIPCHECK(hipMalloc(&dDst, sizeof(uint))); + + HIPCHECK(hipStreamCreate(&stream)); + + HIPCHECK(hipMemcpy(dSrc, hSrc, nBytes, hipMemcpyHostToDevice)); + HIPCHECK(hipMemcpy(dDst, hDst, sizeof(uint), hipMemcpyHostToDevice)); + + hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst); + HIPCHECK(hipMemcpy(hDst, dDst, sizeof(uint), hipMemcpyDeviceToHost)); + hipDeviceSynchronize(); + + if (hDst[0] != (nBytes / sizeof(uint))) { + cout << "info: Data validation failed for warm up run!" << endl; + cout << "info: expected " << nBytes / sizeof(uint) << " got " << hDst[0] << endl; + HIPCHECK(hipErrorUnknown); + } + + // measure performance based on host time + auto all_start = chrono::steady_clock::now(); + + for(int i = 0; i < nIter; i++) { + hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst); + } + hipDeviceSynchronize(); + + auto all_end = chrono::steady_clock::now(); + chrono::duration all_kernel_time = all_end - all_start; + + // read speed in GB/s + double perf = ((double)nBytes * nIter * (double)(1e-09)) / all_kernel_time.count(); + + cout << "info: average read speed of " << perf << " GB/s " << "achieved for memory size of " << + nBytes / (1024 * 1024) << " MB" << endl; + + delete [] hSrc; + delete hDst; + hipFree(dSrc); + hipFree(dDst); + HIPCHECK(hipStreamDestroy(stream)); + + passed(); +} diff --git a/hipamd/tests/performance/memory/hipPerfDevMemWriteSpeed.cpp b/hipamd/tests/performance/memory/hipPerfDevMemWriteSpeed.cpp new file mode 100644 index 0000000000..4d706cdde9 --- /dev/null +++ b/hipamd/tests/performance/memory/hipPerfDevMemWriteSpeed.cpp @@ -0,0 +1,126 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * TEST: %t + * HIT_END + */ + +#include +#include +#include "test_common.h" + +using namespace std; + +#define arraySize 16 + +typedef struct d_uint16 { + uint data[arraySize]; +} d_uint16; + +__global__ void write_kernel(d_uint16 *dst, ulong N, d_uint16 pval) { + size_t idx = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + for (size_t i = idx; i < N; i += stride) { + dst[i] = pval; + } +}; + +int main(int argc, char* argv[]) { + d_uint16 *dDst; + d_uint16 *hDst; + hipStream_t stream; + ulong N = 4 * 1024 * 1024; + uint nBytes = N * sizeof(d_uint16); + d_uint16 pval; + + for (int i = 0; i < arraySize; i++) { + pval.data[i] = 0xabababab; + } + + int nGpu = 0; + HIPCHECK(hipGetDeviceCount(&nGpu)); + if (nGpu < 1) { + cout << "info: didn't find any GPU! skipping the test!\n"; + passed(); + return 0; + } + + static int device = 0; + HIPCHECK(hipSetDevice(device)); + hipDeviceProp_t props; + HIPCHECK(hipGetDeviceProperties(&props, device)); + cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name << + " with " << props.multiProcessorCount << " CUs" << endl; + + size_t threadsPerBlock = 64; + size_t blocks = props.multiProcessorCount * 4; + + uint inputData = 0xabababab; + int nIter = 1000; + + hDst = new d_uint16[nBytes]; + HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess); + for (size_t i = 0; i < N; i++) { + for (size_t j = 0; j < arraySize; j++) { + hDst[i].data[j] = 0; + } + } + + HIPCHECK(hipMalloc(&dDst, nBytes)); + + HIPCHECK(hipStreamCreate(&stream)); + + hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval); + HIPCHECK(hipMemcpy(hDst, dDst, nBytes , hipMemcpyDeviceToHost)); + hipDeviceSynchronize(); + + for (uint i = 0; i < N; i++) { + for (uint j = 0; j < arraySize; j++) { + if (hDst[i].data[j] != inputData) { + cout << "info: Data validation failed for warm up run! " << endl; + cout << "at index i: " << i << " element j: " << j << endl; + cout << hex << "expected 0x" << inputData << " but got 0x" << hDst[i].data[j] << endl; + HIPCHECK(hipErrorUnknown); + } + } + } + + auto all_start = chrono::steady_clock::now(); + for(int i = 0; i < nIter; i++) { + hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval); + } + hipDeviceSynchronize(); + auto all_end = chrono::steady_clock::now(); + chrono::duration all_kernel_time = all_end - all_start; + + // read speed in GB/s + double perf = ((double)nBytes * nIter * (double)(1e-09)) / all_kernel_time.count(); + + cout << "info: average write speed of " << perf << " GB/s " << "achieved for memory size of " << + nBytes / (1024 * 1024) << " MB" << endl; + + + delete [] hDst; + hipFree(dDst); + HIPCHECK(hipStreamDestroy(stream)); + + passed(); +} diff --git a/hipamd/tests/performance/memory/hipHostNumaAlloc.cpp b/hipamd/tests/performance/memory/hipPerfHostNumaAlloc.cpp similarity index 94% rename from hipamd/tests/performance/memory/hipHostNumaAlloc.cpp rename to hipamd/tests/performance/memory/hipPerfHostNumaAlloc.cpp index 75d6edf0cf..a5e60c8549 100644 --- a/hipamd/tests/performance/memory/hipHostNumaAlloc.cpp +++ b/hipamd/tests/performance/memory/hipPerfHostNumaAlloc.cpp @@ -34,12 +34,13 @@ THE SOFTWARE. #include #include "hip/hip_runtime.h" /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * BUILD_CMD: hipPerfHostNumaAlloc %hc -I%S/../../src %S/%s %S/../../src/test_common.cpp -lnuma -o %T/%t EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ -// To run it correctly, we must not export HIP_VISIBLE_DEVICES +// To run it correctly, we must not export HIP_VISIBLE_DEVICES. +// And we must explicitly link libnuma because of numa api move_pages(). #define NUM_PAGES 4 char *h = nullptr; char *d_h = nullptr; @@ -127,6 +128,7 @@ bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) { printf("\n"); HIPCHECK(hipHostFree((void* )h)); + hipHostUnregister(m); free(m); if (cpuId >= 0 && (numaMode == MPOL_BIND || numaMode == MPOL_PREFERRED)) { @@ -149,8 +151,7 @@ bool runTest(const int &cpuCount, const int &gpuCount, for (int i = 0; i < cpuCount; i++) { for (int j = 0; j < gpuCount; j++) { - if (!test(i, j, mode[m], - hipHostMallocDefault | hipHostMallocNumaUser)) { + if (!test(i, j, mode[m], hostMallocflags)) { return false; } } diff --git a/hipamd/tests/performance/memory/hipPerfSharedMemReadSpeed.cpp b/hipamd/tests/performance/memory/hipPerfSharedMemReadSpeed.cpp new file mode 100644 index 0000000000..86adc5f354 --- /dev/null +++ b/hipamd/tests/performance/memory/hipPerfSharedMemReadSpeed.cpp @@ -0,0 +1,250 @@ +/* + Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + */ + +/* HIT_START + * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * TEST: %t + * HIT_END + */ + +#include +#include +#include "test_common.h" + +using namespace std; + +#define sharedMemSize1 2048 +#define sharedMemSize2 256 + +__global__ void sharedMemReadSpeed1(float *outBuf, ulong N) { + + size_t gid = (blockIdx.x * blockDim.x + threadIdx.x); + size_t lid = threadIdx.x; + __shared__ float local[sharedMemSize1]; + + float val1 = 0; + float val2 = 0; + float val3 = 0; + float val4 = 0; + + for (int i = 0; i < (sharedMemSize1 / 64); i++) { + local[lid + i * 64] = lid; + } + + __syncthreads(); + + val1 += local[lid]; + val2 += local[lid + 64]; + val3 += local[lid + 128]; + val4 += local[lid + 192]; + val1 += local[lid + 256]; + val2 += local[lid + 320]; + val3 += local[lid + 384]; + val4 += local[lid + 448]; + val1 += local[lid + 512]; + val2 += local[lid + 576]; + val3 += local[lid + 640]; + val4 += local[lid + 704]; + val1 += local[lid + 768]; + val2 += local[lid + 832]; + val3 += local[lid + 896]; + val4 += local[lid + 960]; + val1 += local[lid + 1024]; + val2 += local[lid + 1088]; + val3 += local[lid + 1152]; + val4 += local[lid + 1216]; + val1 += local[lid + 1280]; + val2 += local[lid + 1344]; + val3 += local[lid + 1408]; + val4 += local[lid + 1472]; + val1 += local[lid + 1536]; + val2 += local[lid + 1600]; + val3 += local[lid + 1664]; + val4 += local[lid + 1728]; + val1 += local[lid + 1792]; + val2 += local[lid + 1856]; + val3 += local[lid + 1920]; + val4 += local[lid + 1984]; + + if (gid < N) { + outBuf[gid] = val1 + val2 + val3 + val4; + } +}; + +__global__ void sharedMemReadSpeed2(float *outBuf, ulong N) { + size_t gid = (blockIdx.x * blockDim.x + threadIdx.x); + size_t lid = threadIdx.x; + __shared__ float local[sharedMemSize2]; + + float val0 = 0.0f; + float val1 = 0.0f; + + for (int i = 0; i < (sharedMemSize2 / 64); i++) { + local[lid + i * 64] = lid; + } + + __syncthreads(); + +#pragma nounroll + for (uint i = 0; i < 32; i++) { + val0 += local[8 * i + 0]; + val1 += local[8 * i + 1]; + val0 += local[8 * i + 2]; + val1 += local[8 * i + 3]; + val0 += local[8 * i + 4]; + val1 += local[8 * i + 5]; + val0 += local[8 * i + 6]; + val1 += local[8 * i + 7]; + } + + if (gid < N) { + outBuf[gid] = val0 + val1; + } +}; + +int main(int argc, char *argv[]) { + float *dDst; + float *hDst; + hipStream_t stream; + constexpr uint numSizes = 4; + constexpr uint Sizes[numSizes] = {262144, 1048576, 4194304, 16777216}; + uint numReads1 = 32; + uint numReads2 = 256; + uint sharedMemSizeBytes1 = sharedMemSize1 * sizeof(float); + uint sharedMemSizeBytes2 = sharedMemSize2 * sizeof(float); + int nIter = 1000; + const unsigned threadsPerBlock = 64; + + int nGpu = 0; + HIPCHECK(hipGetDeviceCount(&nGpu)); + if (nGpu < 1) { + cout << "info: didn't find any GPU! skipping the test!\n"; + passed(); + return 0; + } + + static int device = 0; + HIPCHECK(hipSetDevice(device)); + hipDeviceProp_t props; + HIPCHECK(hipGetDeviceProperties(&props, device)); + cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name + << " with " << props.multiProcessorCount << " CUs" << endl; + + HIPCHECK(hipStreamCreate(&stream)); + + for (int nTest = 0; nTest < numSizes; nTest++) { + uint nBytes = Sizes[nTest % numSizes]; + ulong N = nBytes / sizeof(float); + const unsigned blocks = N / threadsPerBlock; + + hDst = new float[nBytes]; + HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess); + memset(hDst, 0, nBytes); + + HIPCHECK(hipMalloc(&dDst, nBytes)); + HIPCHECK(hipMemcpy(dDst, hDst, nBytes, hipMemcpyHostToDevice)); + + hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks), dim3(threadsPerBlock), + 0, stream, dDst, N); + HIPCHECK(hipMemcpy(hDst, dDst, nBytes, hipMemcpyDeviceToHost)); + hipDeviceSynchronize(); + + int tmp = 0; + for (int i = 0; i < N; i++) { + if (i % threadsPerBlock == 0) { + tmp = 0; + } + if (hDst[i] != tmp) { + cout << "info: Data validation failed for warm up run!" << endl; + cout << "info: expected " << tmp << " got " << hDst[i] << endl; + HIPCHECK (hipErrorUnknown); + } + tmp += threadsPerBlock / 2; + } + + auto all_start = chrono::steady_clock::now(); + for (int i = 0; i < nIter; i++) { + hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks), + dim3(threadsPerBlock), 0, stream, dDst, N); + } + hipDeviceSynchronize(); + + auto all_end = chrono::steady_clock::now(); + chrono::duration all_kernel_time = all_end - all_start; + + // read speed in GB/s + double perf = ((double) blocks * threadsPerBlock + * (numReads1 * sizeof(float) + sharedMemSizeBytes1 / 64) * nIter + * (double) (1e-09)) / all_kernel_time.count(); + + cout << "info: read speed = " << setw(8) << perf << " GB/s for " + << sharedMemSizeBytes1 / 1024 << " KB shared memory" + " with " << setw(8) << blocks * threadsPerBlock << " threads, " + << setw(4) << numReads1 << " reads in sharedMemReadSpeed1 kernel" << endl; + + delete[] hDst; + hipFree(dDst); + } + + + for (int nTest = 0; nTest < numSizes; nTest++) { + uint nBytes = Sizes[nTest % numSizes]; + ulong N = nBytes / sizeof(float); + const unsigned blocks = N / threadsPerBlock; + + hDst = new float[nBytes]; + HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess); + memset(hDst, 0, nBytes); + + HIPCHECK(hipMalloc(&dDst, nBytes)); + HIPCHECK(hipMemcpy(dDst, hDst, nBytes, hipMemcpyHostToDevice)); + + hipLaunchKernelGGL(sharedMemReadSpeed2, dim3(blocks), dim3(threadsPerBlock), + 0, stream, dDst, N); + HIPCHECK(hipMemcpy(hDst, dDst, nBytes, hipMemcpyDeviceToHost)); + hipDeviceSynchronize(); + + auto all_start = chrono::steady_clock::now(); + for (int i = 0; i < nIter; i++) { + hipLaunchKernelGGL(sharedMemReadSpeed2, dim3(blocks), + dim3(threadsPerBlock), 0, stream, dDst, N); + } + hipDeviceSynchronize(); + + auto all_end = chrono::steady_clock::now(); + chrono::duration all_kernel_time = all_end - all_start; + + // read speed in GB/s + double perf = ((double) blocks * threadsPerBlock + * (numReads2 * sizeof(float) + sharedMemSizeBytes2 / 64) * nIter + * (double) (1e-09)) / all_kernel_time.count(); + + cout << "info: read speed = " << setw(8) << perf << " GB/s for " + << sharedMemSizeBytes2 / 1024 << " KB shared memory" + " with " << setw(8) << blocks * threadsPerBlock << " threads, " + << setw(4) << numReads2 << " reads in sharedMemReadSpeed2 kernel" << endl; + + delete[] hDst; + hipFree(dDst); + } + + HIPCHECK(hipStreamDestroy(stream)); + + passed(); +} diff --git a/hipamd/tests/src/cg/hipCGThreadBlockType.cpp b/hipamd/tests/src/cg/hipCGThreadBlockType.cpp index ab9492c609..14c2e3ce2a 100644 --- a/hipamd/tests/src/cg/hipCGThreadBlockType.cpp +++ b/hipamd/tests/src/cg/hipCGThreadBlockType.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../test_common.cpp + * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ diff --git a/hipamd/tests/src/deviceLib/hipHalf2Comparision.cpp b/hipamd/tests/src/deviceLib/hipHalf2Comparision.cpp index e9e6c843cf..c9b100ccae 100644 --- a/hipamd/tests/src/deviceLib/hipHalf2Comparision.cpp +++ b/hipamd/tests/src/deviceLib/hipHalf2Comparision.cpp @@ -19,12 +19,13 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../test_common.cpp NVCC_OPTIONS -std=c++11 + * BUILD: %t %s ../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ #include "test_common.h" +#include #include "hip/hip_fp16.h" #define test_passed(test_name) \ diff --git a/hipamd/tests/src/g++/hipMalloc.cpp b/hipamd/tests/src/g++/hipMalloc.cpp index 3aab48aad5..d527db3570 100644 --- a/hipamd/tests/src/g++/hipMalloc.cpp +++ b/hipamd/tests/src/g++/hipMalloc.cpp @@ -18,7 +18,7 @@ * */ /* HIT_START - * BUILD_CMD: hipMalloc %cxx -D__HIP_PLATFORM_HCC__ -I%hip-path/include -I/opt/rocm/include %S/%s -Wl,--rpath=%hip-path/lib %hip-path/lib/libhip_hcc.so -o %T/%t -std=c++11 EXCLUDE_HIP_PLATFORM nvcc + * BUILD_CMD: hipMalloc %cxx -D__HIP_PLATFORM_HCC__ -I%hip-path/include -I/opt/rocm/include %S/%s -Wl,--rpath=%hip-path/lib %hip-path/lib/libamdhip64.so -o %T/%t -std=c++11 EXCLUDE_HIP_PLATFORM nvcc * TEST: %t EXCLUDE_HIP_PLATFORM nvcc * HIT_END */ diff --git a/hipamd/tests/src/gcc/LaunchKernel.c b/hipamd/tests/src/gcc/LaunchKernel.c index 1791d52d25..08aca3e2fe 100644 --- a/hipamd/tests/src/gcc/LaunchKernel.c +++ b/hipamd/tests/src/gcc/LaunchKernel.c @@ -21,7 +21,7 @@ /* HIT_START * BUILD_CMD: gpu.o %hc -I%hip-path/include -g -c %S/gpu.cpp -o %T/gpu.o EXCLUDE_HIP_PLATFORM nvcc rocclr * BUILD_CMD: launchkernel.o %hc -D__HIP_PLATFORM_HCC__ -g -I%hip-path/include -c %S/LaunchKernel.c -o %T/launchkernel.o EXCLUDE_HIP_PLATFORM nvcc rocclr - * BUILD_CMD: LaunchKernel %hc %T/launchkernel.o %T/gpu.o -g -Wl,--rpath=%hip-path/lib %hip-path/lib/libhip_hcc.so -o %T/%t DEPENDS gpu.o launchkernel.o EXCLUDE_HIP_PLATFORM nvcc rocclr + * BUILD_CMD: LaunchKernel %hc %T/launchkernel.o %T/gpu.o -g -Wl,--rpath=%hip-path/lib %hip-path/lib/libamdhip64.so -o %T/%t DEPENDS gpu.o launchkernel.o EXCLUDE_HIP_PLATFORM nvcc rocclr * TEST: %t EXCLUDE_HIP_PLATFORM nvcc rocclr * HIT_END */ diff --git a/hipamd/tests/src/gcc/hipMalloc.c b/hipamd/tests/src/gcc/hipMalloc.c index f54071f907..2e5deb2a16 100644 --- a/hipamd/tests/src/gcc/hipMalloc.c +++ b/hipamd/tests/src/gcc/hipMalloc.c @@ -19,7 +19,7 @@ /* HIT_START * BUILD_CMD: hipMalloc %cc -D__HIP_PLATFORM_NVCC__ -I%hip-path/include -I/usr/local/cuda/include %S/%s -o %T/hipMalloc_nv -L/usr/local/cuda/lib64 -lcudart EXCLUDE_HIP_PLATFORM hcc rocclr - * BUILD_CMD: hipMalloc %cc -D__HIP_PLATFORM_HCC__ -I%hip-path/include %S/%s -Wl,--rpath=%hip-path/lib %hip-path/lib/libhip_hcc.so -o %T/hipMalloc_hcc EXCLUDE_HIP_PLATFORM nvcc rocclr + * BUILD_CMD: hipMalloc %cc -D__HIP_PLATFORM_HCC__ -I%hip-path/include %S/%s -Wl,--rpath=%hip-path/lib %hip-path/lib/libamdhip64.so -o %T/hipMalloc_hcc EXCLUDE_HIP_PLATFORM nvcc rocclr * TEST: hipMalloc_nv EXCLUDE_HIP_PLATFORM hcc rocclr * TEST: hipMalloc_hcc EXCLUDE_HIP_PLATFORM nvcc rocclr * HIT_END diff --git a/hipamd/tests/src/ipc/hipSimpleIpc.cpp b/hipamd/tests/src/ipc/hipSimpleIpc.cpp index 074c06692b..4caea82aab 100755 --- a/hipamd/tests/src/ipc/hipSimpleIpc.cpp +++ b/hipamd/tests/src/ipc/hipSimpleIpc.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../test_common.cpp + * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ @@ -43,6 +43,7 @@ void single_process() { // Negative, Make sure we return error when an offset of original ptr is passed ipc_offset_dptr = ipc_dptr + (OFFSET * sizeof(int)); + // HIP API return value differs from CUDA's return type assert(hipErrorInvalidDevicePointer == hipIpcGetMemHandle(&ipc_offset_handle, ipc_offset_dptr)); // Get handle for the device_ptr diff --git a/hipamd/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp b/hipamd/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp index ae944ff122..874f8bc44c 100644 --- a/hipamd/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp +++ b/hipamd/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp @@ -26,7 +26,7 @@ /* HIT_START * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 * TEST_NAMED: %t hipDeviceGetPCIBusId-vs-hipDeviceGetAttribute --tests 0x1 - * TEST_NAMED: %t hipDeviceGetPCIBusId-vs-lspci --tests 0x2 + * TEST_NAMED: %t hipDeviceGetPCIBusId-vs-lspci --tests 0x2 EXCLUDE_HIP_PLATFORM nvcc * HIT_END */ @@ -106,8 +106,13 @@ bool compareHipDeviceGetPCIBusIdWithLspci() { getPciBusId(deviceCount, hipDeviceList); // Get lspci device list and compare with hip device list +#if defined(__CUDA_ARCH__) + char const *command = "lspci -D | grep controller | grep NVIDIA | " + "cut -d ' ' -f 1"; +#else char const *command = "lspci -D | grep controller | grep AMD/ATI | " "cut -d ' ' -f 1"; +#endif fpipe = popen(command, "r"); if (fpipe == nullptr) { diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpyWithStreamMultiThread.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpyWithStreamMultiThread.cpp index aaaf8332bc..b3d613c45f 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemcpyWithStreamMultiThread.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpyWithStreamMultiThread.cpp @@ -467,6 +467,7 @@ void HipMemcpyWithStreamMultiThreadtests::TestkindDefaultForDtoD(void) { &A_h[0], &B_h[0], &C_h[0], N, false); for (int i=1; i < numDevices; ++i) { + HIPCHECK(hipSetDevice(i)); HIPCHECK(hipMalloc(&A_d[i], Nbytes)); HIPCHECK(hipMalloc(&B_d[i], Nbytes)); HIPCHECK(hipMalloc(&C_d[i], Nbytes)); @@ -476,6 +477,7 @@ void HipMemcpyWithStreamMultiThreadtests::TestkindDefaultForDtoD(void) { hipStream_t stream[numDevices]; for (int i=0; i < numDevices; ++i) { + HIPCHECK(hipSetDevice(i)); HIPCHECK(hipStreamCreate(&stream[i])); } diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemset.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemset.cpp index f08b6c921f..9f5b9092f9 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemset.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemset.cpp @@ -166,11 +166,11 @@ bool testhipMemset2AsyncOps() { hipStream_t s; hipStreamCreate(&s); hipMemsetAsync(p2, 0, 32*32*4, s); - hipMemsetD32Async(p3, 0x3fe00000, 32*32, s ); + hipMemsetD32Async((hipDeviceptr_t)p3, 0x3fe00000, 32*32, s ); hipStreamSynchronize(s); for (int i = 0; i < 256; ++i) { hipMemsetAsync(p2, 0, 32*32*4, s); - hipMemsetD32Async(p3, 0x3fe00000, 32*32, s ); + hipMemsetD32Async((hipDeviceptr_t)p3, 0x3fe00000, 32*32, s ); } hipStreamSynchronize(s); hipDeviceSynchronize(); diff --git a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp index ce78590147..cc976ced42 100644 --- a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp +++ b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp @@ -60,8 +60,6 @@ void run(const std::vector& buffer, int deviceNo) { hipSetDevice(deviceNo); hipModule_t Module; hipFunction_t Function; - HIPCHECK(hipModuleLoadData(&Module, &buffer[0])); - HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); float *A, *B, *Ad, *Bd; A = new float[LEN]; @@ -78,6 +76,9 @@ void run(const std::vector& buffer, int deviceNo) { HIPCHECK(hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice)); + HIPCHECK(hipModuleLoadData(&Module, &buffer[0])); + HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); + hipStream_t stream; HIPCHECK(hipStreamCreate(&stream)); diff --git a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp index 6ae1b92ab3..840e9b6975 100644 --- a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp +++ b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -56,8 +56,6 @@ std::vector load_file() { void run(const std::vector& buffer) { hipModule_t Module; hipFunction_t Function; - HIPCHECK(hipModuleLoadData(&Module, &buffer[0])); - HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); float *A, *B, *Ad, *Bd; A = new float[LEN]; @@ -74,6 +72,9 @@ void run(const std::vector& buffer) { HIPCHECK(hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice)); + HIPCHECK(hipModuleLoadData(&Module, &buffer[0])); + HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); + hipStream_t stream; HIPCHECK(hipStreamCreate(&stream)); diff --git a/hipamd/tests/src/runtimeApi/module/tex2d_kernel.cpp b/hipamd/tests/src/runtimeApi/module/tex2d_kernel.cpp index 560f27e741..579714566d 100755 --- a/hipamd/tests/src/runtimeApi/module/tex2d_kernel.cpp +++ b/hipamd/tests/src/runtimeApi/module/tex2d_kernel.cpp @@ -21,7 +21,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD_CMD: tex2d_kernel.code %hc --genco %S/tex2d_kernel.cpp -o tex2d_kernel.code EXCLUDE_HIP_PLATFORM rocclr + * BUILD_CMD: tex2d_kernel.code %hc --genco %S/tex2d_kernel.cpp -o tex2d_kernel.code EXCLUDE_HIP_PLATFORM rocclr nvcc * HIT_END */ diff --git a/hipamd/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp b/hipamd/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp index 8c2fe2603b..d21ea5da54 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp @@ -81,6 +81,13 @@ static void HIPRT_CB Callback1(hipStream_t stream, hipError_t status, sleep(SECONDS_TO_WAIT); } +bool rangedCompare(long a, long b) { + auto diff = b - a; + if (diff < 0) diff *= -1; + if (diff < 500) return true; + return false; +} + int main(int argc, char* argv[]) { float *A_d, *C_d; @@ -139,7 +146,8 @@ int main(int argc, char* argv[]) { // completes the execution. Therefore the hipStreamSynchronize() in the // main thread should hardly take any time to complete. - if (duration.count() < SECONDS_TO_WAIT * TO_MICROSECONDS) { + if ((duration.count() < (SECONDS_TO_WAIT * TO_MICROSECONDS)) || + (rangedCompare(duration.count(), SECONDS_TO_WAIT * TO_MICROSECONDS))) { passed(); } else { failed("hipStreamSynchronize is waiting untill Callback() completes."); diff --git a/hipamd/tests/src/runtimeApi/stream/hipStreamCreateWithPriority.cpp b/hipamd/tests/src/runtimeApi/stream/hipStreamCreateWithPriority.cpp index 11cd8d95b5..e7e3261d64 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipStreamCreateWithPriority.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipStreamCreateWithPriority.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM all * TEST: %t * HIT_END */ diff --git a/hipamd/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp b/hipamd/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp index d1c3de08fe..8da2c2f8a5 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp @@ -45,7 +45,7 @@ int main(int argc, char *argv[]) { // Check if priorities are indeed supported if ((priority_low + priority_high) != 0) { - failed("Priorities are not supported"); + passed(); // exit the test since priorities are not supported } // Checking Priority of default stream diff --git a/hipamd/tests/performance/perfDispatch/timer.cpp b/hipamd/tests/src/timer.cpp similarity index 100% rename from hipamd/tests/performance/perfDispatch/timer.cpp rename to hipamd/tests/src/timer.cpp diff --git a/hipamd/tests/performance/perfDispatch/timer.h b/hipamd/tests/src/timer.h similarity index 100% rename from hipamd/tests/performance/perfDispatch/timer.h rename to hipamd/tests/src/timer.h