diff --git a/projects/clr/hipamd/CMakeLists.txt b/projects/clr/hipamd/CMakeLists.txt index ce0eeb362d..1ba58496f4 100644 --- a/projects/clr/hipamd/CMakeLists.txt +++ b/projects/clr/hipamd/CMakeLists.txt @@ -189,7 +189,7 @@ if(HIP_PLATFORM STREQUAL "hcc") execute_process(COMMAND ${HCC_HOME}/bin/hcc-config --ldflags OUTPUT_VARIABLE HCC_LD_FLAGS) set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${HCC_LD_FLAGS} -Wl,-Bsymbolic") - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --amdgpu-target=gfx701 --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --amdgpu-target=gfx701 --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803 --amdgpu-target=gfx900") add_library(hip_hcc SHARED ${SOURCE_FILES_RUNTIME}) target_link_libraries(hip_hcc PRIVATE hc_am) add_library(hip_hcc_static STATIC ${SOURCE_FILES_RUNTIME}) diff --git a/projects/clr/hipamd/bin/hipcc b/projects/clr/hipamd/bin/hipcc index 7e15d6b2e6..d2822fd0da 100755 --- a/projects/clr/hipamd/bin/hipcc +++ b/projects/clr/hipamd/bin/hipcc @@ -23,8 +23,8 @@ use File::Basename; # HSA_PATH : Path to HSA dir (default /opt/rocm/hsa). Used on AMD platforms only. if(scalar @ARGV == 0){ -print "No Arguments passed, exiting ...\n"; -exit(-1); + print "No Arguments passed, exiting ...\n"; + exit(-1); } #--- @@ -74,6 +74,7 @@ $target_gfx701 = 0; $target_gfx801 = 0; $target_gfx802 = 0; $target_gfx803 = 0; +$target_gfx900 = 0; if ($HIP_PLATFORM eq "hcc") { $HSA_PATH=$ENV{'HSA_PATH'} // "/opt/rocm/hsa"; @@ -189,18 +190,18 @@ if ($verbose & 0x4) { # Handle code object generation my $ISACMD=""; if($HIP_PLATFORM eq "hcc"){ - $ISACMD .= "set ROCM_PATH=$ROCM_PATH && set ROCM_TARGET=$ROCM_TARGET && $HIP_PATH/bin/hccgenco.sh "; - if($ARGV[0] eq "--genco"){ - foreach $isaarg (@ARGV[1..$#ARGV]){ - $ISACMD .= " "; - $ISACMD .= $isaarg; + $ISACMD .= "set ROCM_PATH=$ROCM_PATH && set ROCM_TARGET=$ROCM_TARGET && $HIP_PATH/bin/hccgenco.sh "; + if($ARGV[0] eq "--genco"){ + foreach $isaarg (@ARGV[1..$#ARGV]){ + $ISACMD .= " "; + $ISACMD .= $isaarg; + } + if ($verbose & 0x1) { + print "hipcc-cmd: ", $ISACMD, "\n"; + } + system($ISACMD) and die(); + exit(0); } - if ($verbose & 0x1) { - print "hipcc-cmd: ", $ISACMD, "\n"; - } - system($ISACMD) and die(); - exit(0); - } } if(($HIP_PLATFORM eq "hcc")){ @@ -210,18 +211,18 @@ if(($HIP_PLATFORM eq "hcc")){ } if($HIP_PLATFORM eq "nvcc"){ - $ISACMD .= "$HIP_PATH/bin/hipcc -ptx "; - if($ARGV[0] eq "--genco"){ - foreach $isaarg (@ARGV[1..$#ARGV]){ - $ISACMD .= " "; - $ISACMD .= $isaarg; + $ISACMD .= "$HIP_PATH/bin/hipcc -ptx "; + if($ARGV[0] eq "--genco"){ + foreach $isaarg (@ARGV[1..$#ARGV]){ + $ISACMD .= " "; + $ISACMD .= $isaarg; + } + if ($verbose & 0x1) { + print "hipcc-cmd: ", $ISACMD, "\n"; + } + system($ISACMD) and die(); + exit(0); } - if ($verbose & 0x1) { - print "hipcc-cmd: ", $ISACMD, "\n"; - } - system($ISACMD) and die(); - exit(0); - } } my $toolArgs = ""; # arguments to pass to the hcc or nvcc tool @@ -247,20 +248,25 @@ foreach $arg (@ARGV) } if($arg eq '--amdgpu-target=gfx701') { - $target_gfx701 = 1; + $target_gfx701 = 1; } if($arg eq '--amdgpu-target=gfx801') { - $target_gfx801 = 1; + $target_gfx801 = 1; } if($arg eq '--amdgpu-target=gfx802') { - $target_gfx802 = 1; + $target_gfx802 = 1; } if($arg eq '--amdgpu-target=gfx803') { - $target_gfx803 = 1; + $target_gfx803 = 1; } + if($arg eq '--amdgpu-target=gfx900') + { + $target_gfx900 = 1; + } + if(($trimarg eq '-stdlib=libstdc++') and ($setStdLib eq 0)) { $HIPCXXFLAGS .= $HCC_WA_FLAGS; @@ -320,6 +326,33 @@ foreach $arg (@ARGV) } $toolArgs .= " $arg" unless $swallowArg; } +foreach my $target (split(/,/, $ENV{HCC_AMDGPU_TARGET})) +{ + if($target eq 'gfx701') + { + $target_gfx701 = 1; + } + if($target eq 'gfx801') + { + $target_gfx801 = 1; + } + if($target eq 'gfx802') + { + $target_gfx802 = 1; + } + if($target eq 'gfx803') + { + $target_gfx803 = 1; + } + if($target eq 'gfx900') + { + $target_gfx900 = 1; + } +} +if ($target_gfx701 eq 0 and $target_gfx801 eq 0 and $target_gfx802 eq 0 and $target_gfx803 eq 0 and $target_gfx900 eq 0) +{ + $target_gfx803 = 1; +} if($HIP_PLATFORM eq "hcc"){ @@ -343,12 +376,10 @@ if($HIP_PLATFORM eq "hcc"){ $HIPCXXFLAGS .= " -D__HIP_ARCH_GFX803__=1 "; $ENV{HCC_EXTRA_LIBRARIES_GFX803}="$HIP_PATH/lib/hip_hc_gfx803.ll\n"; } - if ($target_gfx701 eq 0 and $target_gfx801 eq 0 and $target_gfx802 eq 0 and $target_gfx803 eq 0) - { - $HIPLDFLAGS .= " --amdgpu-target=gfx701 --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803"; - $ENV{HCC_EXTRA_LIBRARIES_GFX803}="$HIP_PATH/lib/hip_hc_gfx803.ll\n"; + if ($target_gfx900 eq 1) { + $HIPLDFLAGS .= " --amdgpu-target=gfx900"; + $HIPCXXFLAGS .= " -D__HIP_ARCH_GFX900__=1 "; } - } if ($hasC and $HIP_PLATFORM eq 'nvcc') { @@ -407,3 +438,5 @@ if ($runCmd) { } system ("$CMD") and die (); } + +# vim: ts=4:sw=4:expandtab:smartindent diff --git a/projects/clr/hipamd/docs/markdown/hip_porting_guide.md b/projects/clr/hipamd/docs/markdown/hip_porting_guide.md index 0acdc246f9..9f20d12423 100644 --- a/projects/clr/hipamd/docs/markdown/hip_porting_guide.md +++ b/projects/clr/hipamd/docs/markdown/hip_porting_guide.md @@ -166,10 +166,10 @@ Both nvcc and hcc make two passes over the code: one for host code and one for d ``` // #ifdef __CUDA_ARCH__ -#ifdef __HIP_DEVICE_COMPILE__ && (__HIP_DEVICE_COMPILE__ == 1) +#if __HIP_DEVICE_COMPILE__ ``` -Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 0 or 1, and it doesn’t represent the feature capability of the target device. +Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 1 or undefined, and it doesn’t represent the feature capability of the target device. ### Compiler Defines: Summary @@ -178,7 +178,7 @@ Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 0 or 1, and it doe |HIP-related defines:| |`__HIP_PLATFORM_HCC___`| Defined | Undefined | Defined if targeting hcc platform; undefined otherwise | |`__HIP_PLATFORM_NVCC___`| Undefined | Defined | Defined if targeting nvcc platform; undefined otherwise | -|`__HIP_DEVICE_COMPILE__` | 1 if compiling for device; 0 if compiling for host |1 if compiling for device; 0 if compiling for host | Undefined +|`__HIP_DEVICE_COMPILE__` | 1 if compiling for device; undefined if compiling for host |1 if compiling for device; undefined if compiling for host | Undefined |`__HIPCC__` | Defined | Defined | Undefined |`__HIP_ARCH_*` | 0 or 1 depending on feature support (see below) | 0 or 1 depending on feature support (see below) | 0 |nvcc-related defines:| diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_complex.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_complex.h index d4fea7f034..f50a601b90 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_complex.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_complex.h @@ -28,7 +28,7 @@ THE SOFTWARE. #if __cplusplus #define COMPLEX_ADD_OP_OVERLOAD(type) \ -__device__ __host__ static type operator + (const type& lhs, const type& rhs) { \ +__device__ __host__ static inline type operator + (const type& lhs, const type& rhs) { \ type ret; \ ret.x = lhs.x + rhs.x ; \ ret.y = lhs.y + rhs.y ; \ @@ -36,7 +36,7 @@ __device__ __host__ static type operator + (const type& lhs, const type& rhs) { } #define COMPLEX_SUB_OP_OVERLOAD(type) \ -__device__ __host__ static type operator - (const type& lhs, const type& rhs) { \ +__device__ __host__ static inline type operator - (const type& lhs, const type& rhs) { \ type ret; \ ret.x = lhs.x - rhs.x; \ ret.y = lhs.y - rhs.y; \ @@ -44,7 +44,7 @@ __device__ __host__ static type operator - (const type& lhs, const type& rhs) { } #define COMPLEX_MUL_OP_OVERLOAD(type) \ -__device__ __host__ static type operator * (const type& lhs, const type& rhs) { \ +__device__ __host__ static inline type operator * (const type& lhs, const type& rhs) { \ type ret; \ ret.x = lhs.x * rhs.x - lhs.y * rhs.y; \ ret.y = lhs.x * rhs.y + lhs.y * rhs.x; \ @@ -52,7 +52,7 @@ __device__ __host__ static type operator * (const type& lhs, const type& rhs) { } #define COMPLEX_DIV_OP_OVERLOAD(type) \ -__device__ __host__ static type operator / (const type& lhs, const type& rhs) { \ +__device__ __host__ static inline type operator / (const type& lhs, const type& rhs) { \ type ret; \ ret.x = (lhs.x * rhs.x + lhs.y * rhs.y); \ ret.y = (rhs.x * lhs.y - lhs.x * rhs.y); \ @@ -88,7 +88,7 @@ __device__ __host__ static inline type& operator /= (type& lhs, const type& rhs) } #define COMPLEX_SCALAR_PRODUCT(type, type1) \ -__device__ __host__ static type operator * (const type& lhs, type1 rhs) { \ +__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \ type ret; \ ret.x = lhs.x * rhs; \ ret.y = lhs.y * rhs; \ diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h index d256b3f32f..eb0f7bf61a 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h @@ -226,6 +226,8 @@ __device__ int __all( int input); __device__ int __any( int input); __device__ unsigned long long int __ballot( int input); +#if __HIP_ARCH_GFX701__ == 0 + // warp shuffle functions #ifdef __cplusplus __device__ int __shfl(int input, int lane, int width=warpSize); @@ -247,6 +249,18 @@ __device__ float __shfl_down(float input, unsigned int lane_delta, int width); __device__ float __shfl_xor(float input, int lane_mask, int width); #endif +__device__ unsigned __hip_ds_bpermute(int index, unsigned src); +__device__ float __hip_ds_bpermutef(int index, float src); +__device__ unsigned __hip_ds_permute(int index, unsigned src); +__device__ float __hip_ds_permutef(int index, float src); + +__device__ unsigned __hip_ds_swizzle(unsigned int src, int pattern); +__device__ float __hip_ds_swizzlef(float src, int pattern); + +__device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl); + +#endif + __host__ __device__ int min(int arg1, int arg2); __host__ __device__ int max(int arg1, int arg2); @@ -321,16 +335,6 @@ __device__ static inline void __threadfence(void) { //__device__ void __threadfence_system(void) __attribute__((deprecated("Provided with workaround configuration, see hip_kernel_language.md for details"))); __device__ void __threadfence_system(void) ; -__device__ unsigned __hip_ds_bpermute(int index, unsigned src); -__device__ float __hip_ds_bpermutef(int index, float src); -__device__ unsigned __hip_ds_permute(int index, unsigned src); -__device__ float __hip_ds_permutef(int index, float src); - -__device__ unsigned __hip_ds_swizzle(unsigned int src, int pattern); -__device__ float __hip_ds_swizzlef(float src, int pattern); - -__device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl); - // doxygen end Fence Fence /** * @} diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h index f156d3fdbd..7f85aad28d 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -62,7 +62,12 @@ typedef struct ihipStream_t *hipStream_t; #define hipIpcMemLazyEnablePeerAccess 0 -typedef struct ihipIpcMemHandle_t *hipIpcMemHandle_t; +#define HIP_IPC_HANDLE_SIZE 64 + +typedef struct hipIpcMemHandle_st +{ + char reserved[HIP_IPC_HANDLE_SIZE]; +}hipIpcMemHandle_t; //TODO: IPC event handle currently unsupported struct ihipIpcEventHandle_t; @@ -853,6 +858,8 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) * @param[out] ptr Pointer to the allocated memory * @param[in] size Requested memory size * + * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. + * * @return #hipSuccess * * @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray, hipHostFree, hipHostMalloc @@ -865,6 +872,8 @@ hipError_t hipMalloc(void** ptr, size_t size) ; * @param[out] ptr Pointer to the allocated host pinned memory * @param[in] size Requested memory size * + * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. + * * @return #hipSuccess, #hipErrorMemoryAllocation * * @deprecated use hipHostMalloc() instead @@ -878,6 +887,8 @@ hipError_t hipMallocHost(void** ptr, size_t size) __attribute__((deprecated("use * @param[in] size Requested memory size * @param[in] flags Type of host memory allocation * + * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. + * * @return #hipSuccess, #hipErrorMemoryAllocation * * @see hipSetDeviceFlags, hipHostFree @@ -891,6 +902,8 @@ hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int flags) ; * @param[in] size Requested memory size * @param[in] flags Type of host memory allocation * + * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. + * * @return #hipSuccess, #hipErrorMemoryAllocation * * @deprecated use hipHostMalloc() instead @@ -975,6 +988,9 @@ hipError_t hipHostUnregister(void* hostPtr) ; * @param[out] pitch Pitch for allocation (in bytes) * @param[in] width Requested pitched allocation width (in bytes) * @param[in] height Requested pitched allocation height + * + * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. + * * @return Error code * * @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D, hipMalloc3DArray, hipHostMalloc @@ -1236,6 +1252,9 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t st hipError_t hipMemGetInfo (size_t * free, size_t * total) ; +hipError_t hipMemPtrGetInfo(void *ptr, size_t *size); + + /** * @brief Allocate an array on the device. * diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h index cd5a09215a..8e6ec49511 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h @@ -1260,7 +1260,7 @@ __device__ __host__ static inline type& operator op (type& val) { \ } #define DECLOP_1VAR_POSTOP(type, op) \ -__device__ __host__ static inline type operator op (type& val, int i) { \ +__device__ __host__ static inline type operator op (type& val, int) { \ type ret; \ ret.x = val.x; \ val.x op; \ @@ -1326,7 +1326,7 @@ __device__ __host__ static inline type& operator op (type& val) { \ } #define DECLOP_2VAR_POSTOP(type, op) \ -__device__ __host__ static inline type operator op (type& val, int i) { \ +__device__ __host__ static inline type operator op (type& val, int) { \ type ret; \ ret.x = val.x; \ ret.y = val.y; \ @@ -1337,7 +1337,7 @@ __device__ __host__ static inline type operator op (type& val, int i) { \ #define DECLOP_2VAR_COMP(type, op) \ __device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ - return lhs.x op rhs.x && lhs.y op rhs.y; \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ } #define DECLOP_2VAR_1IN_1OUT(type, op) \ @@ -1350,7 +1350,7 @@ __device__ __host__ static inline type operator op(type &rhs) { \ #define DECLOP_2VAR_1IN_BOOLOUT(type, op) \ __device__ __host__ static inline bool operator op (type &rhs) { \ - return op rhs.x && op rhs.y; \ + return (op rhs.x) && (op rhs.y); \ } @@ -1401,7 +1401,7 @@ __device__ __host__ static inline type& operator op (type& val) { \ } #define DECLOP_3VAR_POSTOP(type, op) \ -__device__ __host__ static inline type operator op (type& val, int i) { \ +__device__ __host__ static inline type operator op (type& val, int) { \ type ret; \ ret.x = val.x; \ ret.y = val.y; \ @@ -1414,7 +1414,7 @@ __device__ __host__ static inline type operator op (type& val, int i) { \ #define DECLOP_3VAR_COMP(type, op) \ __device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ - return lhs.x op rhs.x && lhs.y op rhs.y && lhs.z op rhs.z; \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ } #define DECLOP_3VAR_1IN_1OUT(type, op) \ @@ -1428,7 +1428,7 @@ __device__ __host__ static inline type operator op(type &rhs) { \ #define DECLOP_3VAR_1IN_BOOLOUT(type, op) \ __device__ __host__ static inline bool operator op (type &rhs) { \ - return op rhs.x && op rhs.y && op rhs.z; \ + return (op rhs.x) && (op rhs.y) && (op rhs.z); \ } @@ -1484,7 +1484,7 @@ __device__ __host__ static inline type& operator op (type& val) { \ } #define DECLOP_4VAR_POSTOP(type, op) \ -__device__ __host__ static inline type operator op (type& val, int i) { \ +__device__ __host__ static inline type operator op (type& val, int) { \ type ret; \ ret.x = val.x; \ ret.y = val.y; \ @@ -1499,7 +1499,7 @@ __device__ __host__ static inline type operator op (type& val, int i) { \ #define DECLOP_4VAR_COMP(type, op) \ __device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ - return lhs.x op rhs.x && lhs.y op rhs.y && lhs.z op rhs.z && lhs.w op rhs.w; \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ } #define DECLOP_4VAR_1IN_1OUT(type, op) \ @@ -1514,7 +1514,7 @@ __device__ __host__ static inline type operator op(type &rhs) { \ #define DECLOP_4VAR_1IN_BOOLOUT(type, op) \ __device__ __host__ static inline bool operator op (type &rhs) { \ - return op rhs.x && op rhs.y && op rhs.z && op rhs.w; \ + return (op rhs.x) && (op rhs.y) && (op rhs.z) && (op rhs.w); \ } diff --git a/projects/clr/hipamd/include/hip/hip_common.h b/projects/clr/hipamd/include/hip/hip_common.h index 6223a2fe9e..6317a792ee 100644 --- a/projects/clr/hipamd/include/hip/hip_common.h +++ b/projects/clr/hipamd/include/hip/hip_common.h @@ -27,13 +27,6 @@ THE SOFTWARE. // Other compiler (GCC,ICC,etc) need to set one of these macros explicitly #if defined(__HCC__) #define __HIP_PLATFORM_HCC__ - -#if defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0) -#define __HIP_DEVICE_COMPILE__ 1 -#else -#define __HIP_DEVICE_COMPILE__ 0 -#endif - #endif //__HCC__ // Auto enable __HIP_PLATFORM_NVCC__ if compiling with NVCC @@ -43,14 +36,12 @@ THE SOFTWARE. #define __HIPCC__ #endif -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ != 0) -#define __HIP_DEVICE_COMPILE__ 1 -#else -#define __HIP_DEVICE_COMPILE__ 0 -#endif - #endif //__NVCC__ +// Auto enable __HIP_DEVICE_COMPILE__ if compiled in HCC or NVCC device path +#if (defined(__HCC_ACCELERATOR__) && __HCC_ACCELERATOR__ != 0) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ != 0) + #define __HIP_DEVICE_COMPILE__ 1 +#endif #if __HIP_DEVICE_COMPILE__ == 0 // 32-bit Atomics diff --git a/projects/clr/hipamd/include/hip/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hip_runtime_api.h index 28d67fc01a..818c0b7c34 100644 --- a/projects/clr/hipamd/include/hip/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hip_runtime_api.h @@ -106,6 +106,7 @@ typedef struct hipDeviceProp_t { size_t maxSharedMemoryPerMultiProcessor; ///< Maximum Shared Memory Per Multiprocessor. int isMultiGpuBoard; ///< 1 if device is on a multi-GPU board, 0 if not. int canMapHostMemory; ///< Check whether HIP can map host memory + int gcnArch; ///< AMD GCN Arch Value. Eg: 803, 701 } hipDeviceProp_t; diff --git a/projects/clr/hipamd/samples/0_Intro/square/square.hipref.cpp b/projects/clr/hipamd/samples/0_Intro/square/square.hipref.cpp index 0073c1399a..e694bfb8a4 100644 --- a/projects/clr/hipamd/samples/0_Intro/square/square.hipref.cpp +++ b/projects/clr/hipamd/samples/0_Intro/square/square.hipref.cpp @@ -32,7 +32,7 @@ THE SOFTWARE. }\ } -/* +/* * Square each element in the array A and write to array C. */ template @@ -58,16 +58,18 @@ int main(int argc, char *argv[]) hipDeviceProp_t props; CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/)); printf ("info: running on device %s\n", props.name); - + #ifdef __HIP_PLATFORM_HCC__ + printf ("info: architecture on AMD GPU device is: %d\n",props.gcnArch); + #endif printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); A_h = (float*)malloc(Nbytes); CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); C_h = (float*)malloc(Nbytes); CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); // Fill with Phi + i - for (size_t i=0; iisMultiGpuBoard = 0 ? gpuAgentsCount < 2 : 1; // Get agent name -#if HIP_USE_PRODUCT_NAME + err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, &(prop->name)); -#else - err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_NAME, &(prop->name)); -#endif + char archName[256]; + err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_NAME, &archName); + + if(strcmp(archName,"gfx701")==0){ + prop->gcnArch = 701; + } + if(strcmp(archName,"gfx801")==0){ + prop->gcnArch = 801; + } + if(strcmp(archName,"gfx802")==0){ + prop->gcnArch = 802; + } + if(strcmp(archName,"gfx803")==0){ + prop->gcnArch = 803; + } + DeviceErrorCheck(err); // Get agent node @@ -1790,6 +1803,20 @@ void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind, } +void printPointerInfo(unsigned dbFlag, const char *tag, const void *ptr, const hc::AmPointerInfo &ptrInfo) +{ + tprintf (dbFlag, " %s=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d registered=%d\n", + tag, ptr, + ptrInfo._hostPointer, ptrInfo._devicePointer, ptrInfo._sizeBytes, + ptrInfo._appId, ptrInfo._sizeBytes != 0, ptrInfo._isInDeviceMem, !ptrInfo._isAmManaged); +} + + +// TODO : For registered and host memory, if the portable flag is set, we need to recognize that and perform appropriate copy operation. +// What can happen now is that Portable memory is mapped into multiple devices but Peer access is not enabled. i +// The peer detection logic doesn't see that the memory is already mapped and so tries to use an unpinned copy algorithm. If this is PinInPlace, then an error can occur. +// Need to track Portable flag correctly or use new RT functionality to query the peer status for the pointer. +// // TODO - remove kind parm from here or use it below? void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn) { @@ -1806,6 +1833,16 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS); bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS); + + // Some code in HCC and in printPointerInfo uses _sizeBytes==0 as an indication ptr is not valid, so check it here: + if (!dstTracked) { + assert (dstPtrInfo._sizeBytes == 0); + } + if (!srcTracked) { + assert (srcPtrInfo._sizeBytes == 0); + } + + hc::hcCommandKind hcCopyDir; ihipCtx_t *copyDevice; bool forceUnpinnedCopy; @@ -1818,12 +1855,8 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, dst, dstPtrInfo._appId, dstPtrInfo._isInDeviceMem, src, srcPtrInfo._appId, srcPtrInfo._isInDeviceMem, sizeBytes, hcMemcpyStr(hcCopyDir), forceUnpinnedCopy); - tprintf (DB_COPY, " dst=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n", - dst, dstPtrInfo._hostPointer, dstPtrInfo._devicePointer, dstPtrInfo._sizeBytes, - dstPtrInfo._appId, dstTracked, dstPtrInfo._isInDeviceMem); - tprintf (DB_COPY, " src=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n", - src, srcPtrInfo._hostPointer, srcPtrInfo._devicePointer, srcPtrInfo._sizeBytes, - srcPtrInfo._appId, srcTracked, srcPtrInfo._isInDeviceMem); + printPointerInfo(DB_COPY, " dst", dst, dstPtrInfo); + printPointerInfo(DB_COPY, " src", src, srcPtrInfo); this->ensureHaveQueue(crit); @@ -1908,12 +1941,8 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes dst, dstPtrInfo._appId, dstPtrInfo._isInDeviceMem, src, srcPtrInfo._appId, srcPtrInfo._isInDeviceMem, sizeBytes, hcMemcpyStr(hcCopyDir), forceUnpinnedCopy); - tprintf (DB_COPY, " dst=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n", - dst, dstPtrInfo._hostPointer, dstPtrInfo._devicePointer, dstPtrInfo._sizeBytes, - dstPtrInfo._appId, dstTracked, dstPtrInfo._isInDeviceMem); - tprintf (DB_COPY, " src=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n", - src, srcPtrInfo._hostPointer, srcPtrInfo._devicePointer, srcPtrInfo._sizeBytes, - srcPtrInfo._appId, srcTracked, srcPtrInfo._isInDeviceMem); + printPointerInfo(DB_COPY, " dst", dst, dstPtrInfo); + printPointerInfo(DB_COPY, " src", src, srcPtrInfo); // "tracked" really indicates if the pointer's virtual address is available in the GPU address space. // If both pointers are not tracked, we need to fall back to a sync copy. diff --git a/projects/clr/hipamd/src/hip_hcc.h b/projects/clr/hipamd/src/hip_hcc.h index 105eef6bb8..b23aead072 100644 --- a/projects/clr/hipamd/src/hip_hcc.h +++ b/projects/clr/hipamd/src/hip_hcc.h @@ -36,7 +36,7 @@ THE SOFTWARE. #error("This version of HIP requires a newer version of HCC."); #endif -#define USE_IPC 0 +#define USE_IPC 1 //--- // Environment variables: @@ -326,15 +326,15 @@ const hipStream_t hipStreamNull = 0x0; /** * HIP IPC Handle Size */ -#define HIP_IPC_HANDLE_SIZE 64 +#define HIP_IPC_RESERVED_SIZE 24 class ihipIpcMemHandle_t { public: #if USE_IPC hsa_amd_ipc_memory_t ipc_handle; ///< ipc memory handle on ROCr #endif - char reserved[HIP_IPC_HANDLE_SIZE]; size_t psize; + char reserved[HIP_IPC_RESERVED_SIZE]; }; diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index 479040c099..c6b9406778 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -30,11 +30,109 @@ THE SOFTWARE. #include "hip/hcc_detail/hip_texture.h" #include + + +// Internal HIP APIS: +namespace hip_internal { + +hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) +{ + hipError_t e = hipSuccess; + + stream = ihipSyncAndResolveStream(stream); + + + if ((dst == NULL) || (src == NULL)) { + e= hipErrorInvalidValue; + } else if (stream) { + try { + stream->locked_copyAsync(dst, src, sizeBytes, kind); + } + catch (ihipException ex) { + e = ex._code; + } + } else { + e = hipErrorInvalidValue; + } + + return e; +} + +// return 0 on success or -1 on error: +int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags) +{ + int ret = 0; + + auto device = ctx->getWriteableDevice(); + + hc::am_memtracker_update(ptr, device->_deviceId, hipFlags); + int peerCnt=0; + { + LockedAccessor_CtxCrit_t crit(ctx->criticalData()); + // the peerCnt always stores self so make sure the trace actually + peerCnt = crit->peerCnt(); + tprintf(DB_MEM, " allow access to %d other peer(s)\n", peerCnt-1); + if (peerCnt > 1) { + + //printf ("peer self access\n"); + + // TODOD - remove me: + for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) { + tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":""); + }; + + hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr); + if (s != HSA_STATUS_SUCCESS) { + ret = -1; + } + } + } + + return ret; +} + + + + +// Allocate a new pointer with am_alloc and share with all valid peers. +// Returns null-ptr if a memory error occurs (either allocation or sharing) +void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsigned amFlags, unsigned hipFlags) +{ + + void *ptr = nullptr; + + auto device = ctx->getWriteableDevice(); + + ptr = hc::am_alloc(sizeBytes, device->_acc, amFlags); + tprintf(DB_MEM, " alloc %s ptr:%p size:%zu on dev:%d\n", + msg, ptr, sizeBytes, device->_deviceId); + + if (ptr != nullptr) { + int r = sharePtr(ptr, ctx, hipFlags); + if (r != 0) { + ptr = nullptr; + } + } + + return ptr; +} + + +} // end namespace hip_internal + //------------------------------------------------------------------------------------------------- //------------------------------------------------------------------------------------------------- // Memory // // +// +//HIP uses several "app*" fields HC memory tracker to track state necessary for the HIP API. +//_appId : DeviceID. For device mem, this is device where the memory is physically allocated. +// For host or registered mem, this is the current device when the memory is allocated or registered. This device will have a GPUVM mapping for the host mem. +// +//_appAllocationFlags : These are flags provided by the user when allocation is performed. They are returned to user in hipHostGetFlags and other APIs. +// TODO - add more info here when available. +// hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) { HIP_INIT_API(attributes, ptr); @@ -78,6 +176,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) return ihipLogStatus(e); } + hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsigned flags) { HIP_INIT_API(devicePointer, hostPointer, flags); @@ -102,6 +201,7 @@ hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsi return ihipLogStatus(e); } + hipError_t hipMalloc(void** ptr, size_t sizeBytes) { HIP_INIT_API(ptr, sizeBytes); @@ -118,37 +218,8 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) if (ctx) { auto device = ctx->getWriteableDevice(); - const unsigned am_flags = 0; - *ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags); + *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, 0/*amFlags*/, 0/*hipFlags*/); - - if (sizeBytes && (*ptr == NULL)) { - hip_status = hipErrorMemoryAllocation; - } else { - hc::am_memtracker_update(*ptr, device->_deviceId, 0); - int peerCnt=0; - { - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - // the peerCnt always stores self so make sure the trace actually - peerCnt = crit->peerCnt(); - tprintf(DB_MEM, " allocated device_mem ptr:%p size:%zu on dev:%d and allow access to %d other peer(s)\n", - *ptr, sizeBytes, device->_deviceId, peerCnt-1); - if (peerCnt > 1) { - - //printf ("peer self access\n"); - - // TODOD - remove me: - for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) { - tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":""); - }; - - hsa_status_t e = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr); - if (e != HSA_STATUS_SUCCESS) { - hip_status = hipErrorMemoryAllocation; - } - } - } - } } else { hip_status = hipErrorMemoryAllocation; } @@ -188,54 +259,36 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) } else { auto device = ctx->getWriteableDevice(); - if(HIP_COHERENT_HOST_ALLOC){ - // Force to allocate finedgrained system memory - *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); - if(sizeBytes < 1 && (*ptr == NULL)){ - hip_status = hipErrorMemoryAllocation; - } else { - hc::am_memtracker_update(*ptr, device->_deviceId, amHostCoherent); - } - tprintf(DB_MEM, " %s: finegrained system memory ptr=%p\n", __func__, *ptr); - } - else{ - // TODO - am_alloc requires writeable __acc, perhaps could be refactored? - // TODO - hipHostMallocMapped is be ignored on ROCM - all memory is mapped to host address space as WC. - *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); - if (*ptr == NULL) { - hip_status = hipErrorMemoryAllocation; - } else { - hc::am_memtracker_update(*ptr, device->_deviceId, flags); - // TODO-hipHostMallocPortable should map the host memory into all contexts, regardless of peer status. - int peerCnt=0; - { - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - peerCnt = crit->peerCnt(); - if (peerCnt > 1) { - hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr); - } - } - tprintf(DB_MEM, "allocated pinned_host ptr:%p size:%zu on dev:%d and allow access to %d other peer(s)\n", *ptr, sizeBytes, device->_deviceId, peerCnt-1); - } - } + unsigned amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned; + + *ptr = hip_internal::allocAndSharePtr(HIP_COHERENT_HOST_ALLOC ? "finegrained_host":"pinned_host", + sizeBytes, ctx, amFlags, flags); + if(sizeBytes && (*ptr == NULL)){ + hip_status = hipErrorMemoryAllocation; + } } } + if (HIP_SYNC_HOST_ALLOC) { hipDeviceSynchronize(); } return ihipLogStatus(hip_status); } +// Deprecated function: hipError_t hipMallocHost(void** ptr, size_t sizeBytes) { return hipHostMalloc(ptr, sizeBytes, 0); } + +// Deprecated function: hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags) { return hipHostMalloc(ptr, sizeBytes, flags); }; + // width in bytes hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height) { @@ -257,22 +310,11 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height auto device = ctx->getWriteableDevice(); const unsigned am_flags = 0; - *ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags); + *ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, am_flags, 0); if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; - } else { - hc::am_memtracker_update(*ptr, device->_deviceId, 0); - { - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - if (crit->peerCnt() > 1) { // peerCnt includes self so only call allow_access if other peers involved: - hsa_status_t hsa_status = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr); - if (hsa_status != HSA_STATUS_SUCCESS) { - hip_status = hipErrorMemoryAllocation; - } - } - } - } + } } else { hip_status = hipErrorMemoryAllocation; } @@ -306,41 +348,31 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc, void ** ptr = &array[0]->data; if (ctx) { - auto device = ctx->getWriteableDevice(); const unsigned am_flags = 0; const size_t size = width*height; + size_t allocSize = 0; switch(desc->f) { case hipChannelFormatKindSigned: - *ptr = hc::am_alloc(size*sizeof(int), device->_acc, am_flags); + allocSize = size * sizeof(int); break; case hipChannelFormatKindUnsigned: - *ptr = hc::am_alloc(size*sizeof(unsigned int), device->_acc, am_flags); + allocSize = size * sizeof(unsigned int); break; case hipChannelFormatKindFloat: - *ptr = hc::am_alloc(size*sizeof(float), device->_acc, am_flags); + allocSize = size * sizeof(float); break; case hipChannelFormatKindNone: - *ptr = hc::am_alloc(size*sizeof(size_t), device->_acc, am_flags); + allocSize = size * sizeof(size_t); break; default: hip_status = hipErrorUnknown; break; } + *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, am_flags, 0); if (size && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; - } else { - hc::am_memtracker_update(*ptr, device->_deviceId, 0); - { - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - if (crit->peerCnt() > 1) { // peerCnt includes self so only call allow_access if other peers involved: - hsa_status_t hsa_status = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr); - if (hsa_status != HSA_STATUS_SUCCESS) { - hip_status = hipErrorMemoryAllocation; - } - } - } - } + } } else { hip_status = hipErrorMemoryAllocation; @@ -373,6 +405,8 @@ hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr) return ihipLogStatus(hip_status); } + +// TODO - need to fix several issues here related to P2P access, host memory fallback. hipError_t hipHostRegister(void *hostPtr, size_t sizeBytes, unsigned int flags) { HIP_INIT_API(hostPtr, sizeBytes, flags); @@ -392,19 +426,21 @@ hipError_t hipHostRegister(void *hostPtr, size_t sizeBytes, unsigned int flags) hip_status = hipErrorHostMemoryAlreadyRegistered; } else { auto ctx = ihipGetTlsDefaultCtx(); - if(hostPtr == NULL){ + if (hostPtr == NULL) { return ihipLogStatus(hipErrorInvalidValue); } + //TODO-test : multi-gpu access to registered host memory. if (ctx) { - auto device = ctx->getWriteableDevice(); if(flags == hipHostRegisterDefault || flags == hipHostRegisterPortable || flags == hipHostRegisterMapped){ + auto device = ctx->getWriteableDevice(); std::vectorvecAcc; for(int i=0;i_acc); } am_status = hc::am_memory_host_lock(device->_acc, hostPtr, sizeBytes, &vecAcc[0], vecAcc.size()); + hc::am_memtracker_update(hostPtr, device->_deviceId, flags); - tprintf(DB_MEM, " %s registered ptr=%p\n", __func__, hostPtr); + tprintf(DB_MEM, " %s registered ptr=%p and allowed access to %zu peers\n", __func__, hostPtr, vecAcc.size()); if(am_status == AM_SUCCESS){ hip_status = hipSuccess; } else { @@ -603,6 +639,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind return ihipLogStatus(e); } + hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes) { HIP_INIT_CMD_API(dst, src, sizeBytes); @@ -624,6 +661,7 @@ hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes) return ihipLogStatus(e); } + hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes) { HIP_INIT_CMD_API(dst, src, sizeBytes); @@ -645,6 +683,7 @@ hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes) return ihipLogStatus(e); } + hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes) { HIP_INIT_CMD_API(dst, src, sizeBytes); @@ -666,6 +705,7 @@ hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeByte return ihipLogStatus(e); } + hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes) { HIP_INIT_CMD_API(dst, src, sizeBytes); @@ -689,32 +729,6 @@ hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes) -// Internal copy sync: -namespace hip_internal { - -hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) -{ - hipError_t e = hipSuccess; - - stream = ihipSyncAndResolveStream(stream); - - - if ((dst == NULL) || (src == NULL)) { - e= hipErrorInvalidValue; - } else if (stream) { - try { - stream->locked_copyAsync(dst, src, sizeBytes, kind); - } - catch (ihipException ex) { - e = ex._code; - } - } else { - e = hipErrorInvalidValue; - } - - return e; -} -} // end namespace hip_internal hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) @@ -990,6 +1004,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) return ihipLogStatus(e); } + hipError_t hipMemGetInfo (size_t *free, size_t *total) { HIP_INIT_API(free, total); @@ -1024,6 +1039,28 @@ hipError_t hipMemGetInfo (size_t *free, size_t *total) return ihipLogStatus(e); } +hipError_t hipMemPtrGetInfo(void *ptr, size_t *size) +{ + HIP_INIT_API(ptr, size); + + hipError_t e = hipSuccess; + + if(ptr != nullptr && size != nullptr){ + hc::accelerator acc; + hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); + am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); + if(status == AM_SUCCESS){ + *size = amPointerInfo._sizeBytes; + }else{ + e = hipErrorInvalidValue; + } + }else{ + e = hipErrorInvalidValue; + } + return ihipLogStatus(e); +} + + hipError_t hipFree(void* ptr) { HIP_INIT_API(ptr); @@ -1051,6 +1088,7 @@ hipError_t hipFree(void* ptr) return ihipLogStatus(hipStatus); } + hipError_t hipHostFree(void* ptr) { HIP_INIT_API(ptr); @@ -1122,7 +1160,7 @@ hipError_t hipMemGetAddressRange ( hipDeviceptr_t* pbase, size_t* psize, hipDevi } else hipStatus = hipErrorInvalidDevicePointer; - return hipStatus; + return ihipLogStatus(hipStatus); } @@ -1141,25 +1179,25 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){ } else hipStatus = hipErrorInvalidResourceHandle; - + ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) handle; // Save the size of the pointer to hipIpcMemHandle - (*handle)->psize = psize; + iHandle->psize = psize; #if USE_IPC // Create HSA ipc memory hsa_status_t hsa_status = - hsa_amd_ipc_memory_create(devPtr, psize, &(*handle)->ipc_handle); + hsa_amd_ipc_memory_create(devPtr, psize, (hsa_amd_ipc_memory_t*) &(iHandle->ipc_handle)); if(hsa_status!= HSA_STATUS_SUCCESS) hipStatus = hipErrorMemoryAllocation; #else hipStatus = hipErrorRuntimeOther; #endif - return hipStatus; + return ihipLogStatus(hipStatus); } hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned int flags){ -// HIP_INIT_API ( devPtr, handle.handle , flags); + HIP_INIT_API ( devPtr, &handle , flags); hipError_t hipStatus = hipSuccess; #if USE_IPC @@ -1169,15 +1207,16 @@ hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned if(!agent) return hipErrorInvalidResourceHandle; + ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) &handle; //Attach ipc memory hsa_status_t hsa_status = - hsa_amd_ipc_memory_attach(&handle->ipc_handle, handle->psize, 1, agent, devPtr); + hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, 1, agent, devPtr); if(hsa_status != HSA_STATUS_SUCCESS) hipStatus = hipErrorMapBufferObjectFailed; #else hipStatus = hipErrorRuntimeOther; #endif - return hipStatus; + return ihipLogStatus(hipStatus); } hipError_t hipIpcCloseMemHandle(void *devPtr){ @@ -1192,7 +1231,7 @@ hipError_t hipIpcCloseMemHandle(void *devPtr){ #else hipStatus = hipErrorRuntimeOther; #endif - return hipStatus; + return ihipLogStatus(hipStatus); } // hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle){ diff --git a/projects/clr/hipamd/src/hip_module.cpp b/projects/clr/hipamd/src/hip_module.cpp index f21adf9691..1f20a47c13 100644 --- a/projects/clr/hipamd/src/hip_module.cpp +++ b/projects/clr/hipamd/src/hip_module.cpp @@ -218,31 +218,33 @@ hipError_t hipModuleUnload(hipModule_t hmod) { ret = hipErrorInvalidValue; } - for(std::list::iterator f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) { + for(auto f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) { delete *f; } delete hmod; return ihipLogStatus(ret); } -hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char *name){ + +hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char *name) +{ auto ctx = ihipGetTlsDefaultCtx(); hipError_t ret = hipSuccess; - if(name == nullptr){ + if (name == nullptr){ return ihipLogStatus(hipErrorInvalidValue); } - if(ctx == nullptr){ + if (ctx == nullptr){ ret = hipErrorInvalidContext; - }else{ + } else { std::string str(name); - for(std::list::iterator f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) { - if((*f)->_name == str) { - *func = *f; - } - return ret; + for(auto f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) { + if((*f)->_name == str) { + *func = *f; + return ret; + } } ihipModuleSymbol_t *sym = new ihipModuleSymbol_t; int deviceId = ctx->getDevice()->_deviceId; diff --git a/projects/clr/hipamd/src/math_functions.cpp b/projects/clr/hipamd/src/math_functions.cpp index ff876def5f..6e919b3926 100644 --- a/projects/clr/hipamd/src/math_functions.cpp +++ b/projects/clr/hipamd/src/math_functions.cpp @@ -46,7 +46,7 @@ __device__ float asinhf(float x) } __device__ float atan2f(float y, float x) { - return hc::precise_math::atan2f(x, y); + return hc::precise_math::atan2f(y, x); } __device__ float atanf(float x) { diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostGetFlags.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostGetFlags.cpp index a989b879ac..9fad60aec8 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostGetFlags.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostGetFlags.cpp @@ -1,5 +1,5 @@ /* -Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostRegister.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostRegister.cpp index 37ee9b1b78..1a1319c500 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostRegister.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostRegister.cpp @@ -39,22 +39,67 @@ int main(){ const size_t size = N * sizeof(float); A = (float*)malloc(size); HIPCHECK(hipHostRegister(A, size, 0)); + + for(int i=0;i