Merge branch 'amd-develop' into amd-master
Change-Id: I8921e67e352e35e4c496e78a797fb309279ab7d0
[ROCm/clr commit: 62870fdc39]
Dieser Commit ist enthalten in:
@@ -189,7 +189,7 @@ if(HIP_PLATFORM STREQUAL "hcc")
|
||||
|
||||
execute_process(COMMAND ${HCC_HOME}/bin/hcc-config --ldflags OUTPUT_VARIABLE HCC_LD_FLAGS)
|
||||
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${HCC_LD_FLAGS} -Wl,-Bsymbolic")
|
||||
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --amdgpu-target=gfx701 --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803")
|
||||
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --amdgpu-target=gfx701 --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803 --amdgpu-target=gfx900")
|
||||
add_library(hip_hcc SHARED ${SOURCE_FILES_RUNTIME})
|
||||
target_link_libraries(hip_hcc PRIVATE hc_am)
|
||||
add_library(hip_hcc_static STATIC ${SOURCE_FILES_RUNTIME})
|
||||
|
||||
@@ -23,8 +23,8 @@ use File::Basename;
|
||||
# HSA_PATH : Path to HSA dir (default /opt/rocm/hsa). Used on AMD platforms only.
|
||||
|
||||
if(scalar @ARGV == 0){
|
||||
print "No Arguments passed, exiting ...\n";
|
||||
exit(-1);
|
||||
print "No Arguments passed, exiting ...\n";
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
#---
|
||||
@@ -74,6 +74,7 @@ $target_gfx701 = 0;
|
||||
$target_gfx801 = 0;
|
||||
$target_gfx802 = 0;
|
||||
$target_gfx803 = 0;
|
||||
$target_gfx900 = 0;
|
||||
|
||||
if ($HIP_PLATFORM eq "hcc") {
|
||||
$HSA_PATH=$ENV{'HSA_PATH'} // "/opt/rocm/hsa";
|
||||
@@ -189,18 +190,18 @@ if ($verbose & 0x4) {
|
||||
# Handle code object generation
|
||||
my $ISACMD="";
|
||||
if($HIP_PLATFORM eq "hcc"){
|
||||
$ISACMD .= "set ROCM_PATH=$ROCM_PATH && set ROCM_TARGET=$ROCM_TARGET && $HIP_PATH/bin/hccgenco.sh ";
|
||||
if($ARGV[0] eq "--genco"){
|
||||
foreach $isaarg (@ARGV[1..$#ARGV]){
|
||||
$ISACMD .= " ";
|
||||
$ISACMD .= $isaarg;
|
||||
$ISACMD .= "set ROCM_PATH=$ROCM_PATH && set ROCM_TARGET=$ROCM_TARGET && $HIP_PATH/bin/hccgenco.sh ";
|
||||
if($ARGV[0] eq "--genco"){
|
||||
foreach $isaarg (@ARGV[1..$#ARGV]){
|
||||
$ISACMD .= " ";
|
||||
$ISACMD .= $isaarg;
|
||||
}
|
||||
if ($verbose & 0x1) {
|
||||
print "hipcc-cmd: ", $ISACMD, "\n";
|
||||
}
|
||||
system($ISACMD) and die();
|
||||
exit(0);
|
||||
}
|
||||
if ($verbose & 0x1) {
|
||||
print "hipcc-cmd: ", $ISACMD, "\n";
|
||||
}
|
||||
system($ISACMD) and die();
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
if(($HIP_PLATFORM eq "hcc")){
|
||||
@@ -210,18 +211,18 @@ if(($HIP_PLATFORM eq "hcc")){
|
||||
}
|
||||
|
||||
if($HIP_PLATFORM eq "nvcc"){
|
||||
$ISACMD .= "$HIP_PATH/bin/hipcc -ptx ";
|
||||
if($ARGV[0] eq "--genco"){
|
||||
foreach $isaarg (@ARGV[1..$#ARGV]){
|
||||
$ISACMD .= " ";
|
||||
$ISACMD .= $isaarg;
|
||||
$ISACMD .= "$HIP_PATH/bin/hipcc -ptx ";
|
||||
if($ARGV[0] eq "--genco"){
|
||||
foreach $isaarg (@ARGV[1..$#ARGV]){
|
||||
$ISACMD .= " ";
|
||||
$ISACMD .= $isaarg;
|
||||
}
|
||||
if ($verbose & 0x1) {
|
||||
print "hipcc-cmd: ", $ISACMD, "\n";
|
||||
}
|
||||
system($ISACMD) and die();
|
||||
exit(0);
|
||||
}
|
||||
if ($verbose & 0x1) {
|
||||
print "hipcc-cmd: ", $ISACMD, "\n";
|
||||
}
|
||||
system($ISACMD) and die();
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
my $toolArgs = ""; # arguments to pass to the hcc or nvcc tool
|
||||
@@ -247,20 +248,25 @@ foreach $arg (@ARGV)
|
||||
}
|
||||
if($arg eq '--amdgpu-target=gfx701')
|
||||
{
|
||||
$target_gfx701 = 1;
|
||||
$target_gfx701 = 1;
|
||||
}
|
||||
if($arg eq '--amdgpu-target=gfx801')
|
||||
{
|
||||
$target_gfx801 = 1;
|
||||
$target_gfx801 = 1;
|
||||
}
|
||||
if($arg eq '--amdgpu-target=gfx802')
|
||||
{
|
||||
$target_gfx802 = 1;
|
||||
$target_gfx802 = 1;
|
||||
}
|
||||
if($arg eq '--amdgpu-target=gfx803')
|
||||
{
|
||||
$target_gfx803 = 1;
|
||||
$target_gfx803 = 1;
|
||||
}
|
||||
if($arg eq '--amdgpu-target=gfx900')
|
||||
{
|
||||
$target_gfx900 = 1;
|
||||
}
|
||||
|
||||
if(($trimarg eq '-stdlib=libstdc++') and ($setStdLib eq 0))
|
||||
{
|
||||
$HIPCXXFLAGS .= $HCC_WA_FLAGS;
|
||||
@@ -320,6 +326,33 @@ foreach $arg (@ARGV)
|
||||
}
|
||||
$toolArgs .= " $arg" unless $swallowArg;
|
||||
}
|
||||
foreach my $target (split(/,/, $ENV{HCC_AMDGPU_TARGET}))
|
||||
{
|
||||
if($target eq 'gfx701')
|
||||
{
|
||||
$target_gfx701 = 1;
|
||||
}
|
||||
if($target eq 'gfx801')
|
||||
{
|
||||
$target_gfx801 = 1;
|
||||
}
|
||||
if($target eq 'gfx802')
|
||||
{
|
||||
$target_gfx802 = 1;
|
||||
}
|
||||
if($target eq 'gfx803')
|
||||
{
|
||||
$target_gfx803 = 1;
|
||||
}
|
||||
if($target eq 'gfx900')
|
||||
{
|
||||
$target_gfx900 = 1;
|
||||
}
|
||||
}
|
||||
if ($target_gfx701 eq 0 and $target_gfx801 eq 0 and $target_gfx802 eq 0 and $target_gfx803 eq 0 and $target_gfx900 eq 0)
|
||||
{
|
||||
$target_gfx803 = 1;
|
||||
}
|
||||
|
||||
if($HIP_PLATFORM eq "hcc"){
|
||||
|
||||
@@ -343,12 +376,10 @@ if($HIP_PLATFORM eq "hcc"){
|
||||
$HIPCXXFLAGS .= " -D__HIP_ARCH_GFX803__=1 ";
|
||||
$ENV{HCC_EXTRA_LIBRARIES_GFX803}="$HIP_PATH/lib/hip_hc_gfx803.ll\n";
|
||||
}
|
||||
if ($target_gfx701 eq 0 and $target_gfx801 eq 0 and $target_gfx802 eq 0 and $target_gfx803 eq 0)
|
||||
{
|
||||
$HIPLDFLAGS .= " --amdgpu-target=gfx701 --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803";
|
||||
$ENV{HCC_EXTRA_LIBRARIES_GFX803}="$HIP_PATH/lib/hip_hc_gfx803.ll\n";
|
||||
if ($target_gfx900 eq 1) {
|
||||
$HIPLDFLAGS .= " --amdgpu-target=gfx900";
|
||||
$HIPCXXFLAGS .= " -D__HIP_ARCH_GFX900__=1 ";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if ($hasC and $HIP_PLATFORM eq 'nvcc') {
|
||||
@@ -407,3 +438,5 @@ if ($runCmd) {
|
||||
}
|
||||
system ("$CMD") and die ();
|
||||
}
|
||||
|
||||
# vim: ts=4:sw=4:expandtab:smartindent
|
||||
|
||||
@@ -166,10 +166,10 @@ Both nvcc and hcc make two passes over the code: one for host code and one for d
|
||||
|
||||
```
|
||||
// #ifdef __CUDA_ARCH__
|
||||
#ifdef __HIP_DEVICE_COMPILE__ && (__HIP_DEVICE_COMPILE__ == 1)
|
||||
#if __HIP_DEVICE_COMPILE__
|
||||
```
|
||||
|
||||
Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 0 or 1, and it doesnt represent the feature capability of the target device.
|
||||
Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 1 or undefined, and it doesnt represent the feature capability of the target device.
|
||||
|
||||
|
||||
### Compiler Defines: Summary
|
||||
@@ -178,7 +178,7 @@ Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 0 or 1, and it doe
|
||||
|HIP-related defines:|
|
||||
|`__HIP_PLATFORM_HCC___`| Defined | Undefined | Defined if targeting hcc platform; undefined otherwise |
|
||||
|`__HIP_PLATFORM_NVCC___`| Undefined | Defined | Defined if targeting nvcc platform; undefined otherwise |
|
||||
|`__HIP_DEVICE_COMPILE__` | 1 if compiling for device; 0 if compiling for host |1 if compiling for device; 0 if compiling for host | Undefined
|
||||
|`__HIP_DEVICE_COMPILE__` | 1 if compiling for device; undefined if compiling for host |1 if compiling for device; undefined if compiling for host | Undefined
|
||||
|`__HIPCC__` | Defined | Defined | Undefined
|
||||
|`__HIP_ARCH_*` | 0 or 1 depending on feature support (see below) | 0 or 1 depending on feature support (see below) | 0
|
||||
|nvcc-related defines:|
|
||||
|
||||
@@ -28,7 +28,7 @@ THE SOFTWARE.
|
||||
|
||||
#if __cplusplus
|
||||
#define COMPLEX_ADD_OP_OVERLOAD(type) \
|
||||
__device__ __host__ static type operator + (const type& lhs, const type& rhs) { \
|
||||
__device__ __host__ static inline type operator + (const type& lhs, const type& rhs) { \
|
||||
type ret; \
|
||||
ret.x = lhs.x + rhs.x ; \
|
||||
ret.y = lhs.y + rhs.y ; \
|
||||
@@ -36,7 +36,7 @@ __device__ __host__ static type operator + (const type& lhs, const type& rhs) {
|
||||
}
|
||||
|
||||
#define COMPLEX_SUB_OP_OVERLOAD(type) \
|
||||
__device__ __host__ static type operator - (const type& lhs, const type& rhs) { \
|
||||
__device__ __host__ static inline type operator - (const type& lhs, const type& rhs) { \
|
||||
type ret; \
|
||||
ret.x = lhs.x - rhs.x; \
|
||||
ret.y = lhs.y - rhs.y; \
|
||||
@@ -44,7 +44,7 @@ __device__ __host__ static type operator - (const type& lhs, const type& rhs) {
|
||||
}
|
||||
|
||||
#define COMPLEX_MUL_OP_OVERLOAD(type) \
|
||||
__device__ __host__ static type operator * (const type& lhs, const type& rhs) { \
|
||||
__device__ __host__ static inline type operator * (const type& lhs, const type& rhs) { \
|
||||
type ret; \
|
||||
ret.x = lhs.x * rhs.x - lhs.y * rhs.y; \
|
||||
ret.y = lhs.x * rhs.y + lhs.y * rhs.x; \
|
||||
@@ -52,7 +52,7 @@ __device__ __host__ static type operator * (const type& lhs, const type& rhs) {
|
||||
}
|
||||
|
||||
#define COMPLEX_DIV_OP_OVERLOAD(type) \
|
||||
__device__ __host__ static type operator / (const type& lhs, const type& rhs) { \
|
||||
__device__ __host__ static inline type operator / (const type& lhs, const type& rhs) { \
|
||||
type ret; \
|
||||
ret.x = (lhs.x * rhs.x + lhs.y * rhs.y); \
|
||||
ret.y = (rhs.x * lhs.y - lhs.x * rhs.y); \
|
||||
@@ -88,7 +88,7 @@ __device__ __host__ static inline type& operator /= (type& lhs, const type& rhs)
|
||||
}
|
||||
|
||||
#define COMPLEX_SCALAR_PRODUCT(type, type1) \
|
||||
__device__ __host__ static type operator * (const type& lhs, type1 rhs) { \
|
||||
__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \
|
||||
type ret; \
|
||||
ret.x = lhs.x * rhs; \
|
||||
ret.y = lhs.y * rhs; \
|
||||
|
||||
@@ -226,6 +226,8 @@ __device__ int __all( int input);
|
||||
__device__ int __any( int input);
|
||||
__device__ unsigned long long int __ballot( int input);
|
||||
|
||||
#if __HIP_ARCH_GFX701__ == 0
|
||||
|
||||
// warp shuffle functions
|
||||
#ifdef __cplusplus
|
||||
__device__ int __shfl(int input, int lane, int width=warpSize);
|
||||
@@ -247,6 +249,18 @@ __device__ float __shfl_down(float input, unsigned int lane_delta, int width);
|
||||
__device__ float __shfl_xor(float input, int lane_mask, int width);
|
||||
#endif
|
||||
|
||||
__device__ unsigned __hip_ds_bpermute(int index, unsigned src);
|
||||
__device__ float __hip_ds_bpermutef(int index, float src);
|
||||
__device__ unsigned __hip_ds_permute(int index, unsigned src);
|
||||
__device__ float __hip_ds_permutef(int index, float src);
|
||||
|
||||
__device__ unsigned __hip_ds_swizzle(unsigned int src, int pattern);
|
||||
__device__ float __hip_ds_swizzlef(float src, int pattern);
|
||||
|
||||
__device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl);
|
||||
|
||||
#endif
|
||||
|
||||
__host__ __device__ int min(int arg1, int arg2);
|
||||
__host__ __device__ int max(int arg1, int arg2);
|
||||
|
||||
@@ -321,16 +335,6 @@ __device__ static inline void __threadfence(void) {
|
||||
//__device__ void __threadfence_system(void) __attribute__((deprecated("Provided with workaround configuration, see hip_kernel_language.md for details")));
|
||||
__device__ void __threadfence_system(void) ;
|
||||
|
||||
__device__ unsigned __hip_ds_bpermute(int index, unsigned src);
|
||||
__device__ float __hip_ds_bpermutef(int index, float src);
|
||||
__device__ unsigned __hip_ds_permute(int index, unsigned src);
|
||||
__device__ float __hip_ds_permutef(int index, float src);
|
||||
|
||||
__device__ unsigned __hip_ds_swizzle(unsigned int src, int pattern);
|
||||
__device__ float __hip_ds_swizzlef(float src, int pattern);
|
||||
|
||||
__device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl);
|
||||
|
||||
// doxygen end Fence Fence
|
||||
/**
|
||||
* @}
|
||||
|
||||
@@ -62,7 +62,12 @@ typedef struct ihipStream_t *hipStream_t;
|
||||
|
||||
#define hipIpcMemLazyEnablePeerAccess 0
|
||||
|
||||
typedef struct ihipIpcMemHandle_t *hipIpcMemHandle_t;
|
||||
#define HIP_IPC_HANDLE_SIZE 64
|
||||
|
||||
typedef struct hipIpcMemHandle_st
|
||||
{
|
||||
char reserved[HIP_IPC_HANDLE_SIZE];
|
||||
}hipIpcMemHandle_t;
|
||||
|
||||
//TODO: IPC event handle currently unsupported
|
||||
struct ihipIpcEventHandle_t;
|
||||
@@ -853,6 +858,8 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr)
|
||||
* @param[out] ptr Pointer to the allocated memory
|
||||
* @param[in] size Requested memory size
|
||||
*
|
||||
* If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
|
||||
*
|
||||
* @return #hipSuccess
|
||||
*
|
||||
* @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray, hipHostFree, hipHostMalloc
|
||||
@@ -865,6 +872,8 @@ hipError_t hipMalloc(void** ptr, size_t size) ;
|
||||
* @param[out] ptr Pointer to the allocated host pinned memory
|
||||
* @param[in] size Requested memory size
|
||||
*
|
||||
* If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
|
||||
*
|
||||
* @return #hipSuccess, #hipErrorMemoryAllocation
|
||||
*
|
||||
* @deprecated use hipHostMalloc() instead
|
||||
@@ -878,6 +887,8 @@ hipError_t hipMallocHost(void** ptr, size_t size) __attribute__((deprecated("use
|
||||
* @param[in] size Requested memory size
|
||||
* @param[in] flags Type of host memory allocation
|
||||
*
|
||||
* If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
|
||||
*
|
||||
* @return #hipSuccess, #hipErrorMemoryAllocation
|
||||
*
|
||||
* @see hipSetDeviceFlags, hipHostFree
|
||||
@@ -891,6 +902,8 @@ hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int flags) ;
|
||||
* @param[in] size Requested memory size
|
||||
* @param[in] flags Type of host memory allocation
|
||||
*
|
||||
* If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
|
||||
*
|
||||
* @return #hipSuccess, #hipErrorMemoryAllocation
|
||||
*
|
||||
* @deprecated use hipHostMalloc() instead
|
||||
@@ -975,6 +988,9 @@ hipError_t hipHostUnregister(void* hostPtr) ;
|
||||
* @param[out] pitch Pitch for allocation (in bytes)
|
||||
* @param[in] width Requested pitched allocation width (in bytes)
|
||||
* @param[in] height Requested pitched allocation height
|
||||
*
|
||||
* If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
|
||||
*
|
||||
* @return Error code
|
||||
*
|
||||
* @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D, hipMalloc3DArray, hipHostMalloc
|
||||
@@ -1236,6 +1252,9 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t st
|
||||
hipError_t hipMemGetInfo (size_t * free, size_t * total) ;
|
||||
|
||||
|
||||
hipError_t hipMemPtrGetInfo(void *ptr, size_t *size);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Allocate an array on the device.
|
||||
*
|
||||
|
||||
@@ -1260,7 +1260,7 @@ __device__ __host__ static inline type& operator op (type& val) { \
|
||||
}
|
||||
|
||||
#define DECLOP_1VAR_POSTOP(type, op) \
|
||||
__device__ __host__ static inline type operator op (type& val, int i) { \
|
||||
__device__ __host__ static inline type operator op (type& val, int) { \
|
||||
type ret; \
|
||||
ret.x = val.x; \
|
||||
val.x op; \
|
||||
@@ -1326,7 +1326,7 @@ __device__ __host__ static inline type& operator op (type& val) { \
|
||||
}
|
||||
|
||||
#define DECLOP_2VAR_POSTOP(type, op) \
|
||||
__device__ __host__ static inline type operator op (type& val, int i) { \
|
||||
__device__ __host__ static inline type operator op (type& val, int) { \
|
||||
type ret; \
|
||||
ret.x = val.x; \
|
||||
ret.y = val.y; \
|
||||
@@ -1337,7 +1337,7 @@ __device__ __host__ static inline type operator op (type& val, int i) { \
|
||||
|
||||
#define DECLOP_2VAR_COMP(type, op) \
|
||||
__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
|
||||
return lhs.x op rhs.x && lhs.y op rhs.y; \
|
||||
return (lhs.x op rhs.x) && (lhs.y op rhs.y); \
|
||||
}
|
||||
|
||||
#define DECLOP_2VAR_1IN_1OUT(type, op) \
|
||||
@@ -1350,7 +1350,7 @@ __device__ __host__ static inline type operator op(type &rhs) { \
|
||||
|
||||
#define DECLOP_2VAR_1IN_BOOLOUT(type, op) \
|
||||
__device__ __host__ static inline bool operator op (type &rhs) { \
|
||||
return op rhs.x && op rhs.y; \
|
||||
return (op rhs.x) && (op rhs.y); \
|
||||
}
|
||||
|
||||
|
||||
@@ -1401,7 +1401,7 @@ __device__ __host__ static inline type& operator op (type& val) { \
|
||||
}
|
||||
|
||||
#define DECLOP_3VAR_POSTOP(type, op) \
|
||||
__device__ __host__ static inline type operator op (type& val, int i) { \
|
||||
__device__ __host__ static inline type operator op (type& val, int) { \
|
||||
type ret; \
|
||||
ret.x = val.x; \
|
||||
ret.y = val.y; \
|
||||
@@ -1414,7 +1414,7 @@ __device__ __host__ static inline type operator op (type& val, int i) { \
|
||||
|
||||
#define DECLOP_3VAR_COMP(type, op) \
|
||||
__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
|
||||
return lhs.x op rhs.x && lhs.y op rhs.y && lhs.z op rhs.z; \
|
||||
return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \
|
||||
}
|
||||
|
||||
#define DECLOP_3VAR_1IN_1OUT(type, op) \
|
||||
@@ -1428,7 +1428,7 @@ __device__ __host__ static inline type operator op(type &rhs) { \
|
||||
|
||||
#define DECLOP_3VAR_1IN_BOOLOUT(type, op) \
|
||||
__device__ __host__ static inline bool operator op (type &rhs) { \
|
||||
return op rhs.x && op rhs.y && op rhs.z; \
|
||||
return (op rhs.x) && (op rhs.y) && (op rhs.z); \
|
||||
}
|
||||
|
||||
|
||||
@@ -1484,7 +1484,7 @@ __device__ __host__ static inline type& operator op (type& val) { \
|
||||
}
|
||||
|
||||
#define DECLOP_4VAR_POSTOP(type, op) \
|
||||
__device__ __host__ static inline type operator op (type& val, int i) { \
|
||||
__device__ __host__ static inline type operator op (type& val, int) { \
|
||||
type ret; \
|
||||
ret.x = val.x; \
|
||||
ret.y = val.y; \
|
||||
@@ -1499,7 +1499,7 @@ __device__ __host__ static inline type operator op (type& val, int i) { \
|
||||
|
||||
#define DECLOP_4VAR_COMP(type, op) \
|
||||
__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
|
||||
return lhs.x op rhs.x && lhs.y op rhs.y && lhs.z op rhs.z && lhs.w op rhs.w; \
|
||||
return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \
|
||||
}
|
||||
|
||||
#define DECLOP_4VAR_1IN_1OUT(type, op) \
|
||||
@@ -1514,7 +1514,7 @@ __device__ __host__ static inline type operator op(type &rhs) { \
|
||||
|
||||
#define DECLOP_4VAR_1IN_BOOLOUT(type, op) \
|
||||
__device__ __host__ static inline bool operator op (type &rhs) { \
|
||||
return op rhs.x && op rhs.y && op rhs.z && op rhs.w; \
|
||||
return (op rhs.x) && (op rhs.y) && (op rhs.z) && (op rhs.w); \
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -27,13 +27,6 @@ THE SOFTWARE.
|
||||
// Other compiler (GCC,ICC,etc) need to set one of these macros explicitly
|
||||
#if defined(__HCC__)
|
||||
#define __HIP_PLATFORM_HCC__
|
||||
|
||||
#if defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)
|
||||
#define __HIP_DEVICE_COMPILE__ 1
|
||||
#else
|
||||
#define __HIP_DEVICE_COMPILE__ 0
|
||||
#endif
|
||||
|
||||
#endif //__HCC__
|
||||
|
||||
// Auto enable __HIP_PLATFORM_NVCC__ if compiling with NVCC
|
||||
@@ -43,14 +36,12 @@ THE SOFTWARE.
|
||||
#define __HIPCC__
|
||||
#endif
|
||||
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ != 0)
|
||||
#define __HIP_DEVICE_COMPILE__ 1
|
||||
#else
|
||||
#define __HIP_DEVICE_COMPILE__ 0
|
||||
#endif
|
||||
|
||||
#endif //__NVCC__
|
||||
|
||||
// Auto enable __HIP_DEVICE_COMPILE__ if compiled in HCC or NVCC device path
|
||||
#if (defined(__HCC_ACCELERATOR__) && __HCC_ACCELERATOR__ != 0) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ != 0)
|
||||
#define __HIP_DEVICE_COMPILE__ 1
|
||||
#endif
|
||||
|
||||
#if __HIP_DEVICE_COMPILE__ == 0
|
||||
// 32-bit Atomics
|
||||
|
||||
@@ -106,6 +106,7 @@ typedef struct hipDeviceProp_t {
|
||||
size_t maxSharedMemoryPerMultiProcessor; ///< Maximum Shared Memory Per Multiprocessor.
|
||||
int isMultiGpuBoard; ///< 1 if device is on a multi-GPU board, 0 if not.
|
||||
int canMapHostMemory; ///< Check whether HIP can map host memory
|
||||
int gcnArch; ///< AMD GCN Arch Value. Eg: 803, 701
|
||||
} hipDeviceProp_t;
|
||||
|
||||
|
||||
|
||||
@@ -32,7 +32,7 @@ THE SOFTWARE.
|
||||
}\
|
||||
}
|
||||
|
||||
/*
|
||||
/*
|
||||
* Square each element in the array A and write to array C.
|
||||
*/
|
||||
template <typename T>
|
||||
@@ -58,16 +58,18 @@ int main(int argc, char *argv[])
|
||||
hipDeviceProp_t props;
|
||||
CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/));
|
||||
printf ("info: running on device %s\n", props.name);
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
printf ("info: architecture on AMD GPU device is: %d\n",props.gcnArch);
|
||||
#endif
|
||||
printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
|
||||
A_h = (float*)malloc(Nbytes);
|
||||
CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
|
||||
C_h = (float*)malloc(Nbytes);
|
||||
CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
|
||||
// Fill with Phi + i
|
||||
for (size_t i=0; i<N; i++)
|
||||
for (size_t i=0; i<N; i++)
|
||||
{
|
||||
A_h[i] = 1.618f + i;
|
||||
A_h[i] = 1.618f + i;
|
||||
}
|
||||
|
||||
printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
|
||||
|
||||
@@ -6,9 +6,12 @@
|
||||
|
||||
#include "ResultDatabase.h"
|
||||
|
||||
enum MallocMode {MallocPinned, MallocUnpinned, MallocRegistered};
|
||||
|
||||
// Cmdline parms:
|
||||
bool p_verbose = false;
|
||||
bool p_pinned = true;
|
||||
MallocMode p_malloc_mode = MallocPinned;
|
||||
int p_numa_ctl = -1;
|
||||
int p_iterations = 10;
|
||||
int p_beatsperiteration=1;
|
||||
int p_device = 0;
|
||||
@@ -21,7 +24,7 @@ bool p_h2d = true;
|
||||
bool p_d2h = true;
|
||||
bool p_bidir = true;
|
||||
|
||||
|
||||
//#define NO_CHECK
|
||||
|
||||
|
||||
#define CHECK_HIP_ERROR() \
|
||||
@@ -36,6 +39,14 @@ bool p_bidir = true;
|
||||
}
|
||||
|
||||
|
||||
std::string mallocModeString(int mallocMode) {
|
||||
switch (mallocMode) {
|
||||
case MallocPinned : return "pinned";
|
||||
case MallocUnpinned: return "unpinned";
|
||||
case MallocRegistered: return "registered";
|
||||
default: return "mallocmode-UNKNOWN";
|
||||
};
|
||||
};
|
||||
|
||||
// ****************************************************************************
|
||||
int sizeToBytes(int size) {
|
||||
@@ -106,7 +117,7 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
|
||||
|
||||
// Create some host memory pattern
|
||||
float *hostMem = NULL;
|
||||
if (p_pinned)
|
||||
if (p_malloc_mode == MallocPinned)
|
||||
{
|
||||
hipHostMalloc((void**)&hostMem, sizeof(float) * numMaxFloats);
|
||||
while (hipGetLastError() != hipSuccess)
|
||||
@@ -116,20 +127,33 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
|
||||
--nSizes;
|
||||
if (nSizes < 1)
|
||||
{
|
||||
std::cerr << "Error: Couldn't allocated any pinned buffer\n";
|
||||
std::cerr << "Error: Couldn't allocate any pinned buffer\n";
|
||||
return;
|
||||
}
|
||||
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
|
||||
hipHostMalloc((void**)&hostMem, sizeof(float) * numMaxFloats);
|
||||
}
|
||||
}
|
||||
else
|
||||
else if (p_malloc_mode == MallocUnpinned)
|
||||
{
|
||||
if (p_alignedhost) {
|
||||
hostMem = (float*)aligned_alloc(p_alignedhost, numMaxFloats*sizeof(float));
|
||||
} else {
|
||||
hostMem = new float[numMaxFloats];
|
||||
}
|
||||
}
|
||||
else if (p_malloc_mode == MallocRegistered)
|
||||
{
|
||||
if (p_numa_ctl == -1) {
|
||||
hostMem = (float*)malloc(numMaxFloats*sizeof(float));
|
||||
}
|
||||
|
||||
hipHostRegister(hostMem, numMaxFloats * sizeof(float), 0);
|
||||
CHECK_HIP_ERROR();
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
for (int i = 0; i < numMaxFloats; i++)
|
||||
@@ -146,7 +170,7 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
|
||||
--nSizes;
|
||||
if (nSizes < 1)
|
||||
{
|
||||
std::cerr << "Error: Couldn't allocated any device buffer\n";
|
||||
std::cerr << "Error: Couldn't allocate any device buffer\n";
|
||||
return;
|
||||
}
|
||||
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
|
||||
@@ -199,8 +223,8 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
resultDB.AddResult(std::string("H2D_Bandwidth") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("H2D_Time") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "ms", t);
|
||||
resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr, "ms", t);
|
||||
|
||||
if (p_onesize) {
|
||||
break;
|
||||
@@ -212,6 +236,8 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
|
||||
numMaxFloats = sizeToBytes(p_onesize) / sizeof(float);
|
||||
}
|
||||
|
||||
#ifndef NO_CHECK
|
||||
|
||||
// Check. First reset the host memory, then copy-back result. Then compare against original ref value.
|
||||
for (int i = 0; i < numMaxFloats; i++)
|
||||
{
|
||||
@@ -225,24 +251,36 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
|
||||
printf ("error: H2D. i=%d reference:%6.f != copyback:%6.2f\n", i, ref, hostMem[i]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
// Cleanup
|
||||
hipFree((void*)device);
|
||||
CHECK_HIP_ERROR();
|
||||
if (p_pinned)
|
||||
{
|
||||
switch (p_malloc_mode) {
|
||||
case MallocPinned:
|
||||
hipHostFree((void*)hostMem);
|
||||
CHECK_HIP_ERROR();
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
|
||||
case MallocUnpinned:
|
||||
if (p_alignedhost) {
|
||||
delete[] hostMem;
|
||||
} else {
|
||||
free(hostMem);
|
||||
}
|
||||
break;
|
||||
|
||||
case MallocRegistered:
|
||||
hipHostUnregister(hostMem);
|
||||
CHECK_HIP_ERROR();
|
||||
free(hostMem);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
|
||||
|
||||
hipEventDestroy(start);
|
||||
hipEventDestroy(stop);
|
||||
}
|
||||
@@ -257,38 +295,56 @@ void RunBenchmark_D2H(ResultDatabase &resultDB)
|
||||
// Create some host memory pattern
|
||||
float *hostMem1;
|
||||
float *hostMem2;
|
||||
if (p_pinned)
|
||||
if (p_malloc_mode == MallocPinned)
|
||||
{
|
||||
hipHostMalloc((void**)&hostMem1, sizeof(float)*numMaxFloats);
|
||||
hipError_t err1 = hipGetLastError();
|
||||
hipHostMalloc((void**)&hostMem2, sizeof(float)*numMaxFloats);
|
||||
hipError_t err2 = hipGetLastError();
|
||||
while (err1 != hipSuccess || err2 != hipSuccess)
|
||||
{
|
||||
// free the first buffer if only the second failed
|
||||
if (err1 == hipSuccess)
|
||||
hipHostFree((void*)hostMem1);
|
||||
while (err1 != hipSuccess || err2 != hipSuccess)
|
||||
{
|
||||
// free the first buffer if only the second failed
|
||||
if (err1 == hipSuccess)
|
||||
hipHostFree((void*)hostMem1);
|
||||
|
||||
// drop the size and try again
|
||||
if (p_verbose) std::cout << " - dropping size allocating pinned mem\n";
|
||||
--nSizes;
|
||||
if (nSizes < 1)
|
||||
{
|
||||
std::cerr << "Error: Couldn't allocated any pinned buffer\n";
|
||||
return;
|
||||
}
|
||||
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
|
||||
hipHostMalloc((void**)&hostMem1, sizeof(float)*numMaxFloats);
|
||||
err1 = hipGetLastError();
|
||||
hipHostMalloc((void**)&hostMem2, sizeof(float)*numMaxFloats);
|
||||
err2 = hipGetLastError();
|
||||
}
|
||||
}
|
||||
else
|
||||
// drop the size and try again
|
||||
if (p_verbose) std::cout << " - dropping size allocating pinned mem\n";
|
||||
--nSizes;
|
||||
if (nSizes < 1)
|
||||
{
|
||||
std::cerr << "Error: Couldn't allocate any pinned buffer\n";
|
||||
return;
|
||||
}
|
||||
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
|
||||
hipHostMalloc((void**)&hostMem1, sizeof(float)*numMaxFloats);
|
||||
err1 = hipGetLastError();
|
||||
hipHostMalloc((void**)&hostMem2, sizeof(float)*numMaxFloats);
|
||||
err2 = hipGetLastError();
|
||||
}
|
||||
}
|
||||
else if (p_malloc_mode == MallocUnpinned)
|
||||
{
|
||||
hostMem1 = new float[numMaxFloats];
|
||||
hostMem2 = new float[numMaxFloats];
|
||||
}
|
||||
else if (p_malloc_mode == MallocRegistered)
|
||||
{
|
||||
if (p_numa_ctl == -1) {
|
||||
hostMem1 = (float*)malloc(numMaxFloats*sizeof(float));
|
||||
hostMem2 = (float*)malloc(numMaxFloats*sizeof(float));
|
||||
}
|
||||
|
||||
hipHostRegister(hostMem1, numMaxFloats * sizeof(float), 0);
|
||||
CHECK_HIP_ERROR();
|
||||
hipHostRegister(hostMem2, numMaxFloats * sizeof(float), 0);
|
||||
CHECK_HIP_ERROR();
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
|
||||
for (int i=0; i<numMaxFloats; i++)
|
||||
hostMem1[i] = i % 77;
|
||||
|
||||
@@ -301,7 +357,7 @@ void RunBenchmark_D2H(ResultDatabase &resultDB)
|
||||
--nSizes;
|
||||
if (nSizes < 1)
|
||||
{
|
||||
std::cerr << "Error: Couldn't allocated any device buffer\n";
|
||||
std::cerr << "Error: Couldn't allocate any device buffer\n";
|
||||
return;
|
||||
}
|
||||
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
|
||||
@@ -358,8 +414,8 @@ void RunBenchmark_D2H(ResultDatabase &resultDB)
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
resultDB.AddResult(std::string("D2H_Bandwidth") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("D2H_Time") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "ms", t);
|
||||
resultDB.AddResult(std::string("D2H_Bandwidth") +"_" + mallocModeString(p_malloc_mode) , sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("D2H_Time") +"_" + mallocModeString(p_malloc_mode) , sizeStr, "ms", t);
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
@@ -381,20 +437,31 @@ void RunBenchmark_D2H(ResultDatabase &resultDB)
|
||||
// Cleanup
|
||||
hipFree((void*)device);
|
||||
CHECK_HIP_ERROR();
|
||||
if (p_pinned)
|
||||
{
|
||||
|
||||
switch (p_malloc_mode) {
|
||||
case MallocPinned:
|
||||
hipHostFree((void*)hostMem1);
|
||||
CHECK_HIP_ERROR();
|
||||
hipHostFree((void*)hostMem2);
|
||||
CHECK_HIP_ERROR();
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
case MallocUnpinned:
|
||||
delete[] hostMem1;
|
||||
delete[] hostMem2;
|
||||
hipEventDestroy(start);
|
||||
hipEventDestroy(stop);
|
||||
break;
|
||||
case MallocRegistered:
|
||||
hipHostUnregister(hostMem1);
|
||||
CHECK_HIP_ERROR();
|
||||
free(hostMem1);
|
||||
hipHostUnregister(hostMem2);
|
||||
free(hostMem2);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
|
||||
hipEventDestroy(start);
|
||||
hipEventDestroy(stop);
|
||||
}
|
||||
|
||||
|
||||
@@ -409,7 +476,7 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
|
||||
|
||||
// Create some host memory pattern
|
||||
float *hostMem[2] = {NULL, NULL};
|
||||
if (p_pinned)
|
||||
if (p_malloc_mode == MallocPinned)
|
||||
{
|
||||
while (1)
|
||||
{
|
||||
@@ -424,18 +491,34 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
|
||||
--nSizes;
|
||||
if (nSizes < 1)
|
||||
{
|
||||
std::cerr << "Error: Couldn't allocated any pinned buffer\n";
|
||||
std::cerr << "Error: Couldn't allocate any pinned buffer\n";
|
||||
return;
|
||||
}
|
||||
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
else if (p_malloc_mode == MallocUnpinned)
|
||||
{
|
||||
hostMem[0] = new float[numMaxFloats];
|
||||
hostMem[1] = new float[numMaxFloats];
|
||||
}
|
||||
else if (p_malloc_mode == MallocRegistered)
|
||||
{
|
||||
if (p_numa_ctl == -1) {
|
||||
hostMem[0] = (float*)malloc(numMaxFloats*sizeof(float));
|
||||
hostMem[1] = (float*)malloc(numMaxFloats*sizeof(float));
|
||||
}
|
||||
hipHostRegister(hostMem[0], numMaxFloats * sizeof(float), 0);
|
||||
CHECK_HIP_ERROR();
|
||||
hipHostRegister(hostMem[1], numMaxFloats * sizeof(float), 0);
|
||||
CHECK_HIP_ERROR();
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
|
||||
for (int i = 0; i < numMaxFloats; i++)
|
||||
{
|
||||
@@ -459,7 +542,7 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
|
||||
--nSizes;
|
||||
if (nSizes < 1)
|
||||
{
|
||||
std::cerr << "Error: Couldn't allocated any device buffer\n";
|
||||
std::cerr << "Error: Couldn't allocate any device buffer\n";
|
||||
return;
|
||||
}
|
||||
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
|
||||
@@ -512,8 +595,8 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
|
||||
double speed = (double(sizeToBytes(thisSize)) / (1000*1000)) / t;
|
||||
char sizeStr[256];
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
resultDB.AddResult(std::string("Bidir_Bandwidth") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("Bidir_Time") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "ms", t);
|
||||
resultDB.AddResult(std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode), sizeStr, "ms", t);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -521,17 +604,27 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
|
||||
hipFree((void*)deviceMem[0]);
|
||||
hipFree((void*)deviceMem[1]);
|
||||
CHECK_HIP_ERROR();
|
||||
if (p_pinned)
|
||||
{
|
||||
switch (p_malloc_mode) {
|
||||
case MallocPinned:
|
||||
hipHostFree((void*)hostMem[0]);
|
||||
hipHostFree((void*)hostMem[1]);
|
||||
CHECK_HIP_ERROR();
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
case MallocUnpinned:
|
||||
delete[] hostMem[0];
|
||||
delete[] hostMem[1];
|
||||
}
|
||||
break;
|
||||
case MallocRegistered:
|
||||
for (int i=0; i<2; i++) {
|
||||
hipHostUnregister(hostMem[i]);
|
||||
CHECK_HIP_ERROR();
|
||||
free(hostMem[i]);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
};
|
||||
|
||||
hipEventDestroy(start);
|
||||
hipEventDestroy(stop);
|
||||
hipStreamDestroy(stream[0]);
|
||||
@@ -557,7 +650,7 @@ void printConfig() {
|
||||
hipDeviceProp_t props;
|
||||
hipGetDeviceProperties(&props, p_device);
|
||||
|
||||
printf ("Device:%s Mem=%.1fGB #CUs=%d Freq=%.0fMhz Pinned=%s\n", props.name, props.totalGlobalMem/1024.0/1024.0/1024.0, props.multiProcessorCount, props.clockRate/1000.0, p_pinned ? "YES" : "NO");
|
||||
printf ("Device:%s Mem=%.1fGB #CUs=%d Freq=%.0fMhz MallocMode=%s\n", props.name, props.totalGlobalMem/1024.0/1024.0/1024.0, props.multiProcessorCount, props.clockRate/1000.0, mallocModeString(p_malloc_mode).c_str());
|
||||
}
|
||||
|
||||
void help() {
|
||||
@@ -601,7 +694,9 @@ int parseStandardArguments(int argc, char *argv[])
|
||||
failed("Bad onesize argument");
|
||||
}
|
||||
} else if (!strcmp(arg, "--unpinned")) {
|
||||
p_pinned = 0;
|
||||
p_malloc_mode = MallocUnpinned;
|
||||
} else if (!strcmp(arg, "--registered")) {
|
||||
p_malloc_mode = MallocRegistered;
|
||||
} else if (!strcmp(arg, "--h2d")) {
|
||||
p_h2d = true;
|
||||
p_d2h = false;
|
||||
|
||||
@@ -3,6 +3,10 @@ ifeq (,$(HIP_PATH))
|
||||
HIP_PATH=../../..
|
||||
endif
|
||||
|
||||
ifeq (gfx701, $(findstring gfx701,$(HCC_AMDGPU_TARGET)))
|
||||
$(error gfx701 is not a supported device for this sample)
|
||||
endif
|
||||
|
||||
HIPCC=$(HIP_PATH)/bin/hipcc
|
||||
|
||||
TARGET=hcc
|
||||
@@ -22,7 +26,7 @@ CXX=$(HIPCC)
|
||||
|
||||
|
||||
$(EXECUTABLE): $(OBJECTS)
|
||||
$(HIPCC) $(OBJECTS) -o $@
|
||||
$(HIPCC) $(OBJECTS) -o $@
|
||||
|
||||
|
||||
test: $(EXECUTABLE)
|
||||
@@ -33,4 +37,3 @@ clean:
|
||||
rm -f $(EXECUTABLE)
|
||||
rm -f $(OBJECTS)
|
||||
rm -f $(HIP_PATH)/src/*.o
|
||||
|
||||
|
||||
@@ -3,6 +3,10 @@ ifeq (,$(HIP_PATH))
|
||||
HIP_PATH=../../..
|
||||
endif
|
||||
|
||||
ifeq (gfx701, $(findstring gfx701,$(HCC_AMDGPU_TARGET)))
|
||||
$(error gfx701 is not a supported device for this sample)
|
||||
endif
|
||||
|
||||
HIPCC=$(HIP_PATH)/bin/hipcc
|
||||
|
||||
TARGET=hcc
|
||||
|
||||
@@ -758,11 +758,24 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop)
|
||||
prop->isMultiGpuBoard = 0 ? gpuAgentsCount < 2 : 1;
|
||||
|
||||
// Get agent name
|
||||
#if HIP_USE_PRODUCT_NAME
|
||||
|
||||
err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, &(prop->name));
|
||||
#else
|
||||
err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_NAME, &(prop->name));
|
||||
#endif
|
||||
char archName[256];
|
||||
err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_NAME, &archName);
|
||||
|
||||
if(strcmp(archName,"gfx701")==0){
|
||||
prop->gcnArch = 701;
|
||||
}
|
||||
if(strcmp(archName,"gfx801")==0){
|
||||
prop->gcnArch = 801;
|
||||
}
|
||||
if(strcmp(archName,"gfx802")==0){
|
||||
prop->gcnArch = 802;
|
||||
}
|
||||
if(strcmp(archName,"gfx803")==0){
|
||||
prop->gcnArch = 803;
|
||||
}
|
||||
|
||||
DeviceErrorCheck(err);
|
||||
|
||||
// Get agent node
|
||||
@@ -1790,6 +1803,20 @@ void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind,
|
||||
}
|
||||
|
||||
|
||||
void printPointerInfo(unsigned dbFlag, const char *tag, const void *ptr, const hc::AmPointerInfo &ptrInfo)
|
||||
{
|
||||
tprintf (dbFlag, " %s=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d registered=%d\n",
|
||||
tag, ptr,
|
||||
ptrInfo._hostPointer, ptrInfo._devicePointer, ptrInfo._sizeBytes,
|
||||
ptrInfo._appId, ptrInfo._sizeBytes != 0, ptrInfo._isInDeviceMem, !ptrInfo._isAmManaged);
|
||||
}
|
||||
|
||||
|
||||
// TODO : For registered and host memory, if the portable flag is set, we need to recognize that and perform appropriate copy operation.
|
||||
// What can happen now is that Portable memory is mapped into multiple devices but Peer access is not enabled. i
|
||||
// The peer detection logic doesn't see that the memory is already mapped and so tries to use an unpinned copy algorithm. If this is PinInPlace, then an error can occur.
|
||||
// Need to track Portable flag correctly or use new RT functionality to query the peer status for the pointer.
|
||||
//
|
||||
// TODO - remove kind parm from here or use it below?
|
||||
void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn)
|
||||
{
|
||||
@@ -1806,6 +1833,16 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes,
|
||||
bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS);
|
||||
bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS);
|
||||
|
||||
|
||||
// Some code in HCC and in printPointerInfo uses _sizeBytes==0 as an indication ptr is not valid, so check it here:
|
||||
if (!dstTracked) {
|
||||
assert (dstPtrInfo._sizeBytes == 0);
|
||||
}
|
||||
if (!srcTracked) {
|
||||
assert (srcPtrInfo._sizeBytes == 0);
|
||||
}
|
||||
|
||||
|
||||
hc::hcCommandKind hcCopyDir;
|
||||
ihipCtx_t *copyDevice;
|
||||
bool forceUnpinnedCopy;
|
||||
@@ -1818,12 +1855,8 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes,
|
||||
dst, dstPtrInfo._appId, dstPtrInfo._isInDeviceMem,
|
||||
src, srcPtrInfo._appId, srcPtrInfo._isInDeviceMem,
|
||||
sizeBytes, hcMemcpyStr(hcCopyDir), forceUnpinnedCopy);
|
||||
tprintf (DB_COPY, " dst=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n",
|
||||
dst, dstPtrInfo._hostPointer, dstPtrInfo._devicePointer, dstPtrInfo._sizeBytes,
|
||||
dstPtrInfo._appId, dstTracked, dstPtrInfo._isInDeviceMem);
|
||||
tprintf (DB_COPY, " src=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n",
|
||||
src, srcPtrInfo._hostPointer, srcPtrInfo._devicePointer, srcPtrInfo._sizeBytes,
|
||||
srcPtrInfo._appId, srcTracked, srcPtrInfo._isInDeviceMem);
|
||||
printPointerInfo(DB_COPY, " dst", dst, dstPtrInfo);
|
||||
printPointerInfo(DB_COPY, " src", src, srcPtrInfo);
|
||||
|
||||
this->ensureHaveQueue(crit);
|
||||
|
||||
@@ -1908,12 +1941,8 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes
|
||||
dst, dstPtrInfo._appId, dstPtrInfo._isInDeviceMem,
|
||||
src, srcPtrInfo._appId, srcPtrInfo._isInDeviceMem,
|
||||
sizeBytes, hcMemcpyStr(hcCopyDir), forceUnpinnedCopy);
|
||||
tprintf (DB_COPY, " dst=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n",
|
||||
dst, dstPtrInfo._hostPointer, dstPtrInfo._devicePointer, dstPtrInfo._sizeBytes,
|
||||
dstPtrInfo._appId, dstTracked, dstPtrInfo._isInDeviceMem);
|
||||
tprintf (DB_COPY, " src=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n",
|
||||
src, srcPtrInfo._hostPointer, srcPtrInfo._devicePointer, srcPtrInfo._sizeBytes,
|
||||
srcPtrInfo._appId, srcTracked, srcPtrInfo._isInDeviceMem);
|
||||
printPointerInfo(DB_COPY, " dst", dst, dstPtrInfo);
|
||||
printPointerInfo(DB_COPY, " src", src, srcPtrInfo);
|
||||
|
||||
// "tracked" really indicates if the pointer's virtual address is available in the GPU address space.
|
||||
// If both pointers are not tracked, we need to fall back to a sync copy.
|
||||
|
||||
@@ -36,7 +36,7 @@ THE SOFTWARE.
|
||||
#error("This version of HIP requires a newer version of HCC.");
|
||||
#endif
|
||||
|
||||
#define USE_IPC 0
|
||||
#define USE_IPC 1
|
||||
|
||||
//---
|
||||
// Environment variables:
|
||||
@@ -326,15 +326,15 @@ const hipStream_t hipStreamNull = 0x0;
|
||||
/**
|
||||
* HIP IPC Handle Size
|
||||
*/
|
||||
#define HIP_IPC_HANDLE_SIZE 64
|
||||
#define HIP_IPC_RESERVED_SIZE 24
|
||||
class ihipIpcMemHandle_t
|
||||
{
|
||||
public:
|
||||
#if USE_IPC
|
||||
hsa_amd_ipc_memory_t ipc_handle; ///< ipc memory handle on ROCr
|
||||
#endif
|
||||
char reserved[HIP_IPC_HANDLE_SIZE];
|
||||
size_t psize;
|
||||
char reserved[HIP_IPC_RESERVED_SIZE];
|
||||
};
|
||||
|
||||
|
||||
|
||||
@@ -30,11 +30,109 @@ THE SOFTWARE.
|
||||
#include "hip/hcc_detail/hip_texture.h"
|
||||
#include <hc_am.hpp>
|
||||
|
||||
|
||||
|
||||
// Internal HIP APIS:
|
||||
namespace hip_internal {
|
||||
|
||||
hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
|
||||
{
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
|
||||
|
||||
if ((dst == NULL) || (src == NULL)) {
|
||||
e= hipErrorInvalidValue;
|
||||
} else if (stream) {
|
||||
try {
|
||||
stream->locked_copyAsync(dst, src, sizeBytes, kind);
|
||||
}
|
||||
catch (ihipException ex) {
|
||||
e = ex._code;
|
||||
}
|
||||
} else {
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
return e;
|
||||
}
|
||||
|
||||
// return 0 on success or -1 on error:
|
||||
int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
auto device = ctx->getWriteableDevice();
|
||||
|
||||
hc::am_memtracker_update(ptr, device->_deviceId, hipFlags);
|
||||
int peerCnt=0;
|
||||
{
|
||||
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
|
||||
// the peerCnt always stores self so make sure the trace actually
|
||||
peerCnt = crit->peerCnt();
|
||||
tprintf(DB_MEM, " allow access to %d other peer(s)\n", peerCnt-1);
|
||||
if (peerCnt > 1) {
|
||||
|
||||
//printf ("peer self access\n");
|
||||
|
||||
// TODOD - remove me:
|
||||
for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) {
|
||||
tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":"");
|
||||
};
|
||||
|
||||
hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr);
|
||||
if (s != HSA_STATUS_SUCCESS) {
|
||||
ret = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Allocate a new pointer with am_alloc and share with all valid peers.
|
||||
// Returns null-ptr if a memory error occurs (either allocation or sharing)
|
||||
void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsigned amFlags, unsigned hipFlags)
|
||||
{
|
||||
|
||||
void *ptr = nullptr;
|
||||
|
||||
auto device = ctx->getWriteableDevice();
|
||||
|
||||
ptr = hc::am_alloc(sizeBytes, device->_acc, amFlags);
|
||||
tprintf(DB_MEM, " alloc %s ptr:%p size:%zu on dev:%d\n",
|
||||
msg, ptr, sizeBytes, device->_deviceId);
|
||||
|
||||
if (ptr != nullptr) {
|
||||
int r = sharePtr(ptr, ctx, hipFlags);
|
||||
if (r != 0) {
|
||||
ptr = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
|
||||
} // end namespace hip_internal
|
||||
|
||||
//-------------------------------------------------------------------------------------------------
|
||||
//-------------------------------------------------------------------------------------------------
|
||||
// Memory
|
||||
//
|
||||
//
|
||||
//
|
||||
//HIP uses several "app*" fields HC memory tracker to track state necessary for the HIP API.
|
||||
//_appId : DeviceID. For device mem, this is device where the memory is physically allocated.
|
||||
// For host or registered mem, this is the current device when the memory is allocated or registered. This device will have a GPUVM mapping for the host mem.
|
||||
//
|
||||
//_appAllocationFlags : These are flags provided by the user when allocation is performed. They are returned to user in hipHostGetFlags and other APIs.
|
||||
// TODO - add more info here when available.
|
||||
//
|
||||
hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr)
|
||||
{
|
||||
HIP_INIT_API(attributes, ptr);
|
||||
@@ -78,6 +176,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr)
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
|
||||
hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsigned flags)
|
||||
{
|
||||
HIP_INIT_API(devicePointer, hostPointer, flags);
|
||||
@@ -102,6 +201,7 @@ hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsi
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
|
||||
hipError_t hipMalloc(void** ptr, size_t sizeBytes)
|
||||
{
|
||||
HIP_INIT_API(ptr, sizeBytes);
|
||||
@@ -118,37 +218,8 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)
|
||||
|
||||
if (ctx) {
|
||||
auto device = ctx->getWriteableDevice();
|
||||
const unsigned am_flags = 0;
|
||||
*ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags);
|
||||
*ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, 0/*amFlags*/, 0/*hipFlags*/);
|
||||
|
||||
|
||||
if (sizeBytes && (*ptr == NULL)) {
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
} else {
|
||||
hc::am_memtracker_update(*ptr, device->_deviceId, 0);
|
||||
int peerCnt=0;
|
||||
{
|
||||
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
|
||||
// the peerCnt always stores self so make sure the trace actually
|
||||
peerCnt = crit->peerCnt();
|
||||
tprintf(DB_MEM, " allocated device_mem ptr:%p size:%zu on dev:%d and allow access to %d other peer(s)\n",
|
||||
*ptr, sizeBytes, device->_deviceId, peerCnt-1);
|
||||
if (peerCnt > 1) {
|
||||
|
||||
//printf ("peer self access\n");
|
||||
|
||||
// TODOD - remove me:
|
||||
for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) {
|
||||
tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":"");
|
||||
};
|
||||
|
||||
hsa_status_t e = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr);
|
||||
if (e != HSA_STATUS_SUCCESS) {
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
}
|
||||
@@ -188,54 +259,36 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
|
||||
}
|
||||
else {
|
||||
auto device = ctx->getWriteableDevice();
|
||||
if(HIP_COHERENT_HOST_ALLOC){
|
||||
// Force to allocate finedgrained system memory
|
||||
*ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned);
|
||||
if(sizeBytes < 1 && (*ptr == NULL)){
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
} else {
|
||||
hc::am_memtracker_update(*ptr, device->_deviceId, amHostCoherent);
|
||||
}
|
||||
tprintf(DB_MEM, " %s: finegrained system memory ptr=%p\n", __func__, *ptr);
|
||||
}
|
||||
else{
|
||||
// TODO - am_alloc requires writeable __acc, perhaps could be refactored?
|
||||
// TODO - hipHostMallocMapped is be ignored on ROCM - all memory is mapped to host address space as WC.
|
||||
*ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned);
|
||||
if (*ptr == NULL) {
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
} else {
|
||||
hc::am_memtracker_update(*ptr, device->_deviceId, flags);
|
||||
// TODO-hipHostMallocPortable should map the host memory into all contexts, regardless of peer status.
|
||||
int peerCnt=0;
|
||||
{
|
||||
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
|
||||
peerCnt = crit->peerCnt();
|
||||
if (peerCnt > 1) {
|
||||
hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr);
|
||||
}
|
||||
}
|
||||
tprintf(DB_MEM, "allocated pinned_host ptr:%p size:%zu on dev:%d and allow access to %d other peer(s)\n", *ptr, sizeBytes, device->_deviceId, peerCnt-1);
|
||||
}
|
||||
}
|
||||
unsigned amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned;
|
||||
|
||||
*ptr = hip_internal::allocAndSharePtr(HIP_COHERENT_HOST_ALLOC ? "finegrained_host":"pinned_host",
|
||||
sizeBytes, ctx, amFlags, flags);
|
||||
if(sizeBytes && (*ptr == NULL)){
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (HIP_SYNC_HOST_ALLOC) {
|
||||
hipDeviceSynchronize();
|
||||
}
|
||||
return ihipLogStatus(hip_status);
|
||||
}
|
||||
|
||||
// Deprecated function:
|
||||
hipError_t hipMallocHost(void** ptr, size_t sizeBytes)
|
||||
{
|
||||
return hipHostMalloc(ptr, sizeBytes, 0);
|
||||
}
|
||||
|
||||
|
||||
// Deprecated function:
|
||||
hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags)
|
||||
{
|
||||
return hipHostMalloc(ptr, sizeBytes, flags);
|
||||
};
|
||||
|
||||
|
||||
// width in bytes
|
||||
hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height)
|
||||
{
|
||||
@@ -257,22 +310,11 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height
|
||||
auto device = ctx->getWriteableDevice();
|
||||
|
||||
const unsigned am_flags = 0;
|
||||
*ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags);
|
||||
*ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, am_flags, 0);
|
||||
|
||||
if (sizeBytes && (*ptr == NULL)) {
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
} else {
|
||||
hc::am_memtracker_update(*ptr, device->_deviceId, 0);
|
||||
{
|
||||
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
|
||||
if (crit->peerCnt() > 1) { // peerCnt includes self so only call allow_access if other peers involved:
|
||||
hsa_status_t hsa_status = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr);
|
||||
if (hsa_status != HSA_STATUS_SUCCESS) {
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
}
|
||||
@@ -306,41 +348,31 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
|
||||
void ** ptr = &array[0]->data;
|
||||
|
||||
if (ctx) {
|
||||
auto device = ctx->getWriteableDevice();
|
||||
const unsigned am_flags = 0;
|
||||
const size_t size = width*height;
|
||||
|
||||
size_t allocSize = 0;
|
||||
switch(desc->f) {
|
||||
case hipChannelFormatKindSigned:
|
||||
*ptr = hc::am_alloc(size*sizeof(int), device->_acc, am_flags);
|
||||
allocSize = size * sizeof(int);
|
||||
break;
|
||||
case hipChannelFormatKindUnsigned:
|
||||
*ptr = hc::am_alloc(size*sizeof(unsigned int), device->_acc, am_flags);
|
||||
allocSize = size * sizeof(unsigned int);
|
||||
break;
|
||||
case hipChannelFormatKindFloat:
|
||||
*ptr = hc::am_alloc(size*sizeof(float), device->_acc, am_flags);
|
||||
allocSize = size * sizeof(float);
|
||||
break;
|
||||
case hipChannelFormatKindNone:
|
||||
*ptr = hc::am_alloc(size*sizeof(size_t), device->_acc, am_flags);
|
||||
allocSize = size * sizeof(size_t);
|
||||
break;
|
||||
default:
|
||||
hip_status = hipErrorUnknown;
|
||||
break;
|
||||
}
|
||||
*ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, am_flags, 0);
|
||||
if (size && (*ptr == NULL)) {
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
} else {
|
||||
hc::am_memtracker_update(*ptr, device->_deviceId, 0);
|
||||
{
|
||||
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
|
||||
if (crit->peerCnt() > 1) { // peerCnt includes self so only call allow_access if other peers involved:
|
||||
hsa_status_t hsa_status = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr);
|
||||
if (hsa_status != HSA_STATUS_SUCCESS) {
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
@@ -373,6 +405,8 @@ hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr)
|
||||
return ihipLogStatus(hip_status);
|
||||
}
|
||||
|
||||
|
||||
// TODO - need to fix several issues here related to P2P access, host memory fallback.
|
||||
hipError_t hipHostRegister(void *hostPtr, size_t sizeBytes, unsigned int flags)
|
||||
{
|
||||
HIP_INIT_API(hostPtr, sizeBytes, flags);
|
||||
@@ -392,19 +426,21 @@ hipError_t hipHostRegister(void *hostPtr, size_t sizeBytes, unsigned int flags)
|
||||
hip_status = hipErrorHostMemoryAlreadyRegistered;
|
||||
} else {
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
if(hostPtr == NULL){
|
||||
if (hostPtr == NULL) {
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
}
|
||||
//TODO-test : multi-gpu access to registered host memory.
|
||||
if (ctx) {
|
||||
auto device = ctx->getWriteableDevice();
|
||||
if(flags == hipHostRegisterDefault || flags == hipHostRegisterPortable || flags == hipHostRegisterMapped){
|
||||
auto device = ctx->getWriteableDevice();
|
||||
std::vector<hc::accelerator>vecAcc;
|
||||
for(int i=0;i<g_deviceCnt;i++){
|
||||
vecAcc.push_back(ihipGetDevice(i)->_acc);
|
||||
}
|
||||
am_status = hc::am_memory_host_lock(device->_acc, hostPtr, sizeBytes, &vecAcc[0], vecAcc.size());
|
||||
hc::am_memtracker_update(hostPtr, device->_deviceId, flags);
|
||||
|
||||
tprintf(DB_MEM, " %s registered ptr=%p\n", __func__, hostPtr);
|
||||
tprintf(DB_MEM, " %s registered ptr=%p and allowed access to %zu peers\n", __func__, hostPtr, vecAcc.size());
|
||||
if(am_status == AM_SUCCESS){
|
||||
hip_status = hipSuccess;
|
||||
} else {
|
||||
@@ -603,6 +639,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
|
||||
hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes)
|
||||
{
|
||||
HIP_INIT_CMD_API(dst, src, sizeBytes);
|
||||
@@ -624,6 +661,7 @@ hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes)
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
|
||||
hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes)
|
||||
{
|
||||
HIP_INIT_CMD_API(dst, src, sizeBytes);
|
||||
@@ -645,6 +683,7 @@ hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes)
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
|
||||
hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes)
|
||||
{
|
||||
HIP_INIT_CMD_API(dst, src, sizeBytes);
|
||||
@@ -666,6 +705,7 @@ hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeByte
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
|
||||
hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes)
|
||||
{
|
||||
HIP_INIT_CMD_API(dst, src, sizeBytes);
|
||||
@@ -689,32 +729,6 @@ hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes)
|
||||
|
||||
|
||||
|
||||
// Internal copy sync:
|
||||
namespace hip_internal {
|
||||
|
||||
hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
|
||||
{
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
|
||||
|
||||
if ((dst == NULL) || (src == NULL)) {
|
||||
e= hipErrorInvalidValue;
|
||||
} else if (stream) {
|
||||
try {
|
||||
stream->locked_copyAsync(dst, src, sizeBytes, kind);
|
||||
}
|
||||
catch (ihipException ex) {
|
||||
e = ex._code;
|
||||
}
|
||||
} else {
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
return e;
|
||||
}
|
||||
} // end namespace hip_internal
|
||||
|
||||
|
||||
hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
|
||||
@@ -990,6 +1004,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes )
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
|
||||
hipError_t hipMemGetInfo (size_t *free, size_t *total)
|
||||
{
|
||||
HIP_INIT_API(free, total);
|
||||
@@ -1024,6 +1039,28 @@ hipError_t hipMemGetInfo (size_t *free, size_t *total)
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
hipError_t hipMemPtrGetInfo(void *ptr, size_t *size)
|
||||
{
|
||||
HIP_INIT_API(ptr, size);
|
||||
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
if(ptr != nullptr && size != nullptr){
|
||||
hc::accelerator acc;
|
||||
hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0);
|
||||
am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr);
|
||||
if(status == AM_SUCCESS){
|
||||
*size = amPointerInfo._sizeBytes;
|
||||
}else{
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
}else{
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
|
||||
hipError_t hipFree(void* ptr)
|
||||
{
|
||||
HIP_INIT_API(ptr);
|
||||
@@ -1051,6 +1088,7 @@ hipError_t hipFree(void* ptr)
|
||||
return ihipLogStatus(hipStatus);
|
||||
}
|
||||
|
||||
|
||||
hipError_t hipHostFree(void* ptr)
|
||||
{
|
||||
HIP_INIT_API(ptr);
|
||||
@@ -1122,7 +1160,7 @@ hipError_t hipMemGetAddressRange ( hipDeviceptr_t* pbase, size_t* psize, hipDevi
|
||||
}
|
||||
else
|
||||
hipStatus = hipErrorInvalidDevicePointer;
|
||||
return hipStatus;
|
||||
return ihipLogStatus(hipStatus);
|
||||
}
|
||||
|
||||
|
||||
@@ -1141,25 +1179,25 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){
|
||||
}
|
||||
else
|
||||
hipStatus = hipErrorInvalidResourceHandle;
|
||||
|
||||
ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) handle;
|
||||
// Save the size of the pointer to hipIpcMemHandle
|
||||
(*handle)->psize = psize;
|
||||
iHandle->psize = psize;
|
||||
|
||||
#if USE_IPC
|
||||
// Create HSA ipc memory
|
||||
hsa_status_t hsa_status =
|
||||
hsa_amd_ipc_memory_create(devPtr, psize, &(*handle)->ipc_handle);
|
||||
hsa_amd_ipc_memory_create(devPtr, psize, (hsa_amd_ipc_memory_t*) &(iHandle->ipc_handle));
|
||||
if(hsa_status!= HSA_STATUS_SUCCESS)
|
||||
hipStatus = hipErrorMemoryAllocation;
|
||||
#else
|
||||
hipStatus = hipErrorRuntimeOther;
|
||||
#endif
|
||||
|
||||
return hipStatus;
|
||||
return ihipLogStatus(hipStatus);
|
||||
}
|
||||
|
||||
hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned int flags){
|
||||
// HIP_INIT_API ( devPtr, handle.handle , flags);
|
||||
HIP_INIT_API ( devPtr, &handle , flags);
|
||||
hipError_t hipStatus = hipSuccess;
|
||||
|
||||
#if USE_IPC
|
||||
@@ -1169,15 +1207,16 @@ hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned
|
||||
if(!agent)
|
||||
return hipErrorInvalidResourceHandle;
|
||||
|
||||
ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) &handle;
|
||||
//Attach ipc memory
|
||||
hsa_status_t hsa_status =
|
||||
hsa_amd_ipc_memory_attach(&handle->ipc_handle, handle->psize, 1, agent, devPtr);
|
||||
hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, 1, agent, devPtr);
|
||||
if(hsa_status != HSA_STATUS_SUCCESS)
|
||||
hipStatus = hipErrorMapBufferObjectFailed;
|
||||
#else
|
||||
hipStatus = hipErrorRuntimeOther;
|
||||
#endif
|
||||
return hipStatus;
|
||||
return ihipLogStatus(hipStatus);
|
||||
}
|
||||
|
||||
hipError_t hipIpcCloseMemHandle(void *devPtr){
|
||||
@@ -1192,7 +1231,7 @@ hipError_t hipIpcCloseMemHandle(void *devPtr){
|
||||
#else
|
||||
hipStatus = hipErrorRuntimeOther;
|
||||
#endif
|
||||
return hipStatus;
|
||||
return ihipLogStatus(hipStatus);
|
||||
}
|
||||
|
||||
// hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle){
|
||||
|
||||
@@ -218,31 +218,33 @@ hipError_t hipModuleUnload(hipModule_t hmod)
|
||||
{
|
||||
ret = hipErrorInvalidValue;
|
||||
}
|
||||
for(std::list<hipFunction_t>::iterator f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) {
|
||||
for(auto f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) {
|
||||
delete *f;
|
||||
}
|
||||
delete hmod;
|
||||
return ihipLogStatus(ret);
|
||||
}
|
||||
|
||||
hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char *name){
|
||||
|
||||
hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char *name)
|
||||
{
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
hipError_t ret = hipSuccess;
|
||||
|
||||
if(name == nullptr){
|
||||
if (name == nullptr){
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
if(ctx == nullptr){
|
||||
if (ctx == nullptr){
|
||||
ret = hipErrorInvalidContext;
|
||||
|
||||
}else{
|
||||
} else {
|
||||
std::string str(name);
|
||||
for(std::list<hipFunction_t>::iterator f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) {
|
||||
if((*f)->_name == str) {
|
||||
*func = *f;
|
||||
}
|
||||
return ret;
|
||||
for(auto f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) {
|
||||
if((*f)->_name == str) {
|
||||
*func = *f;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
ihipModuleSymbol_t *sym = new ihipModuleSymbol_t;
|
||||
int deviceId = ctx->getDevice()->_deviceId;
|
||||
|
||||
@@ -46,7 +46,7 @@ __device__ float asinhf(float x)
|
||||
}
|
||||
__device__ float atan2f(float y, float x)
|
||||
{
|
||||
return hc::precise_math::atan2f(x, y);
|
||||
return hc::precise_math::atan2f(y, x);
|
||||
}
|
||||
__device__ float atanf(float x)
|
||||
{
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
@@ -39,22 +39,67 @@ int main(){
|
||||
const size_t size = N * sizeof(float);
|
||||
A = (float*)malloc(size);
|
||||
HIPCHECK(hipHostRegister(A, size, 0));
|
||||
|
||||
|
||||
for(int i=0;i<N;i++){
|
||||
A[i] = float(1);
|
||||
}
|
||||
|
||||
|
||||
for(int i=0;i<num_devices;i++){
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
HIPCHECK(hipHostGetDevicePointer((void**)&Ad[i], A, 0));
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
HIPCHECK(hipHostGetDevicePointer((void**)&Ad[i], A, 0));
|
||||
}
|
||||
|
||||
// Reference the registered device pointer Ad from inside the kernel:
|
||||
for(int i=0;i<num_devices;i++){
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/512), dim3(512), 0, 0, Ad[i]);
|
||||
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
hipLaunchKernel(Inc, dim3(N/512), dim3(512), 0, 0, Ad[i]);
|
||||
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
}
|
||||
HIPASSERT(A[10] == 1.0f + float(num_devices));
|
||||
|
||||
|
||||
{
|
||||
// Sensitize HIP bug if device does not match where the memory was registered.
|
||||
HIPCHECK(hipSetDevice(0));
|
||||
|
||||
|
||||
|
||||
// Copy to B, this should be optimal pinned malloc copy:
|
||||
// Note we are using the host pointer here:
|
||||
float *Bh, *Bd;
|
||||
Bh = (float*)malloc(size);
|
||||
HIPCHECK(hipMalloc(&Bd, size));
|
||||
HIPCHECK(hipMemset(Bd, 13.0f, size));
|
||||
|
||||
for(int i=0;i<N;i++){
|
||||
A[i] = float(i);
|
||||
Bh[i] = 0.0f;
|
||||
}
|
||||
|
||||
HIPCHECK(hipMemcpy(Bd, A, size, hipMemcpyHostToDevice));
|
||||
|
||||
HIPCHECK(hipMemcpy(Bh, Bd, size, hipMemcpyDeviceToHost));
|
||||
|
||||
#if 0
|
||||
//TODO - disable check HCC patch for registered/locked memory usin device pointers is merged.
|
||||
for(int i=0;i<N;i++){
|
||||
if (Bh[i] != A[i]) {
|
||||
printf ("mismatch at Bh[%d]=%f, A[%d]=%f\n", i, Bh[i], i, A[i]);
|
||||
failed("mismatch");
|
||||
};
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
// Make sure the copy worked
|
||||
}
|
||||
|
||||
|
||||
|
||||
HIPCHECK(hipHostUnregister(A));
|
||||
passed();
|
||||
}
|
||||
|
||||
@@ -0,0 +1,52 @@
|
||||
/*
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
|
||||
* RUN: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include"test_common.h"
|
||||
|
||||
struct {
|
||||
float a;
|
||||
int b;
|
||||
void *c;
|
||||
} Struct ;
|
||||
|
||||
int main(){
|
||||
int *iPtr;
|
||||
float *fPtr;
|
||||
struct Struct *sPtr;
|
||||
size_t sSetSize = 1024, sGetSize;
|
||||
hipMalloc(&iPtr, sSetSize);
|
||||
hipMalloc(&fPtr, sSetSize);
|
||||
hipMalloc(&sPtr, sSetSize);
|
||||
hipMemPtrGetInfo(iPtr, &sGetSize);
|
||||
assert(sGetSize == sSetSize);
|
||||
hipMemPtrGetInfo(fPtr, &sGetSize);
|
||||
assert(sGetSize == sSetSize);
|
||||
hipMemPtrGetInfo(sPtr, &sGetSize);
|
||||
assert(sGetSize == sSetSize);
|
||||
passed();
|
||||
}
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren