Merge branch 'amd-develop' into amd-master

Change-Id: I8921e67e352e35e4c496e78a797fb309279ab7d0


[ROCm/clr commit: 62870fdc39]
Dieser Commit ist enthalten in:
Maneesh Gupta
2017-03-14 15:57:38 +05:30
Commit b054a30dec
21 geänderte Dateien mit 625 neuen und 306 gelöschten Zeilen
+1 -1
Datei anzeigen
@@ -189,7 +189,7 @@ if(HIP_PLATFORM STREQUAL "hcc")
execute_process(COMMAND ${HCC_HOME}/bin/hcc-config --ldflags OUTPUT_VARIABLE HCC_LD_FLAGS)
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${HCC_LD_FLAGS} -Wl,-Bsymbolic")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --amdgpu-target=gfx701 --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --amdgpu-target=gfx701 --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803 --amdgpu-target=gfx900")
add_library(hip_hcc SHARED ${SOURCE_FILES_RUNTIME})
target_link_libraries(hip_hcc PRIVATE hc_am)
add_library(hip_hcc_static STATIC ${SOURCE_FILES_RUNTIME})
+66 -33
Datei anzeigen
@@ -23,8 +23,8 @@ use File::Basename;
# HSA_PATH : Path to HSA dir (default /opt/rocm/hsa). Used on AMD platforms only.
if(scalar @ARGV == 0){
print "No Arguments passed, exiting ...\n";
exit(-1);
print "No Arguments passed, exiting ...\n";
exit(-1);
}
#---
@@ -74,6 +74,7 @@ $target_gfx701 = 0;
$target_gfx801 = 0;
$target_gfx802 = 0;
$target_gfx803 = 0;
$target_gfx900 = 0;
if ($HIP_PLATFORM eq "hcc") {
$HSA_PATH=$ENV{'HSA_PATH'} // "/opt/rocm/hsa";
@@ -189,18 +190,18 @@ if ($verbose & 0x4) {
# Handle code object generation
my $ISACMD="";
if($HIP_PLATFORM eq "hcc"){
$ISACMD .= "set ROCM_PATH=$ROCM_PATH && set ROCM_TARGET=$ROCM_TARGET && $HIP_PATH/bin/hccgenco.sh ";
if($ARGV[0] eq "--genco"){
foreach $isaarg (@ARGV[1..$#ARGV]){
$ISACMD .= " ";
$ISACMD .= $isaarg;
$ISACMD .= "set ROCM_PATH=$ROCM_PATH && set ROCM_TARGET=$ROCM_TARGET && $HIP_PATH/bin/hccgenco.sh ";
if($ARGV[0] eq "--genco"){
foreach $isaarg (@ARGV[1..$#ARGV]){
$ISACMD .= " ";
$ISACMD .= $isaarg;
}
if ($verbose & 0x1) {
print "hipcc-cmd: ", $ISACMD, "\n";
}
system($ISACMD) and die();
exit(0);
}
if ($verbose & 0x1) {
print "hipcc-cmd: ", $ISACMD, "\n";
}
system($ISACMD) and die();
exit(0);
}
}
if(($HIP_PLATFORM eq "hcc")){
@@ -210,18 +211,18 @@ if(($HIP_PLATFORM eq "hcc")){
}
if($HIP_PLATFORM eq "nvcc"){
$ISACMD .= "$HIP_PATH/bin/hipcc -ptx ";
if($ARGV[0] eq "--genco"){
foreach $isaarg (@ARGV[1..$#ARGV]){
$ISACMD .= " ";
$ISACMD .= $isaarg;
$ISACMD .= "$HIP_PATH/bin/hipcc -ptx ";
if($ARGV[0] eq "--genco"){
foreach $isaarg (@ARGV[1..$#ARGV]){
$ISACMD .= " ";
$ISACMD .= $isaarg;
}
if ($verbose & 0x1) {
print "hipcc-cmd: ", $ISACMD, "\n";
}
system($ISACMD) and die();
exit(0);
}
if ($verbose & 0x1) {
print "hipcc-cmd: ", $ISACMD, "\n";
}
system($ISACMD) and die();
exit(0);
}
}
my $toolArgs = ""; # arguments to pass to the hcc or nvcc tool
@@ -247,20 +248,25 @@ foreach $arg (@ARGV)
}
if($arg eq '--amdgpu-target=gfx701')
{
$target_gfx701 = 1;
$target_gfx701 = 1;
}
if($arg eq '--amdgpu-target=gfx801')
{
$target_gfx801 = 1;
$target_gfx801 = 1;
}
if($arg eq '--amdgpu-target=gfx802')
{
$target_gfx802 = 1;
$target_gfx802 = 1;
}
if($arg eq '--amdgpu-target=gfx803')
{
$target_gfx803 = 1;
$target_gfx803 = 1;
}
if($arg eq '--amdgpu-target=gfx900')
{
$target_gfx900 = 1;
}
if(($trimarg eq '-stdlib=libstdc++') and ($setStdLib eq 0))
{
$HIPCXXFLAGS .= $HCC_WA_FLAGS;
@@ -320,6 +326,33 @@ foreach $arg (@ARGV)
}
$toolArgs .= " $arg" unless $swallowArg;
}
foreach my $target (split(/,/, $ENV{HCC_AMDGPU_TARGET}))
{
if($target eq 'gfx701')
{
$target_gfx701 = 1;
}
if($target eq 'gfx801')
{
$target_gfx801 = 1;
}
if($target eq 'gfx802')
{
$target_gfx802 = 1;
}
if($target eq 'gfx803')
{
$target_gfx803 = 1;
}
if($target eq 'gfx900')
{
$target_gfx900 = 1;
}
}
if ($target_gfx701 eq 0 and $target_gfx801 eq 0 and $target_gfx802 eq 0 and $target_gfx803 eq 0 and $target_gfx900 eq 0)
{
$target_gfx803 = 1;
}
if($HIP_PLATFORM eq "hcc"){
@@ -343,12 +376,10 @@ if($HIP_PLATFORM eq "hcc"){
$HIPCXXFLAGS .= " -D__HIP_ARCH_GFX803__=1 ";
$ENV{HCC_EXTRA_LIBRARIES_GFX803}="$HIP_PATH/lib/hip_hc_gfx803.ll\n";
}
if ($target_gfx701 eq 0 and $target_gfx801 eq 0 and $target_gfx802 eq 0 and $target_gfx803 eq 0)
{
$HIPLDFLAGS .= " --amdgpu-target=gfx701 --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803";
$ENV{HCC_EXTRA_LIBRARIES_GFX803}="$HIP_PATH/lib/hip_hc_gfx803.ll\n";
if ($target_gfx900 eq 1) {
$HIPLDFLAGS .= " --amdgpu-target=gfx900";
$HIPCXXFLAGS .= " -D__HIP_ARCH_GFX900__=1 ";
}
}
if ($hasC and $HIP_PLATFORM eq 'nvcc') {
@@ -407,3 +438,5 @@ if ($runCmd) {
}
system ("$CMD") and die ();
}
# vim: ts=4:sw=4:expandtab:smartindent
@@ -166,10 +166,10 @@ Both nvcc and hcc make two passes over the code: one for host code and one for d
```
// #ifdef __CUDA_ARCH__
#ifdef __HIP_DEVICE_COMPILE__ && (__HIP_DEVICE_COMPILE__ == 1)
#if __HIP_DEVICE_COMPILE__
```
Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 0 or 1, and it doesn’t represent the feature capability of the target device.
Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 1 or undefined, and it doesn’t represent the feature capability of the target device.
### Compiler Defines: Summary
@@ -178,7 +178,7 @@ Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 0 or 1, and it doe
|HIP-related defines:|
|`__HIP_PLATFORM_HCC___`| Defined | Undefined | Defined if targeting hcc platform; undefined otherwise |
|`__HIP_PLATFORM_NVCC___`| Undefined | Defined | Defined if targeting nvcc platform; undefined otherwise |
|`__HIP_DEVICE_COMPILE__` | 1 if compiling for device; 0 if compiling for host |1 if compiling for device; 0 if compiling for host | Undefined
|`__HIP_DEVICE_COMPILE__` | 1 if compiling for device; undefined if compiling for host |1 if compiling for device; undefined if compiling for host | Undefined
|`__HIPCC__` | Defined | Defined | Undefined
|`__HIP_ARCH_*` | 0 or 1 depending on feature support (see below) | 0 or 1 depending on feature support (see below) | 0
|nvcc-related defines:|
@@ -28,7 +28,7 @@ THE SOFTWARE.
#if __cplusplus
#define COMPLEX_ADD_OP_OVERLOAD(type) \
__device__ __host__ static type operator + (const type& lhs, const type& rhs) { \
__device__ __host__ static inline type operator + (const type& lhs, const type& rhs) { \
type ret; \
ret.x = lhs.x + rhs.x ; \
ret.y = lhs.y + rhs.y ; \
@@ -36,7 +36,7 @@ __device__ __host__ static type operator + (const type& lhs, const type& rhs) {
}
#define COMPLEX_SUB_OP_OVERLOAD(type) \
__device__ __host__ static type operator - (const type& lhs, const type& rhs) { \
__device__ __host__ static inline type operator - (const type& lhs, const type& rhs) { \
type ret; \
ret.x = lhs.x - rhs.x; \
ret.y = lhs.y - rhs.y; \
@@ -44,7 +44,7 @@ __device__ __host__ static type operator - (const type& lhs, const type& rhs) {
}
#define COMPLEX_MUL_OP_OVERLOAD(type) \
__device__ __host__ static type operator * (const type& lhs, const type& rhs) { \
__device__ __host__ static inline type operator * (const type& lhs, const type& rhs) { \
type ret; \
ret.x = lhs.x * rhs.x - lhs.y * rhs.y; \
ret.y = lhs.x * rhs.y + lhs.y * rhs.x; \
@@ -52,7 +52,7 @@ __device__ __host__ static type operator * (const type& lhs, const type& rhs) {
}
#define COMPLEX_DIV_OP_OVERLOAD(type) \
__device__ __host__ static type operator / (const type& lhs, const type& rhs) { \
__device__ __host__ static inline type operator / (const type& lhs, const type& rhs) { \
type ret; \
ret.x = (lhs.x * rhs.x + lhs.y * rhs.y); \
ret.y = (rhs.x * lhs.y - lhs.x * rhs.y); \
@@ -88,7 +88,7 @@ __device__ __host__ static inline type& operator /= (type& lhs, const type& rhs)
}
#define COMPLEX_SCALAR_PRODUCT(type, type1) \
__device__ __host__ static type operator * (const type& lhs, type1 rhs) { \
__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \
type ret; \
ret.x = lhs.x * rhs; \
ret.y = lhs.y * rhs; \
@@ -226,6 +226,8 @@ __device__ int __all( int input);
__device__ int __any( int input);
__device__ unsigned long long int __ballot( int input);
#if __HIP_ARCH_GFX701__ == 0
// warp shuffle functions
#ifdef __cplusplus
__device__ int __shfl(int input, int lane, int width=warpSize);
@@ -247,6 +249,18 @@ __device__ float __shfl_down(float input, unsigned int lane_delta, int width);
__device__ float __shfl_xor(float input, int lane_mask, int width);
#endif
__device__ unsigned __hip_ds_bpermute(int index, unsigned src);
__device__ float __hip_ds_bpermutef(int index, float src);
__device__ unsigned __hip_ds_permute(int index, unsigned src);
__device__ float __hip_ds_permutef(int index, float src);
__device__ unsigned __hip_ds_swizzle(unsigned int src, int pattern);
__device__ float __hip_ds_swizzlef(float src, int pattern);
__device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl);
#endif
__host__ __device__ int min(int arg1, int arg2);
__host__ __device__ int max(int arg1, int arg2);
@@ -321,16 +335,6 @@ __device__ static inline void __threadfence(void) {
//__device__ void __threadfence_system(void) __attribute__((deprecated("Provided with workaround configuration, see hip_kernel_language.md for details")));
__device__ void __threadfence_system(void) ;
__device__ unsigned __hip_ds_bpermute(int index, unsigned src);
__device__ float __hip_ds_bpermutef(int index, float src);
__device__ unsigned __hip_ds_permute(int index, unsigned src);
__device__ float __hip_ds_permutef(int index, float src);
__device__ unsigned __hip_ds_swizzle(unsigned int src, int pattern);
__device__ float __hip_ds_swizzlef(float src, int pattern);
__device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl);
// doxygen end Fence Fence
/**
* @}
@@ -62,7 +62,12 @@ typedef struct ihipStream_t *hipStream_t;
#define hipIpcMemLazyEnablePeerAccess 0
typedef struct ihipIpcMemHandle_t *hipIpcMemHandle_t;
#define HIP_IPC_HANDLE_SIZE 64
typedef struct hipIpcMemHandle_st
{
char reserved[HIP_IPC_HANDLE_SIZE];
}hipIpcMemHandle_t;
//TODO: IPC event handle currently unsupported
struct ihipIpcEventHandle_t;
@@ -853,6 +858,8 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr)
* @param[out] ptr Pointer to the allocated memory
* @param[in] size Requested memory size
*
* If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
*
* @return #hipSuccess
*
* @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray, hipHostFree, hipHostMalloc
@@ -865,6 +872,8 @@ hipError_t hipMalloc(void** ptr, size_t size) ;
* @param[out] ptr Pointer to the allocated host pinned memory
* @param[in] size Requested memory size
*
* If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
*
* @return #hipSuccess, #hipErrorMemoryAllocation
*
* @deprecated use hipHostMalloc() instead
@@ -878,6 +887,8 @@ hipError_t hipMallocHost(void** ptr, size_t size) __attribute__((deprecated("use
* @param[in] size Requested memory size
* @param[in] flags Type of host memory allocation
*
* If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
*
* @return #hipSuccess, #hipErrorMemoryAllocation
*
* @see hipSetDeviceFlags, hipHostFree
@@ -891,6 +902,8 @@ hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int flags) ;
* @param[in] size Requested memory size
* @param[in] flags Type of host memory allocation
*
* If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
*
* @return #hipSuccess, #hipErrorMemoryAllocation
*
* @deprecated use hipHostMalloc() instead
@@ -975,6 +988,9 @@ hipError_t hipHostUnregister(void* hostPtr) ;
* @param[out] pitch Pitch for allocation (in bytes)
* @param[in] width Requested pitched allocation width (in bytes)
* @param[in] height Requested pitched allocation height
*
* If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
*
* @return Error code
*
* @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D, hipMalloc3DArray, hipHostMalloc
@@ -1236,6 +1252,9 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t st
hipError_t hipMemGetInfo (size_t * free, size_t * total) ;
hipError_t hipMemPtrGetInfo(void *ptr, size_t *size);
/**
* @brief Allocate an array on the device.
*
@@ -1260,7 +1260,7 @@ __device__ __host__ static inline type& operator op (type& val) { \
}
#define DECLOP_1VAR_POSTOP(type, op) \
__device__ __host__ static inline type operator op (type& val, int i) { \
__device__ __host__ static inline type operator op (type& val, int) { \
type ret; \
ret.x = val.x; \
val.x op; \
@@ -1326,7 +1326,7 @@ __device__ __host__ static inline type& operator op (type& val) { \
}
#define DECLOP_2VAR_POSTOP(type, op) \
__device__ __host__ static inline type operator op (type& val, int i) { \
__device__ __host__ static inline type operator op (type& val, int) { \
type ret; \
ret.x = val.x; \
ret.y = val.y; \
@@ -1337,7 +1337,7 @@ __device__ __host__ static inline type operator op (type& val, int i) { \
#define DECLOP_2VAR_COMP(type, op) \
__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
return lhs.x op rhs.x && lhs.y op rhs.y; \
return (lhs.x op rhs.x) && (lhs.y op rhs.y); \
}
#define DECLOP_2VAR_1IN_1OUT(type, op) \
@@ -1350,7 +1350,7 @@ __device__ __host__ static inline type operator op(type &rhs) { \
#define DECLOP_2VAR_1IN_BOOLOUT(type, op) \
__device__ __host__ static inline bool operator op (type &rhs) { \
return op rhs.x && op rhs.y; \
return (op rhs.x) && (op rhs.y); \
}
@@ -1401,7 +1401,7 @@ __device__ __host__ static inline type& operator op (type& val) { \
}
#define DECLOP_3VAR_POSTOP(type, op) \
__device__ __host__ static inline type operator op (type& val, int i) { \
__device__ __host__ static inline type operator op (type& val, int) { \
type ret; \
ret.x = val.x; \
ret.y = val.y; \
@@ -1414,7 +1414,7 @@ __device__ __host__ static inline type operator op (type& val, int i) { \
#define DECLOP_3VAR_COMP(type, op) \
__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
return lhs.x op rhs.x && lhs.y op rhs.y && lhs.z op rhs.z; \
return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \
}
#define DECLOP_3VAR_1IN_1OUT(type, op) \
@@ -1428,7 +1428,7 @@ __device__ __host__ static inline type operator op(type &rhs) { \
#define DECLOP_3VAR_1IN_BOOLOUT(type, op) \
__device__ __host__ static inline bool operator op (type &rhs) { \
return op rhs.x && op rhs.y && op rhs.z; \
return (op rhs.x) && (op rhs.y) && (op rhs.z); \
}
@@ -1484,7 +1484,7 @@ __device__ __host__ static inline type& operator op (type& val) { \
}
#define DECLOP_4VAR_POSTOP(type, op) \
__device__ __host__ static inline type operator op (type& val, int i) { \
__device__ __host__ static inline type operator op (type& val, int) { \
type ret; \
ret.x = val.x; \
ret.y = val.y; \
@@ -1499,7 +1499,7 @@ __device__ __host__ static inline type operator op (type& val, int i) { \
#define DECLOP_4VAR_COMP(type, op) \
__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
return lhs.x op rhs.x && lhs.y op rhs.y && lhs.z op rhs.z && lhs.w op rhs.w; \
return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \
}
#define DECLOP_4VAR_1IN_1OUT(type, op) \
@@ -1514,7 +1514,7 @@ __device__ __host__ static inline type operator op(type &rhs) { \
#define DECLOP_4VAR_1IN_BOOLOUT(type, op) \
__device__ __host__ static inline bool operator op (type &rhs) { \
return op rhs.x && op rhs.y && op rhs.z && op rhs.w; \
return (op rhs.x) && (op rhs.y) && (op rhs.z) && (op rhs.w); \
}
+4 -13
Datei anzeigen
@@ -27,13 +27,6 @@ THE SOFTWARE.
// Other compiler (GCC,ICC,etc) need to set one of these macros explicitly
#if defined(__HCC__)
#define __HIP_PLATFORM_HCC__
#if defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)
#define __HIP_DEVICE_COMPILE__ 1
#else
#define __HIP_DEVICE_COMPILE__ 0
#endif
#endif //__HCC__
// Auto enable __HIP_PLATFORM_NVCC__ if compiling with NVCC
@@ -43,14 +36,12 @@ THE SOFTWARE.
#define __HIPCC__
#endif
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ != 0)
#define __HIP_DEVICE_COMPILE__ 1
#else
#define __HIP_DEVICE_COMPILE__ 0
#endif
#endif //__NVCC__
// Auto enable __HIP_DEVICE_COMPILE__ if compiled in HCC or NVCC device path
#if (defined(__HCC_ACCELERATOR__) && __HCC_ACCELERATOR__ != 0) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ != 0)
#define __HIP_DEVICE_COMPILE__ 1
#endif
#if __HIP_DEVICE_COMPILE__ == 0
// 32-bit Atomics
@@ -106,6 +106,7 @@ typedef struct hipDeviceProp_t {
size_t maxSharedMemoryPerMultiProcessor; ///< Maximum Shared Memory Per Multiprocessor.
int isMultiGpuBoard; ///< 1 if device is on a multi-GPU board, 0 if not.
int canMapHostMemory; ///< Check whether HIP can map host memory
int gcnArch; ///< AMD GCN Arch Value. Eg: 803, 701
} hipDeviceProp_t;
@@ -32,7 +32,7 @@ THE SOFTWARE.
}\
}
/*
/*
* Square each element in the array A and write to array C.
*/
template <typename T>
@@ -58,16 +58,18 @@ int main(int argc, char *argv[])
hipDeviceProp_t props;
CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/));
printf ("info: running on device %s\n", props.name);
#ifdef __HIP_PLATFORM_HCC__
printf ("info: architecture on AMD GPU device is: %d\n",props.gcnArch);
#endif
printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
A_h = (float*)malloc(Nbytes);
CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
C_h = (float*)malloc(Nbytes);
CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
// Fill with Phi + i
for (size_t i=0; i<N; i++)
for (size_t i=0; i<N; i++)
{
A_h[i] = 1.618f + i;
A_h[i] = 1.618f + i;
}
printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
@@ -6,9 +6,12 @@
#include "ResultDatabase.h"
enum MallocMode {MallocPinned, MallocUnpinned, MallocRegistered};
// Cmdline parms:
bool p_verbose = false;
bool p_pinned = true;
MallocMode p_malloc_mode = MallocPinned;
int p_numa_ctl = -1;
int p_iterations = 10;
int p_beatsperiteration=1;
int p_device = 0;
@@ -21,7 +24,7 @@ bool p_h2d = true;
bool p_d2h = true;
bool p_bidir = true;
//#define NO_CHECK
#define CHECK_HIP_ERROR() \
@@ -36,6 +39,14 @@ bool p_bidir = true;
}
std::string mallocModeString(int mallocMode) {
switch (mallocMode) {
case MallocPinned : return "pinned";
case MallocUnpinned: return "unpinned";
case MallocRegistered: return "registered";
default: return "mallocmode-UNKNOWN";
};
};
// ****************************************************************************
int sizeToBytes(int size) {
@@ -106,7 +117,7 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
// Create some host memory pattern
float *hostMem = NULL;
if (p_pinned)
if (p_malloc_mode == MallocPinned)
{
hipHostMalloc((void**)&hostMem, sizeof(float) * numMaxFloats);
while (hipGetLastError() != hipSuccess)
@@ -116,20 +127,33 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
--nSizes;
if (nSizes < 1)
{
std::cerr << "Error: Couldn't allocated any pinned buffer\n";
std::cerr << "Error: Couldn't allocate any pinned buffer\n";
return;
}
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
hipHostMalloc((void**)&hostMem, sizeof(float) * numMaxFloats);
}
}
else
else if (p_malloc_mode == MallocUnpinned)
{
if (p_alignedhost) {
hostMem = (float*)aligned_alloc(p_alignedhost, numMaxFloats*sizeof(float));
} else {
hostMem = new float[numMaxFloats];
}
}
else if (p_malloc_mode == MallocRegistered)
{
if (p_numa_ctl == -1) {
hostMem = (float*)malloc(numMaxFloats*sizeof(float));
}
hipHostRegister(hostMem, numMaxFloats * sizeof(float), 0);
CHECK_HIP_ERROR();
}
else
{
assert(0);
}
for (int i = 0; i < numMaxFloats; i++)
@@ -146,7 +170,7 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
--nSizes;
if (nSizes < 1)
{
std::cerr << "Error: Couldn't allocated any device buffer\n";
std::cerr << "Error: Couldn't allocate any device buffer\n";
return;
}
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
@@ -199,8 +223,8 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
resultDB.AddResult(std::string("H2D_Bandwidth") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("H2D_Time") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "ms", t);
resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr, "ms", t);
if (p_onesize) {
break;
@@ -212,6 +236,8 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
numMaxFloats = sizeToBytes(p_onesize) / sizeof(float);
}
#ifndef NO_CHECK
// Check. First reset the host memory, then copy-back result. Then compare against original ref value.
for (int i = 0; i < numMaxFloats; i++)
{
@@ -225,24 +251,36 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
printf ("error: H2D. i=%d reference:%6.f != copyback:%6.2f\n", i, ref, hostMem[i]);
}
}
#endif
// Cleanup
hipFree((void*)device);
CHECK_HIP_ERROR();
if (p_pinned)
{
switch (p_malloc_mode) {
case MallocPinned:
hipHostFree((void*)hostMem);
CHECK_HIP_ERROR();
}
else
{
break;
case MallocUnpinned:
if (p_alignedhost) {
delete[] hostMem;
} else {
free(hostMem);
}
break;
case MallocRegistered:
hipHostUnregister(hostMem);
CHECK_HIP_ERROR();
free(hostMem);
break;
default:
assert(0);
}
hipEventDestroy(start);
hipEventDestroy(stop);
}
@@ -257,38 +295,56 @@ void RunBenchmark_D2H(ResultDatabase &resultDB)
// Create some host memory pattern
float *hostMem1;
float *hostMem2;
if (p_pinned)
if (p_malloc_mode == MallocPinned)
{
hipHostMalloc((void**)&hostMem1, sizeof(float)*numMaxFloats);
hipError_t err1 = hipGetLastError();
hipHostMalloc((void**)&hostMem2, sizeof(float)*numMaxFloats);
hipError_t err2 = hipGetLastError();
while (err1 != hipSuccess || err2 != hipSuccess)
{
// free the first buffer if only the second failed
if (err1 == hipSuccess)
hipHostFree((void*)hostMem1);
while (err1 != hipSuccess || err2 != hipSuccess)
{
// free the first buffer if only the second failed
if (err1 == hipSuccess)
hipHostFree((void*)hostMem1);
// drop the size and try again
if (p_verbose) std::cout << " - dropping size allocating pinned mem\n";
--nSizes;
if (nSizes < 1)
{
std::cerr << "Error: Couldn't allocated any pinned buffer\n";
return;
}
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
hipHostMalloc((void**)&hostMem1, sizeof(float)*numMaxFloats);
err1 = hipGetLastError();
hipHostMalloc((void**)&hostMem2, sizeof(float)*numMaxFloats);
err2 = hipGetLastError();
}
}
else
// drop the size and try again
if (p_verbose) std::cout << " - dropping size allocating pinned mem\n";
--nSizes;
if (nSizes < 1)
{
std::cerr << "Error: Couldn't allocate any pinned buffer\n";
return;
}
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
hipHostMalloc((void**)&hostMem1, sizeof(float)*numMaxFloats);
err1 = hipGetLastError();
hipHostMalloc((void**)&hostMem2, sizeof(float)*numMaxFloats);
err2 = hipGetLastError();
}
}
else if (p_malloc_mode == MallocUnpinned)
{
hostMem1 = new float[numMaxFloats];
hostMem2 = new float[numMaxFloats];
}
else if (p_malloc_mode == MallocRegistered)
{
if (p_numa_ctl == -1) {
hostMem1 = (float*)malloc(numMaxFloats*sizeof(float));
hostMem2 = (float*)malloc(numMaxFloats*sizeof(float));
}
hipHostRegister(hostMem1, numMaxFloats * sizeof(float), 0);
CHECK_HIP_ERROR();
hipHostRegister(hostMem2, numMaxFloats * sizeof(float), 0);
CHECK_HIP_ERROR();
}
else
{
assert(0);
}
for (int i=0; i<numMaxFloats; i++)
hostMem1[i] = i % 77;
@@ -301,7 +357,7 @@ void RunBenchmark_D2H(ResultDatabase &resultDB)
--nSizes;
if (nSizes < 1)
{
std::cerr << "Error: Couldn't allocated any device buffer\n";
std::cerr << "Error: Couldn't allocate any device buffer\n";
return;
}
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
@@ -358,8 +414,8 @@ void RunBenchmark_D2H(ResultDatabase &resultDB)
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
resultDB.AddResult(std::string("D2H_Bandwidth") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("D2H_Time") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "ms", t);
resultDB.AddResult(std::string("D2H_Bandwidth") +"_" + mallocModeString(p_malloc_mode) , sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("D2H_Time") +"_" + mallocModeString(p_malloc_mode) , sizeStr, "ms", t);
if (p_onesize) {
break;
}
@@ -381,20 +437,31 @@ void RunBenchmark_D2H(ResultDatabase &resultDB)
// Cleanup
hipFree((void*)device);
CHECK_HIP_ERROR();
if (p_pinned)
{
switch (p_malloc_mode) {
case MallocPinned:
hipHostFree((void*)hostMem1);
CHECK_HIP_ERROR();
hipHostFree((void*)hostMem2);
CHECK_HIP_ERROR();
}
else
{
break;
case MallocUnpinned:
delete[] hostMem1;
delete[] hostMem2;
hipEventDestroy(start);
hipEventDestroy(stop);
break;
case MallocRegistered:
hipHostUnregister(hostMem1);
CHECK_HIP_ERROR();
free(hostMem1);
hipHostUnregister(hostMem2);
free(hostMem2);
break;
default:
assert(0);
}
hipEventDestroy(start);
hipEventDestroy(stop);
}
@@ -409,7 +476,7 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
// Create some host memory pattern
float *hostMem[2] = {NULL, NULL};
if (p_pinned)
if (p_malloc_mode == MallocPinned)
{
while (1)
{
@@ -424,18 +491,34 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
--nSizes;
if (nSizes < 1)
{
std::cerr << "Error: Couldn't allocated any pinned buffer\n";
std::cerr << "Error: Couldn't allocate any pinned buffer\n";
return;
}
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
}
}
}
else
else if (p_malloc_mode == MallocUnpinned)
{
hostMem[0] = new float[numMaxFloats];
hostMem[1] = new float[numMaxFloats];
}
else if (p_malloc_mode == MallocRegistered)
{
if (p_numa_ctl == -1) {
hostMem[0] = (float*)malloc(numMaxFloats*sizeof(float));
hostMem[1] = (float*)malloc(numMaxFloats*sizeof(float));
}
hipHostRegister(hostMem[0], numMaxFloats * sizeof(float), 0);
CHECK_HIP_ERROR();
hipHostRegister(hostMem[1], numMaxFloats * sizeof(float), 0);
CHECK_HIP_ERROR();
}
else
{
assert(0);
}
for (int i = 0; i < numMaxFloats; i++)
{
@@ -459,7 +542,7 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
--nSizes;
if (nSizes < 1)
{
std::cerr << "Error: Couldn't allocated any device buffer\n";
std::cerr << "Error: Couldn't allocate any device buffer\n";
return;
}
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
@@ -512,8 +595,8 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
double speed = (double(sizeToBytes(thisSize)) / (1000*1000)) / t;
char sizeStr[256];
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
resultDB.AddResult(std::string("Bidir_Bandwidth") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("Bidir_Time") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "ms", t);
resultDB.AddResult(std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode), sizeStr, "ms", t);
}
}
@@ -521,17 +604,27 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
hipFree((void*)deviceMem[0]);
hipFree((void*)deviceMem[1]);
CHECK_HIP_ERROR();
if (p_pinned)
{
switch (p_malloc_mode) {
case MallocPinned:
hipHostFree((void*)hostMem[0]);
hipHostFree((void*)hostMem[1]);
CHECK_HIP_ERROR();
}
else
{
break;
case MallocUnpinned:
delete[] hostMem[0];
delete[] hostMem[1];
}
break;
case MallocRegistered:
for (int i=0; i<2; i++) {
hipHostUnregister(hostMem[i]);
CHECK_HIP_ERROR();
free(hostMem[i]);
}
break;
default:
assert(0);
};
hipEventDestroy(start);
hipEventDestroy(stop);
hipStreamDestroy(stream[0]);
@@ -557,7 +650,7 @@ void printConfig() {
hipDeviceProp_t props;
hipGetDeviceProperties(&props, p_device);
printf ("Device:%s Mem=%.1fGB #CUs=%d Freq=%.0fMhz Pinned=%s\n", props.name, props.totalGlobalMem/1024.0/1024.0/1024.0, props.multiProcessorCount, props.clockRate/1000.0, p_pinned ? "YES" : "NO");
printf ("Device:%s Mem=%.1fGB #CUs=%d Freq=%.0fMhz MallocMode=%s\n", props.name, props.totalGlobalMem/1024.0/1024.0/1024.0, props.multiProcessorCount, props.clockRate/1000.0, mallocModeString(p_malloc_mode).c_str());
}
void help() {
@@ -601,7 +694,9 @@ int parseStandardArguments(int argc, char *argv[])
failed("Bad onesize argument");
}
} else if (!strcmp(arg, "--unpinned")) {
p_pinned = 0;
p_malloc_mode = MallocUnpinned;
} else if (!strcmp(arg, "--registered")) {
p_malloc_mode = MallocRegistered;
} else if (!strcmp(arg, "--h2d")) {
p_h2d = true;
p_d2h = false;
@@ -3,6 +3,10 @@ ifeq (,$(HIP_PATH))
HIP_PATH=../../..
endif
ifeq (gfx701, $(findstring gfx701,$(HCC_AMDGPU_TARGET)))
$(error gfx701 is not a supported device for this sample)
endif
HIPCC=$(HIP_PATH)/bin/hipcc
TARGET=hcc
@@ -22,7 +26,7 @@ CXX=$(HIPCC)
$(EXECUTABLE): $(OBJECTS)
$(HIPCC) $(OBJECTS) -o $@
$(HIPCC) $(OBJECTS) -o $@
test: $(EXECUTABLE)
@@ -33,4 +37,3 @@ clean:
rm -f $(EXECUTABLE)
rm -f $(OBJECTS)
rm -f $(HIP_PATH)/src/*.o
@@ -3,6 +3,10 @@ ifeq (,$(HIP_PATH))
HIP_PATH=../../..
endif
ifeq (gfx701, $(findstring gfx701,$(HCC_AMDGPU_TARGET)))
$(error gfx701 is not a supported device for this sample)
endif
HIPCC=$(HIP_PATH)/bin/hipcc
TARGET=hcc
+45 -16
Datei anzeigen
@@ -758,11 +758,24 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop)
prop->isMultiGpuBoard = 0 ? gpuAgentsCount < 2 : 1;
// Get agent name
#if HIP_USE_PRODUCT_NAME
err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, &(prop->name));
#else
err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_NAME, &(prop->name));
#endif
char archName[256];
err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_NAME, &archName);
if(strcmp(archName,"gfx701")==0){
prop->gcnArch = 701;
}
if(strcmp(archName,"gfx801")==0){
prop->gcnArch = 801;
}
if(strcmp(archName,"gfx802")==0){
prop->gcnArch = 802;
}
if(strcmp(archName,"gfx803")==0){
prop->gcnArch = 803;
}
DeviceErrorCheck(err);
// Get agent node
@@ -1790,6 +1803,20 @@ void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind,
}
void printPointerInfo(unsigned dbFlag, const char *tag, const void *ptr, const hc::AmPointerInfo &ptrInfo)
{
tprintf (dbFlag, " %s=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d registered=%d\n",
tag, ptr,
ptrInfo._hostPointer, ptrInfo._devicePointer, ptrInfo._sizeBytes,
ptrInfo._appId, ptrInfo._sizeBytes != 0, ptrInfo._isInDeviceMem, !ptrInfo._isAmManaged);
}
// TODO : For registered and host memory, if the portable flag is set, we need to recognize that and perform appropriate copy operation.
// What can happen now is that Portable memory is mapped into multiple devices but Peer access is not enabled. i
// The peer detection logic doesn't see that the memory is already mapped and so tries to use an unpinned copy algorithm. If this is PinInPlace, then an error can occur.
// Need to track Portable flag correctly or use new RT functionality to query the peer status for the pointer.
//
// TODO - remove kind parm from here or use it below?
void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn)
{
@@ -1806,6 +1833,16 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes,
bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS);
bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS);
// Some code in HCC and in printPointerInfo uses _sizeBytes==0 as an indication ptr is not valid, so check it here:
if (!dstTracked) {
assert (dstPtrInfo._sizeBytes == 0);
}
if (!srcTracked) {
assert (srcPtrInfo._sizeBytes == 0);
}
hc::hcCommandKind hcCopyDir;
ihipCtx_t *copyDevice;
bool forceUnpinnedCopy;
@@ -1818,12 +1855,8 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes,
dst, dstPtrInfo._appId, dstPtrInfo._isInDeviceMem,
src, srcPtrInfo._appId, srcPtrInfo._isInDeviceMem,
sizeBytes, hcMemcpyStr(hcCopyDir), forceUnpinnedCopy);
tprintf (DB_COPY, " dst=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n",
dst, dstPtrInfo._hostPointer, dstPtrInfo._devicePointer, dstPtrInfo._sizeBytes,
dstPtrInfo._appId, dstTracked, dstPtrInfo._isInDeviceMem);
tprintf (DB_COPY, " src=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n",
src, srcPtrInfo._hostPointer, srcPtrInfo._devicePointer, srcPtrInfo._sizeBytes,
srcPtrInfo._appId, srcTracked, srcPtrInfo._isInDeviceMem);
printPointerInfo(DB_COPY, " dst", dst, dstPtrInfo);
printPointerInfo(DB_COPY, " src", src, srcPtrInfo);
this->ensureHaveQueue(crit);
@@ -1908,12 +1941,8 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes
dst, dstPtrInfo._appId, dstPtrInfo._isInDeviceMem,
src, srcPtrInfo._appId, srcPtrInfo._isInDeviceMem,
sizeBytes, hcMemcpyStr(hcCopyDir), forceUnpinnedCopy);
tprintf (DB_COPY, " dst=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n",
dst, dstPtrInfo._hostPointer, dstPtrInfo._devicePointer, dstPtrInfo._sizeBytes,
dstPtrInfo._appId, dstTracked, dstPtrInfo._isInDeviceMem);
tprintf (DB_COPY, " src=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n",
src, srcPtrInfo._hostPointer, srcPtrInfo._devicePointer, srcPtrInfo._sizeBytes,
srcPtrInfo._appId, srcTracked, srcPtrInfo._isInDeviceMem);
printPointerInfo(DB_COPY, " dst", dst, dstPtrInfo);
printPointerInfo(DB_COPY, " src", src, srcPtrInfo);
// "tracked" really indicates if the pointer's virtual address is available in the GPU address space.
// If both pointers are not tracked, we need to fall back to a sync copy.
+3 -3
Datei anzeigen
@@ -36,7 +36,7 @@ THE SOFTWARE.
#error("This version of HIP requires a newer version of HCC.");
#endif
#define USE_IPC 0
#define USE_IPC 1
//---
// Environment variables:
@@ -326,15 +326,15 @@ const hipStream_t hipStreamNull = 0x0;
/**
* HIP IPC Handle Size
*/
#define HIP_IPC_HANDLE_SIZE 64
#define HIP_IPC_RESERVED_SIZE 24
class ihipIpcMemHandle_t
{
public:
#if USE_IPC
hsa_amd_ipc_memory_t ipc_handle; ///< ipc memory handle on ROCr
#endif
char reserved[HIP_IPC_HANDLE_SIZE];
size_t psize;
char reserved[HIP_IPC_RESERVED_SIZE];
};
+167 -128
Datei anzeigen
@@ -30,11 +30,109 @@ THE SOFTWARE.
#include "hip/hcc_detail/hip_texture.h"
#include <hc_am.hpp>
// Internal HIP APIS:
namespace hip_internal {
hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
{
hipError_t e = hipSuccess;
stream = ihipSyncAndResolveStream(stream);
if ((dst == NULL) || (src == NULL)) {
e= hipErrorInvalidValue;
} else if (stream) {
try {
stream->locked_copyAsync(dst, src, sizeBytes, kind);
}
catch (ihipException ex) {
e = ex._code;
}
} else {
e = hipErrorInvalidValue;
}
return e;
}
// return 0 on success or -1 on error:
int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags)
{
int ret = 0;
auto device = ctx->getWriteableDevice();
hc::am_memtracker_update(ptr, device->_deviceId, hipFlags);
int peerCnt=0;
{
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
// the peerCnt always stores self so make sure the trace actually
peerCnt = crit->peerCnt();
tprintf(DB_MEM, " allow access to %d other peer(s)\n", peerCnt-1);
if (peerCnt > 1) {
//printf ("peer self access\n");
// TODOD - remove me:
for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) {
tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":"");
};
hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr);
if (s != HSA_STATUS_SUCCESS) {
ret = -1;
}
}
}
return ret;
}
// Allocate a new pointer with am_alloc and share with all valid peers.
// Returns null-ptr if a memory error occurs (either allocation or sharing)
void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsigned amFlags, unsigned hipFlags)
{
void *ptr = nullptr;
auto device = ctx->getWriteableDevice();
ptr = hc::am_alloc(sizeBytes, device->_acc, amFlags);
tprintf(DB_MEM, " alloc %s ptr:%p size:%zu on dev:%d\n",
msg, ptr, sizeBytes, device->_deviceId);
if (ptr != nullptr) {
int r = sharePtr(ptr, ctx, hipFlags);
if (r != 0) {
ptr = nullptr;
}
}
return ptr;
}
} // end namespace hip_internal
//-------------------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------
// Memory
//
//
//
//HIP uses several "app*" fields HC memory tracker to track state necessary for the HIP API.
//_appId : DeviceID. For device mem, this is device where the memory is physically allocated.
// For host or registered mem, this is the current device when the memory is allocated or registered. This device will have a GPUVM mapping for the host mem.
//
//_appAllocationFlags : These are flags provided by the user when allocation is performed. They are returned to user in hipHostGetFlags and other APIs.
// TODO - add more info here when available.
//
hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr)
{
HIP_INIT_API(attributes, ptr);
@@ -78,6 +176,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr)
return ihipLogStatus(e);
}
hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsigned flags)
{
HIP_INIT_API(devicePointer, hostPointer, flags);
@@ -102,6 +201,7 @@ hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsi
return ihipLogStatus(e);
}
hipError_t hipMalloc(void** ptr, size_t sizeBytes)
{
HIP_INIT_API(ptr, sizeBytes);
@@ -118,37 +218,8 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)
if (ctx) {
auto device = ctx->getWriteableDevice();
const unsigned am_flags = 0;
*ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags);
*ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, 0/*amFlags*/, 0/*hipFlags*/);
if (sizeBytes && (*ptr == NULL)) {
hip_status = hipErrorMemoryAllocation;
} else {
hc::am_memtracker_update(*ptr, device->_deviceId, 0);
int peerCnt=0;
{
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
// the peerCnt always stores self so make sure the trace actually
peerCnt = crit->peerCnt();
tprintf(DB_MEM, " allocated device_mem ptr:%p size:%zu on dev:%d and allow access to %d other peer(s)\n",
*ptr, sizeBytes, device->_deviceId, peerCnt-1);
if (peerCnt > 1) {
//printf ("peer self access\n");
// TODOD - remove me:
for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) {
tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":"");
};
hsa_status_t e = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr);
if (e != HSA_STATUS_SUCCESS) {
hip_status = hipErrorMemoryAllocation;
}
}
}
}
} else {
hip_status = hipErrorMemoryAllocation;
}
@@ -188,54 +259,36 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
}
else {
auto device = ctx->getWriteableDevice();
if(HIP_COHERENT_HOST_ALLOC){
// Force to allocate finedgrained system memory
*ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned);
if(sizeBytes < 1 && (*ptr == NULL)){
hip_status = hipErrorMemoryAllocation;
} else {
hc::am_memtracker_update(*ptr, device->_deviceId, amHostCoherent);
}
tprintf(DB_MEM, " %s: finegrained system memory ptr=%p\n", __func__, *ptr);
}
else{
// TODO - am_alloc requires writeable __acc, perhaps could be refactored?
// TODO - hipHostMallocMapped is be ignored on ROCM - all memory is mapped to host address space as WC.
*ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned);
if (*ptr == NULL) {
hip_status = hipErrorMemoryAllocation;
} else {
hc::am_memtracker_update(*ptr, device->_deviceId, flags);
// TODO-hipHostMallocPortable should map the host memory into all contexts, regardless of peer status.
int peerCnt=0;
{
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
peerCnt = crit->peerCnt();
if (peerCnt > 1) {
hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr);
}
}
tprintf(DB_MEM, "allocated pinned_host ptr:%p size:%zu on dev:%d and allow access to %d other peer(s)\n", *ptr, sizeBytes, device->_deviceId, peerCnt-1);
}
}
unsigned amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned;
*ptr = hip_internal::allocAndSharePtr(HIP_COHERENT_HOST_ALLOC ? "finegrained_host":"pinned_host",
sizeBytes, ctx, amFlags, flags);
if(sizeBytes && (*ptr == NULL)){
hip_status = hipErrorMemoryAllocation;
}
}
}
if (HIP_SYNC_HOST_ALLOC) {
hipDeviceSynchronize();
}
return ihipLogStatus(hip_status);
}
// Deprecated function:
hipError_t hipMallocHost(void** ptr, size_t sizeBytes)
{
return hipHostMalloc(ptr, sizeBytes, 0);
}
// Deprecated function:
hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags)
{
return hipHostMalloc(ptr, sizeBytes, flags);
};
// width in bytes
hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height)
{
@@ -257,22 +310,11 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height
auto device = ctx->getWriteableDevice();
const unsigned am_flags = 0;
*ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags);
*ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, am_flags, 0);
if (sizeBytes && (*ptr == NULL)) {
hip_status = hipErrorMemoryAllocation;
} else {
hc::am_memtracker_update(*ptr, device->_deviceId, 0);
{
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
if (crit->peerCnt() > 1) { // peerCnt includes self so only call allow_access if other peers involved:
hsa_status_t hsa_status = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr);
if (hsa_status != HSA_STATUS_SUCCESS) {
hip_status = hipErrorMemoryAllocation;
}
}
}
}
}
} else {
hip_status = hipErrorMemoryAllocation;
}
@@ -306,41 +348,31 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
void ** ptr = &array[0]->data;
if (ctx) {
auto device = ctx->getWriteableDevice();
const unsigned am_flags = 0;
const size_t size = width*height;
size_t allocSize = 0;
switch(desc->f) {
case hipChannelFormatKindSigned:
*ptr = hc::am_alloc(size*sizeof(int), device->_acc, am_flags);
allocSize = size * sizeof(int);
break;
case hipChannelFormatKindUnsigned:
*ptr = hc::am_alloc(size*sizeof(unsigned int), device->_acc, am_flags);
allocSize = size * sizeof(unsigned int);
break;
case hipChannelFormatKindFloat:
*ptr = hc::am_alloc(size*sizeof(float), device->_acc, am_flags);
allocSize = size * sizeof(float);
break;
case hipChannelFormatKindNone:
*ptr = hc::am_alloc(size*sizeof(size_t), device->_acc, am_flags);
allocSize = size * sizeof(size_t);
break;
default:
hip_status = hipErrorUnknown;
break;
}
*ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, am_flags, 0);
if (size && (*ptr == NULL)) {
hip_status = hipErrorMemoryAllocation;
} else {
hc::am_memtracker_update(*ptr, device->_deviceId, 0);
{
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
if (crit->peerCnt() > 1) { // peerCnt includes self so only call allow_access if other peers involved:
hsa_status_t hsa_status = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr);
if (hsa_status != HSA_STATUS_SUCCESS) {
hip_status = hipErrorMemoryAllocation;
}
}
}
}
}
} else {
hip_status = hipErrorMemoryAllocation;
@@ -373,6 +405,8 @@ hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr)
return ihipLogStatus(hip_status);
}
// TODO - need to fix several issues here related to P2P access, host memory fallback.
hipError_t hipHostRegister(void *hostPtr, size_t sizeBytes, unsigned int flags)
{
HIP_INIT_API(hostPtr, sizeBytes, flags);
@@ -392,19 +426,21 @@ hipError_t hipHostRegister(void *hostPtr, size_t sizeBytes, unsigned int flags)
hip_status = hipErrorHostMemoryAlreadyRegistered;
} else {
auto ctx = ihipGetTlsDefaultCtx();
if(hostPtr == NULL){
if (hostPtr == NULL) {
return ihipLogStatus(hipErrorInvalidValue);
}
//TODO-test : multi-gpu access to registered host memory.
if (ctx) {
auto device = ctx->getWriteableDevice();
if(flags == hipHostRegisterDefault || flags == hipHostRegisterPortable || flags == hipHostRegisterMapped){
auto device = ctx->getWriteableDevice();
std::vector<hc::accelerator>vecAcc;
for(int i=0;i<g_deviceCnt;i++){
vecAcc.push_back(ihipGetDevice(i)->_acc);
}
am_status = hc::am_memory_host_lock(device->_acc, hostPtr, sizeBytes, &vecAcc[0], vecAcc.size());
hc::am_memtracker_update(hostPtr, device->_deviceId, flags);
tprintf(DB_MEM, " %s registered ptr=%p\n", __func__, hostPtr);
tprintf(DB_MEM, " %s registered ptr=%p and allowed access to %zu peers\n", __func__, hostPtr, vecAcc.size());
if(am_status == AM_SUCCESS){
hip_status = hipSuccess;
} else {
@@ -603,6 +639,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind
return ihipLogStatus(e);
}
hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes)
{
HIP_INIT_CMD_API(dst, src, sizeBytes);
@@ -624,6 +661,7 @@ hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes)
return ihipLogStatus(e);
}
hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes)
{
HIP_INIT_CMD_API(dst, src, sizeBytes);
@@ -645,6 +683,7 @@ hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes)
return ihipLogStatus(e);
}
hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes)
{
HIP_INIT_CMD_API(dst, src, sizeBytes);
@@ -666,6 +705,7 @@ hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeByte
return ihipLogStatus(e);
}
hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes)
{
HIP_INIT_CMD_API(dst, src, sizeBytes);
@@ -689,32 +729,6 @@ hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes)
// Internal copy sync:
namespace hip_internal {
hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
{
hipError_t e = hipSuccess;
stream = ihipSyncAndResolveStream(stream);
if ((dst == NULL) || (src == NULL)) {
e= hipErrorInvalidValue;
} else if (stream) {
try {
stream->locked_copyAsync(dst, src, sizeBytes, kind);
}
catch (ihipException ex) {
e = ex._code;
}
} else {
e = hipErrorInvalidValue;
}
return e;
}
} // end namespace hip_internal
hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
@@ -990,6 +1004,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes )
return ihipLogStatus(e);
}
hipError_t hipMemGetInfo (size_t *free, size_t *total)
{
HIP_INIT_API(free, total);
@@ -1024,6 +1039,28 @@ hipError_t hipMemGetInfo (size_t *free, size_t *total)
return ihipLogStatus(e);
}
hipError_t hipMemPtrGetInfo(void *ptr, size_t *size)
{
HIP_INIT_API(ptr, size);
hipError_t e = hipSuccess;
if(ptr != nullptr && size != nullptr){
hc::accelerator acc;
hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0);
am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr);
if(status == AM_SUCCESS){
*size = amPointerInfo._sizeBytes;
}else{
e = hipErrorInvalidValue;
}
}else{
e = hipErrorInvalidValue;
}
return ihipLogStatus(e);
}
hipError_t hipFree(void* ptr)
{
HIP_INIT_API(ptr);
@@ -1051,6 +1088,7 @@ hipError_t hipFree(void* ptr)
return ihipLogStatus(hipStatus);
}
hipError_t hipHostFree(void* ptr)
{
HIP_INIT_API(ptr);
@@ -1122,7 +1160,7 @@ hipError_t hipMemGetAddressRange ( hipDeviceptr_t* pbase, size_t* psize, hipDevi
}
else
hipStatus = hipErrorInvalidDevicePointer;
return hipStatus;
return ihipLogStatus(hipStatus);
}
@@ -1141,25 +1179,25 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){
}
else
hipStatus = hipErrorInvalidResourceHandle;
ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) handle;
// Save the size of the pointer to hipIpcMemHandle
(*handle)->psize = psize;
iHandle->psize = psize;
#if USE_IPC
// Create HSA ipc memory
hsa_status_t hsa_status =
hsa_amd_ipc_memory_create(devPtr, psize, &(*handle)->ipc_handle);
hsa_amd_ipc_memory_create(devPtr, psize, (hsa_amd_ipc_memory_t*) &(iHandle->ipc_handle));
if(hsa_status!= HSA_STATUS_SUCCESS)
hipStatus = hipErrorMemoryAllocation;
#else
hipStatus = hipErrorRuntimeOther;
#endif
return hipStatus;
return ihipLogStatus(hipStatus);
}
hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned int flags){
// HIP_INIT_API ( devPtr, handle.handle , flags);
HIP_INIT_API ( devPtr, &handle , flags);
hipError_t hipStatus = hipSuccess;
#if USE_IPC
@@ -1169,15 +1207,16 @@ hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned
if(!agent)
return hipErrorInvalidResourceHandle;
ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) &handle;
//Attach ipc memory
hsa_status_t hsa_status =
hsa_amd_ipc_memory_attach(&handle->ipc_handle, handle->psize, 1, agent, devPtr);
hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, 1, agent, devPtr);
if(hsa_status != HSA_STATUS_SUCCESS)
hipStatus = hipErrorMapBufferObjectFailed;
#else
hipStatus = hipErrorRuntimeOther;
#endif
return hipStatus;
return ihipLogStatus(hipStatus);
}
hipError_t hipIpcCloseMemHandle(void *devPtr){
@@ -1192,7 +1231,7 @@ hipError_t hipIpcCloseMemHandle(void *devPtr){
#else
hipStatus = hipErrorRuntimeOther;
#endif
return hipStatus;
return ihipLogStatus(hipStatus);
}
// hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle){
+12 -10
Datei anzeigen
@@ -218,31 +218,33 @@ hipError_t hipModuleUnload(hipModule_t hmod)
{
ret = hipErrorInvalidValue;
}
for(std::list<hipFunction_t>::iterator f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) {
for(auto f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) {
delete *f;
}
delete hmod;
return ihipLogStatus(ret);
}
hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char *name){
hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char *name)
{
auto ctx = ihipGetTlsDefaultCtx();
hipError_t ret = hipSuccess;
if(name == nullptr){
if (name == nullptr){
return ihipLogStatus(hipErrorInvalidValue);
}
if(ctx == nullptr){
if (ctx == nullptr){
ret = hipErrorInvalidContext;
}else{
} else {
std::string str(name);
for(std::list<hipFunction_t>::iterator f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) {
if((*f)->_name == str) {
*func = *f;
}
return ret;
for(auto f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) {
if((*f)->_name == str) {
*func = *f;
return ret;
}
}
ihipModuleSymbol_t *sym = new ihipModuleSymbol_t;
int deviceId = ctx->getDevice()->_deviceId;
+1 -1
Datei anzeigen
@@ -46,7 +46,7 @@ __device__ float asinhf(float x)
}
__device__ float atan2f(float y, float x)
{
return hc::precise_math::atan2f(x, y);
return hc::precise_math::atan2f(y, x);
}
__device__ float atanf(float x)
{
@@ -1,5 +1,5 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -39,22 +39,67 @@ int main(){
const size_t size = N * sizeof(float);
A = (float*)malloc(size);
HIPCHECK(hipHostRegister(A, size, 0));
for(int i=0;i<N;i++){
A[i] = float(1);
}
for(int i=0;i<num_devices;i++){
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipHostGetDevicePointer((void**)&Ad[i], A, 0));
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipHostGetDevicePointer((void**)&Ad[i], A, 0));
}
// Reference the registered device pointer Ad from inside the kernel:
for(int i=0;i<num_devices;i++){
HIPCHECK(hipSetDevice(i));
hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/512), dim3(512), 0, 0, Ad[i]);
HIPCHECK(hipDeviceSynchronize());
HIPCHECK(hipSetDevice(i));
hipLaunchKernel(Inc, dim3(N/512), dim3(512), 0, 0, Ad[i]);
HIPCHECK(hipDeviceSynchronize());
}
HIPASSERT(A[10] == 1.0f + float(num_devices));
{
// Sensitize HIP bug if device does not match where the memory was registered.
HIPCHECK(hipSetDevice(0));
// Copy to B, this should be optimal pinned malloc copy:
// Note we are using the host pointer here:
float *Bh, *Bd;
Bh = (float*)malloc(size);
HIPCHECK(hipMalloc(&Bd, size));
HIPCHECK(hipMemset(Bd, 13.0f, size));
for(int i=0;i<N;i++){
A[i] = float(i);
Bh[i] = 0.0f;
}
HIPCHECK(hipMemcpy(Bd, A, size, hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(Bh, Bd, size, hipMemcpyDeviceToHost));
#if 0
//TODO - disable check HCC patch for registered/locked memory usin device pointers is merged.
for(int i=0;i<N;i++){
if (Bh[i] != A[i]) {
printf ("mismatch at Bh[%d]=%f, A[%d]=%f\n", i, Bh[i], i, A[i]);
failed("mismatch");
};
}
#endif
// Make sure the copy worked
}
HIPCHECK(hipHostUnregister(A));
passed();
}
@@ -0,0 +1,52 @@
/*
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
* RUN: %t
* HIT_END
*/
#include"test_common.h"
struct {
float a;
int b;
void *c;
} Struct ;
int main(){
int *iPtr;
float *fPtr;
struct Struct *sPtr;
size_t sSetSize = 1024, sGetSize;
hipMalloc(&iPtr, sSetSize);
hipMalloc(&fPtr, sSetSize);
hipMalloc(&sPtr, sSetSize);
hipMemPtrGetInfo(iPtr, &sGetSize);
assert(sGetSize == sSetSize);
hipMemPtrGetInfo(fPtr, &sGetSize);
assert(sGetSize == sSetSize);
hipMemPtrGetInfo(sPtr, &sGetSize);
assert(sGetSize == sSetSize);
passed();
}