From ff9bed653584a34cf5fafa3545a1ad1f22c74ca2 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 10 May 2017 17:32:25 -0500 Subject: [PATCH] hipHostMalloc allocation are mapped to all devices by default. Support hipHostMallocPortable flag. Default flags are hipHostMallocPortable | hipHostMallocMapped. Also: -refactor tests to move addCount and addCountReverse into HipTest namespace. -test multi-GPU host memory. --- hipamd/src/hip_hcc.cpp | 9 ++ hipamd/src/hip_hcc_internal.h | 1 + hipamd/src/hip_memory.cpp | 58 ++++---- hipamd/tests/src/hipPointerAttrib.cpp | 2 +- .../runtimeApi/memory/hipMemoryAllocate.cpp | 129 +++++++++++++----- .../runtimeApi/stream/hipStreamWaitEvent.cpp | 40 +----- hipamd/tests/src/test_common.h | 38 ++++++ 7 files changed, 181 insertions(+), 96 deletions(-) diff --git a/hipamd/src/hip_hcc.cpp b/hipamd/src/hip_hcc.cpp index 71d947488d..81a2079b5b 100644 --- a/hipamd/src/hip_hcc.cpp +++ b/hipamd/src/hip_hcc.cpp @@ -118,6 +118,7 @@ bool g_visible_device = false; unsigned g_deviceCnt; std::vector g_hip_visible_devices; hsa_agent_t g_cpu_agent; +hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents. unsigned g_numLogicalThreads; std::atomic g_lastShortTid(1); @@ -1389,6 +1390,14 @@ void ihipInit() g_deviceCnt++; } } + + g_allAgents = static_cast (malloc((g_deviceCnt+1) * sizeof(hsa_agent_t))); + g_allAgents[0] = g_cpu_agent; + for (int i=0; i_hsaAgent; + } + + g_numLogicalThreads = std::thread::hardware_concurrency(); // If HIP_VISIBLE_DEVICES is not set, make sure all devices are initialized diff --git a/hipamd/src/hip_hcc_internal.h b/hipamd/src/hip_hcc_internal.h index 9c17c6e98c..132f099ce8 100644 --- a/hipamd/src/hip_hcc_internal.h +++ b/hipamd/src/hip_hcc_internal.h @@ -826,6 +826,7 @@ private: // Critical data, protected with locked access: extern std::once_flag hip_initialized; extern unsigned g_deviceCnt; extern hsa_agent_t g_cpu_agent ; // the CPU agent. +extern hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents. //================================================================================================= // Extern functions: diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index 1ba698f461..c4f0f64e50 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -59,31 +59,40 @@ hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyK } // return 0 on success or -1 on error: -int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags) +int sharePtr(void *ptr, ihipCtx_t *ctx, bool shareWithAll, unsigned hipFlags) { int ret = 0; auto device = ctx->getWriteableDevice(); hc::am_memtracker_update(ptr, device->_deviceId, hipFlags); - int peerCnt=0; - { - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - // the peerCnt always stores self so make sure the trace actually - peerCnt = crit->peerCnt(); - tprintf(DB_MEM, " allow access to %d other peer(s)\n", peerCnt-1); - if (peerCnt > 1) { - //printf ("peer self access\n"); + if (shareWithAll) { + hsa_status_t s = hsa_amd_agents_allow_access(g_deviceCnt+1, g_allAgents, NULL, ptr); + tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt); + if (s != HSA_STATUS_SUCCESS) { + ret = -1; + } + } else { + int peerCnt=0; + { + LockedAccessor_CtxCrit_t crit(ctx->criticalData()); + // the peerCnt always stores self so make sure the trace actually + peerCnt = crit->peerCnt(); + tprintf(DB_MEM, " allow access to %d other peer(s)\n", peerCnt-1); + if (peerCnt > 1) { - // TODOD - remove me: - for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) { - tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":""); - }; + //printf ("peer self access\n"); - hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr); - if (s != HSA_STATUS_SUCCESS) { - ret = -1; + // TODOD - remove me: + for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) { + tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":""); + }; + + hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr); + if (s != HSA_STATUS_SUCCESS) { + ret = -1; + } } } } @@ -96,7 +105,7 @@ int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags) // Allocate a new pointer with am_alloc and share with all valid peers. // Returns null-ptr if a memory error occurs (either allocation or sharing) -void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsigned amFlags, unsigned hipFlags) +void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, bool shareWithAll, unsigned amFlags, unsigned hipFlags) { void *ptr = nullptr; @@ -108,7 +117,7 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsig msg, ptr, static_cast(ptr)+sizeBytes, sizeBytes, device->_deviceId); if (ptr != nullptr) { - int r = sharePtr(ptr, ctx, hipFlags); + int r = sharePtr(ptr, ctx, shareWithAll, hipFlags); if (r != 0) { ptr = nullptr; } @@ -220,7 +229,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) } else { auto device = ctx->getWriteableDevice(); - *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, 0/*amFlags*/, 0/*hipFlags*/); + *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, false/*shareWithAll*/, 0/*amFlags*/, 0/*hipFlags*/); if(sizeBytes && (*ptr == NULL)){ hip_status = hipErrorMemoryAllocation; @@ -253,7 +262,8 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) } else { unsigned trueFlags = flags; if (flags == hipHostMallocDefault) { - trueFlags = hipHostMallocMapped | hipHostMallocWriteCombined; + // HCC/ROCM provide a modern system with unified memory and should set both of these flags by default: + trueFlags = hipHostMallocMapped | hipHostMallocPortable; } const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | hipHostMallocWriteCombined; @@ -265,8 +275,10 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) auto device = ctx->getWriteableDevice(); unsigned amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned; + *ptr = hip_internal::allocAndSharePtr(HIP_COHERENT_HOST_ALLOC ? "finegrained_host":"pinned_host", - sizeBytes, ctx, amFlags, flags); + sizeBytes, ctx, (trueFlags & hipHostMallocPortable) /*shareWithAll*/, amFlags, flags); + if(sizeBytes && (*ptr == NULL)){ hip_status = hipErrorMemoryAllocation; } @@ -314,7 +326,7 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height auto device = ctx->getWriteableDevice(); const unsigned am_flags = 0; - *ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, am_flags, 0); + *ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, false/*shareWithAll*/, am_flags, 0); if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; @@ -373,7 +385,7 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc, hip_status = hipErrorUnknown; break; } - *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, am_flags, 0); + *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false/*shareWithAll*/, am_flags, 0); if (size && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } diff --git a/hipamd/tests/src/hipPointerAttrib.cpp b/hipamd/tests/src/hipPointerAttrib.cpp index fb7832d9a6..7a2ab64bea 100644 --- a/hipamd/tests/src/hipPointerAttrib.cpp +++ b/hipamd/tests/src/hipPointerAttrib.cpp @@ -99,7 +99,7 @@ inline int zrand(int max) //================================================================================================= -// Functins to run tests +// Functions to run tests //================================================================================================= //-- //Run through a couple simple cases to test lookups and host pointer arithmetic: diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp index 0a256d6362..1ee5cbc9bb 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp @@ -25,45 +25,106 @@ THE SOFTWARE. #include"test_common.h" -#define SIZE 1024*1024*256 +#define NUM_ELEMENTS 1024*1024*64 +#define SIZE NUM_ELEMENTS*sizeof(int) -int main(){ - float *Ad, *B, *Bd, *Bm, *C, *Cd, *ptr_0; - B = (float*)malloc(SIZE); - hipMalloc((void**)&Ad, SIZE); - hipHostMalloc((void**)&B, SIZE); - hipHostMalloc((void**)&Bd, SIZE, hipHostMallocDefault); - hipHostMalloc((void**)&Bm, SIZE, hipHostMallocMapped); - hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped); - - hipHostGetDevicePointer((void**)&Cd, C, 0/*flags*/); - - HIPCHECK_API(hipMalloc((void**)&ptr_0,0), hipSuccess); - - HIPCHECK_API(hipFree(Ad) , hipSuccess); - HIPCHECK_API(hipHostFree(Ad) , hipErrorInvalidValue); - - HIPCHECK_API(hipFree(B) , hipErrorInvalidDevicePointer); // try to hipFree on malloced memory - HIPCHECK_API(hipFree(Bd) , hipErrorInvalidDevicePointer); - HIPCHECK_API(hipFree(Bm) , hipErrorInvalidDevicePointer); - HIPCHECK_API(hipFree(ptr_0) , hipSuccess); - HIPCHECK_API(hipHostFree(Bd) , hipSuccess); - HIPCHECK_API(hipHostFree(Bm) , hipSuccess); - - HIPCHECK_API(hipFree(C) , hipErrorInvalidDevicePointer); - HIPCHECK_API(hipHostFree(C) , hipSuccess); +int p_count = 4; - HIPCHECK_API(hipFree(NULL) , hipSuccess); - HIPCHECK_API(hipHostFree(NULL) , hipSuccess); +void multiGpuHostAlloc(int allocDevice) +{ + + int numDevices; + HIPCHECK(hipGetDeviceCount(&numDevices)); + assert(numDevices > 1); + + printf ("info: trying multiGpuHostAlloc with allocDevice=%d numDevices=%d\n", allocDevice, numDevices); + + + HIPCHECK(hipSetDevice(allocDevice)); + + int *Ah, *Ch; + hipHostMalloc((void**)&Ah, SIZE); + hipHostMalloc((void**)&Ch, SIZE); + + const int init = -1; + for (size_t i=0; i 1); + + multiGpuHostAlloc(0); + multiGpuHostAlloc(1); } passed(); diff --git a/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index adf0d4af0c..80ff7ad98d 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -35,42 +35,6 @@ unsigned p_count = 100; -template -__global__ void -addCount( const T *A_d, - T *C_d, - size_t NELEM, - int count) -{ - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * hipGridDim_x ; - - // Deliberately do this in an inefficient way to increase kernel runtime - for (int i=0; i -__global__ void -addCountReverse( const T *A_d, - T *C_d, - int64_t NELEM, - int count) -{ - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * hipGridDim_x ; - - // Deliberately do this in an inefficient way to increase kernel runtime - for (int i=0; i=0; i-=stride) { - C_d[i] = A_d[i] + (T)count; - } - } -} //------ @@ -171,9 +135,9 @@ void Streamer::runAsyncAfter(Streamer *depStreamer, bool waitSameStream) unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); if (_reverse) { - hipLaunchKernelGGL(addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); } else { - hipLaunchKernelGGL(addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + hipLaunchKernelGGL(HipTest::addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); } HIPCHECK(hipEventRecord(_event, _stream)); diff --git a/hipamd/tests/src/test_common.h b/hipamd/tests/src/test_common.h index 1250de4801..633ee6f825 100644 --- a/hipamd/tests/src/test_common.h +++ b/hipamd/tests/src/test_common.h @@ -146,6 +146,44 @@ vectorADD(hipLaunchParm lp, } +template +__global__ void +addCount( const T *A_d, + T *C_d, + size_t NELEM, + int count) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + // Deliberately do this in an inefficient way to increase kernel runtime + for (int i=0; i +__global__ void +addCountReverse( const T *A_d, + T *C_d, + int64_t NELEM, + int count) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + // Deliberately do this in an inefficient way to increase kernel runtime + for (int i=0; i=0; i-=stride) { + C_d[i] = A_d[i] + (T)count; + } + } +} + + template void initArraysForHost(T **A_h, T **B_h, T **C_h, size_t N, bool usePinnedHost=false)