From 49d7ea94f5d5702eeb2b3dbb439cadaa56937a82 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 10 Mar 2017 15:04:46 -0600 Subject: [PATCH] Refactor registered memory calls. [ROCm/clr commit: b7acb85fa81844b84f407c8524ba1e4bb86dca7c] --- .../include/hip/hcc_detail/hip_runtime_api.h | 11 + .../hipBusBandwidth/hipBusBandwidth.cpp | 52 +++- projects/clr/hipamd/src/hip_memory.cpp | 233 +++++++++--------- 3 files changed, 176 insertions(+), 120 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h index 080f82d9ed..7f85aad28d 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -858,6 +858,8 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) * @param[out] ptr Pointer to the allocated memory * @param[in] size Requested memory size * + * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. + * * @return #hipSuccess * * @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray, hipHostFree, hipHostMalloc @@ -870,6 +872,8 @@ hipError_t hipMalloc(void** ptr, size_t size) ; * @param[out] ptr Pointer to the allocated host pinned memory * @param[in] size Requested memory size * + * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. + * * @return #hipSuccess, #hipErrorMemoryAllocation * * @deprecated use hipHostMalloc() instead @@ -883,6 +887,8 @@ hipError_t hipMallocHost(void** ptr, size_t size) __attribute__((deprecated("use * @param[in] size Requested memory size * @param[in] flags Type of host memory allocation * + * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. + * * @return #hipSuccess, #hipErrorMemoryAllocation * * @see hipSetDeviceFlags, hipHostFree @@ -896,6 +902,8 @@ hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int flags) ; * @param[in] size Requested memory size * @param[in] flags Type of host memory allocation * + * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. + * * @return #hipSuccess, #hipErrorMemoryAllocation * * @deprecated use hipHostMalloc() instead @@ -980,6 +988,9 @@ hipError_t hipHostUnregister(void* hostPtr) ; * @param[out] pitch Pitch for allocation (in bytes) * @param[in] width Requested pitched allocation width (in bytes) * @param[in] height Requested pitched allocation height + * + * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. + * * @return Error code * * @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D, hipMalloc3DArray, hipHostMalloc diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index a1b2fd1705..09f78543c9 100644 --- a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -24,7 +24,7 @@ bool p_h2d = true; bool p_d2h = true; bool p_bidir = true; -#define NO_CHECK +//#define NO_CHECK #define CHECK_HIP_ERROR() \ @@ -151,6 +151,10 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) hipHostRegister(hostMem, numMaxFloats * sizeof(float), 0); CHECK_HIP_ERROR(); } + else + { + assert(0); + } for (int i = 0; i < numMaxFloats; i++) { @@ -323,6 +327,22 @@ void RunBenchmark_D2H(ResultDatabase &resultDB) hostMem1 = new float[numMaxFloats]; hostMem2 = new float[numMaxFloats]; } + else if (p_malloc_mode == MallocRegistered) + { + if (p_numa_ctl == -1) { + hostMem1 = (float*)malloc(numMaxFloats*sizeof(float)); + hostMem2 = (float*)malloc(numMaxFloats*sizeof(float)); + } + + hipHostRegister(hostMem1, numMaxFloats * sizeof(float), 0); + CHECK_HIP_ERROR(); + hipHostRegister(hostMem2, numMaxFloats * sizeof(float), 0); + CHECK_HIP_ERROR(); + } + else + { + assert(0); + } for (int i=0; i + + +// Internal HIP APIS: +namespace hip_internal { + +hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) +{ + hipError_t e = hipSuccess; + + stream = ihipSyncAndResolveStream(stream); + + + if ((dst == NULL) || (src == NULL)) { + e= hipErrorInvalidValue; + } else if (stream) { + try { + stream->locked_copyAsync(dst, src, sizeBytes, kind); + } + catch (ihipException ex) { + e = ex._code; + } + } else { + e = hipErrorInvalidValue; + } + + return e; +} + +// return 0 on success or -1 on error: +int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags) +{ + int ret = 0; + + auto device = ctx->getWriteableDevice(); + + hc::am_memtracker_update(ptr, device->_deviceId, hipFlags); + int peerCnt=0; + { + LockedAccessor_CtxCrit_t crit(ctx->criticalData()); + // the peerCnt always stores self so make sure the trace actually + peerCnt = crit->peerCnt(); + tprintf(DB_MEM, " allow access to %d other peer(s)\n", peerCnt-1); + if (peerCnt > 1) { + + //printf ("peer self access\n"); + + // TODOD - remove me: + for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) { + tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":""); + }; + + hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr); + if (s != HSA_STATUS_SUCCESS) { + ret = -1; + } + } + } + + return ret; +} + + + + +// Allocate a new pointer with am_alloc and share with all valid peers. +// Returns null-ptr if a memory error occurs (either allocation or sharing) +void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsigned amFlags, unsigned hipFlags) +{ + + void *ptr = nullptr; + + auto device = ctx->getWriteableDevice(); + + ptr = hc::am_alloc(sizeBytes, device->_acc, amFlags); + tprintf(DB_MEM, " alloc %s ptr:%p size:%zu on dev:%d\n", + msg, ptr, sizeBytes, device->_deviceId); + + if (ptr != nullptr) { + int r = sharePtr(ptr, ctx, hipFlags); + if (r != 0) { + ptr = nullptr; + } + } + + return ptr; +} + + +} // end namespace hip_internal + //------------------------------------------------------------------------------------------------- //------------------------------------------------------------------------------------------------- // Memory @@ -128,37 +218,8 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) if (ctx) { auto device = ctx->getWriteableDevice(); - const unsigned am_flags = 0; - *ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags); + *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, 0/*amFlags*/, 0/*hipFlags*/); - - if (sizeBytes && (*ptr == NULL)) { - hip_status = hipErrorMemoryAllocation; - } else { - hc::am_memtracker_update(*ptr, device->_deviceId, 0); - int peerCnt=0; - { - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - // the peerCnt always stores self so make sure the trace actually - peerCnt = crit->peerCnt(); - tprintf(DB_MEM, " allocated device_mem ptr:%p size:%zu on dev:%d and allow access to %d other peer(s)\n", - *ptr, sizeBytes, device->_deviceId, peerCnt-1); - if (peerCnt > 1) { - - //printf ("peer self access\n"); - - // TODOD - remove me: - for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) { - tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":""); - }; - - hsa_status_t e = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr); - if (e != HSA_STATUS_SUCCESS) { - hip_status = hipErrorMemoryAllocation; - } - } - } - } } else { hip_status = hipErrorMemoryAllocation; } @@ -198,39 +259,16 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) } else { auto device = ctx->getWriteableDevice(); - if(HIP_COHERENT_HOST_ALLOC){ - // Force to allocate finedgrained system memory - *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); - if(sizeBytes < 1 && (*ptr == NULL)){ - hip_status = hipErrorMemoryAllocation; - } else { - // TODO - should OR in flags here? - hc::am_memtracker_update(*ptr, device->_deviceId, amHostCoherent); - } - tprintf(DB_MEM, " %s: finegrained system memory ptr=%p\n", __func__, *ptr); - } - else{ - // TODO - am_alloc requires writeable __acc, perhaps could be refactored? - // TODO - hipHostMallocMapped is be ignored on ROCM - all memory is mapped to host address space as WC. - *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); - if (*ptr == NULL) { - hip_status = hipErrorMemoryAllocation; - } else { - hc::am_memtracker_update(*ptr, device->_deviceId, flags); - // TODO-hipHostMallocPortable should map the host memory into all contexts, regardless of peer status. - int peerCnt=0; - { - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - peerCnt = crit->peerCnt(); - if (peerCnt > 1) { - hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr); - } - } - tprintf(DB_MEM, "allocated pinned_host ptr:%p size:%zu on dev:%d and allow access to %d other peer(s)\n", *ptr, sizeBytes, device->_deviceId, peerCnt-1); - } - } + unsigned amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned; + + *ptr = hip_internal::allocAndSharePtr(HIP_COHERENT_HOST_ALLOC ? "finegrained_host":"pinned_host", + sizeBytes, ctx, amFlags, flags); + if(sizeBytes && (*ptr == NULL)){ + hip_status = hipErrorMemoryAllocation; + } } } + if (HIP_SYNC_HOST_ALLOC) { hipDeviceSynchronize(); } @@ -272,22 +310,11 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height auto device = ctx->getWriteableDevice(); const unsigned am_flags = 0; - *ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags); + *ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, am_flags, 0); if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; - } else { - hc::am_memtracker_update(*ptr, device->_deviceId, 0); - { - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - if (crit->peerCnt() > 1) { // peerCnt includes self so only call allow_access if other peers involved: - hsa_status_t hsa_status = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr); - if (hsa_status != HSA_STATUS_SUCCESS) { - hip_status = hipErrorMemoryAllocation; - } - } - } - } + } } else { hip_status = hipErrorMemoryAllocation; } @@ -321,41 +348,31 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc, void ** ptr = &array[0]->data; if (ctx) { - auto device = ctx->getWriteableDevice(); const unsigned am_flags = 0; const size_t size = width*height; + size_t allocSize = 0; switch(desc->f) { case hipChannelFormatKindSigned: - *ptr = hc::am_alloc(size*sizeof(int), device->_acc, am_flags); + allocSize = size * sizeof(int); break; case hipChannelFormatKindUnsigned: - *ptr = hc::am_alloc(size*sizeof(unsigned int), device->_acc, am_flags); + allocSize = size * sizeof(unsigned int); break; case hipChannelFormatKindFloat: - *ptr = hc::am_alloc(size*sizeof(float), device->_acc, am_flags); + allocSize = size * sizeof(float); break; case hipChannelFormatKindNone: - *ptr = hc::am_alloc(size*sizeof(size_t), device->_acc, am_flags); + allocSize = size * sizeof(size_t); break; default: hip_status = hipErrorUnknown; break; } + *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, am_flags, 0); if (size && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; - } else { - hc::am_memtracker_update(*ptr, device->_deviceId, 0); - { - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - if (crit->peerCnt() > 1) { // peerCnt includes self so only call allow_access if other peers involved: - hsa_status_t hsa_status = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr); - if (hsa_status != HSA_STATUS_SUCCESS) { - hip_status = hipErrorMemoryAllocation; - } - } - } - } + } } else { hip_status = hipErrorMemoryAllocation; @@ -409,12 +426,13 @@ hipError_t hipHostRegister(void *hostPtr, size_t sizeBytes, unsigned int flags) hip_status = hipErrorHostMemoryAlreadyRegistered; } else { auto ctx = ihipGetTlsDefaultCtx(); - if(hostPtr == NULL){ + if (hostPtr == NULL) { return ihipLogStatus(hipErrorInvalidValue); } + //TODO-test : multi-gpu access to registered host memory. if (ctx) { - auto device = ctx->getWriteableDevice(); if(flags == hipHostRegisterDefault || flags == hipHostRegisterPortable || flags == hipHostRegisterMapped){ + auto device = ctx->getWriteableDevice(); std::vectorvecAcc; for(int i=0;i_acc); @@ -711,32 +729,6 @@ hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes) -// Internal copy sync: -namespace hip_internal { - -hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) -{ - hipError_t e = hipSuccess; - - stream = ihipSyncAndResolveStream(stream); - - - if ((dst == NULL) || (src == NULL)) { - e= hipErrorInvalidValue; - } else if (stream) { - try { - stream->locked_copyAsync(dst, src, sizeBytes, kind); - } - catch (ihipException ex) { - e = ex._code; - } - } else { - e = hipErrorInvalidValue; - } - - return e; -} -} // end namespace hip_internal hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) @@ -1012,6 +1004,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) return ihipLogStatus(e); } + hipError_t hipMemGetInfo (size_t *free, size_t *total) { HIP_INIT_API(free, total); @@ -1067,6 +1060,7 @@ hipError_t hipMemPtrGetInfo(void *ptr, size_t *size) return ihipLogStatus(e); } + hipError_t hipFree(void* ptr) { HIP_INIT_API(ptr); @@ -1094,6 +1088,7 @@ hipError_t hipFree(void* ptr) return ihipLogStatus(hipStatus); } + hipError_t hipHostFree(void* ptr) { HIP_INIT_API(ptr);