From 698721be345db59bc50dd98dcb25c42a871d5128 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 25 Oct 2017 22:16:37 +0000 Subject: [PATCH 1/3] Enhance debug for copy pointers - show more pointer tracking fields - show pointer info before and after "tailoring' --- hipamd/src/hip_hcc.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/hipamd/src/hip_hcc.cpp b/hipamd/src/hip_hcc.cpp index 2d67c31fe7..c3d824dbaf 100644 --- a/hipamd/src/hip_hcc.cpp +++ b/hipamd/src/hip_hcc.cpp @@ -1805,7 +1805,7 @@ void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind, } } else { *forceUnpinnedCopy = true; - tprintf (DB_COPY, "P2P: Copy engine(dev:%d agent=0x%lx) cannot see both host and device pointers - forcing copy with unpinned engine.\n", + tprintf (DB_COPY, "Copy engine(dev:%d agent=0x%lx) cannot see both host and device pointers - forcing copy with unpinned engine.\n", *copyDevice ? (*copyDevice)->getDeviceNum() : -1, *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0); if (HIP_FAIL_SOC & 0x2) { @@ -1820,10 +1820,11 @@ void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind, void printPointerInfo(unsigned dbFlag, const char *tag, const void *ptr, const hc::AmPointerInfo &ptrInfo) { - tprintf (dbFlag, " %s=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d registered=%d\n", + tprintf (dbFlag, " %s=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d registered=%d allocSeqNum=%zu, appAllocationFlags=%x, appPtr=%p\n", tag, ptr, ptrInfo._hostPointer, ptrInfo._devicePointer, ptrInfo._sizeBytes, - ptrInfo._appId, ptrInfo._sizeBytes != 0, ptrInfo._isInDeviceMem, !ptrInfo._isAmManaged); + ptrInfo._appId, ptrInfo._sizeBytes != 0, ptrInfo._isInDeviceMem, !ptrInfo._isAmManaged, + ptrInfo._allocSeqNum, ptrInfo._appAllocationFlags, ptrInfo._appPtr); } @@ -1871,12 +1872,14 @@ void tailorPtrInfo(hc::AmPointerInfo *ptrInfo, const void * ptr, size_t sizeByte }; -bool getTailoredPtrInfo(hc::AmPointerInfo *ptrInfo, const void * ptr, size_t sizeBytes) +bool getTailoredPtrInfo(const char *tag, hc::AmPointerInfo *ptrInfo, const void * ptr, size_t sizeBytes) { bool tracked = (hc::am_memtracker_getinfo(ptrInfo, ptr) == AM_SUCCESS); + printPointerInfo(DB_COPY, tag, ptr, *ptrInfo); if (tracked) { tailorPtrInfo(ptrInfo, ptr, sizeBytes); + printPointerInfo(DB_COPY, " mod", ptr, *ptrInfo); } return tracked; @@ -1906,8 +1909,8 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); #endif - bool dstTracked = getTailoredPtrInfo(&dstPtrInfo, dst, sizeBytes); - bool srcTracked = getTailoredPtrInfo(&srcPtrInfo, src, sizeBytes); + bool dstTracked = getTailoredPtrInfo(" dst", &dstPtrInfo, dst, sizeBytes); + bool srcTracked = getTailoredPtrInfo(" src", &srcPtrInfo, src, sizeBytes); // Some code in HCC and in printPointerInfo uses _sizeBytes==0 as an indication ptr is not valid, so check it here: @@ -2034,21 +2037,18 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); #endif - bool dstTracked = getTailoredPtrInfo(&dstPtrInfo, dst, sizeBytes); - bool srcTracked = getTailoredPtrInfo(&srcPtrInfo, src, sizeBytes); + tprintf (DB_COPY, "copyASync dst=%p src=%p, sz=%zu\n", dst, src, sizeBytes); + bool dstTracked = getTailoredPtrInfo(" dst", &dstPtrInfo, dst, sizeBytes); + bool srcTracked = getTailoredPtrInfo(" src", &srcPtrInfo, src, sizeBytes); hc::hcCommandKind hcCopyDir; ihipCtx_t *copyDevice; bool forceUnpinnedCopy; resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, ©Device, &forceUnpinnedCopy); - tprintf (DB_COPY, "copyASync copyDev:%d dst=%p (phys_dev:%d, isDevMem:%d) src=%p(phys_dev:%d, isDevMem:%d) sz=%zu dir=%s forceUnpinnedCopy=%d\n", + tprintf (DB_COPY, " copyDev:%d dir=%s forceUnpinnedCopy=%d\n", copyDevice ? copyDevice->getDeviceNum():-1, - dst, dstPtrInfo._appId, dstPtrInfo._isInDeviceMem, - src, srcPtrInfo._appId, srcPtrInfo._isInDeviceMem, - sizeBytes, hcMemcpyStr(hcCopyDir), forceUnpinnedCopy); - printPointerInfo(DB_COPY, " dst", dst, dstPtrInfo); - printPointerInfo(DB_COPY, " src", src, srcPtrInfo); + hcMemcpyStr(hcCopyDir), forceUnpinnedCopy); // "tracked" really indicates if the pointer's virtual address is available in the GPU address space. // If both pointers are not tracked, we need to fall back to a sync copy. From a4172415071519576de9864b0647267ce4dfef35 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 26 Oct 2017 16:26:25 +0000 Subject: [PATCH 2/3] Fix bug with peer-to-peer combined with context API - Store context inside the tracker rather than using int deviceID that was always mapped to primary context - IsPeerWatcher now based on device IDs rather than specific peers. --- hipamd/src/hip_hcc.cpp | 37 ++++++++++++++++++++++++++++++----- hipamd/src/hip_hcc_internal.h | 11 ++++++++++- hipamd/src/hip_memory.cpp | 8 ++++++++ 3 files changed, 50 insertions(+), 6 deletions(-) diff --git a/hipamd/src/hip_hcc.cpp b/hipamd/src/hip_hcc.cpp index c3d824dbaf..bcb56d5479 100644 --- a/hipamd/src/hip_hcc.cpp +++ b/hipamd/src/hip_hcc.cpp @@ -47,6 +47,9 @@ THE SOFTWARE. #include "trace_helper.h" #include "env.h" +//TODO - create a stream-based debug interface as an additional option for tprintf +#define DB_PEER_CTX 0 + //================================================================================================= //Global variables: @@ -459,8 +462,20 @@ void ihipCtxCriticalBase_t::recomputePeerAgents() template<> bool ihipCtxCriticalBase_t::isPeerWatcher(const ihipCtx_t *peer) { - auto match = std::find(_peers.begin(), _peers.end(), peer); + auto match = std::find_if(_peers.begin(), _peers.end(), + [=] (const ihipCtx_t *d) { return d->getDeviceNum() == peer->getDeviceNum(); }); + return (match != std::end(_peers)); + +#if 0 + for (auto pi=_peers.begin(); pi != _peers.end(); pi++) { + if ((*pi)->getDeviceNum() == peer->getDeviceNum()) { + return true; + } + } + + return false; +#endif } @@ -1677,18 +1692,24 @@ const char *ihipErrorString(hipError_t hip_error) // So we check dstCtx's and srcCtx's peerList to see if the both include thisCtx. bool ihipStream_t::canSeeMemory(const ihipCtx_t *copyEngineCtx, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo) { - // Make sure this is a device-to-device copy with all memory available to the requested copy engine // // TODO - pointer-info stores a deviceID not a context,may have some unusual side-effects here: if (dstPtrInfo->_sizeBytes == 0) { return false; } else { +#if USE_APP_PTR_FOR_CTX + ihipCtx_t *dstCtx = static_cast (dstPtrInfo->_appPtr); +#else ihipCtx_t *dstCtx = ihipGetPrimaryCtx(dstPtrInfo->_appId); +#endif if (copyEngineCtx != dstCtx) { // Only checks peer list if contexts are different LockedAccessor_CtxCrit_t ctxCrit(dstCtx->criticalData()); - //tprintf(DB_SYNC, "dstCrit lock succeeded\n"); +#if DB_PEER_CTX + std::cerr << "checking peer : copyEngineCtx =" << copyEngineCtx << " dstCtx =" << dstCtx << " peerCnt=" + << ctxCrit->peerCnt() << "\n"; +#endif if (!ctxCrit->isPeerWatcher(copyEngineCtx)) { return false; }; @@ -1696,16 +1717,22 @@ bool ihipStream_t::canSeeMemory(const ihipCtx_t *copyEngineCtx, const hc::AmPoin } - // TODO - pointer-info stores a deviceID not a context,may have some unusual side-effects here: if (srcPtrInfo->_sizeBytes == 0) { return false; } else { +#if USE_APP_PTR_FOR_CTX + ihipCtx_t *srcCtx = static_cast (srcPtrInfo->_appPtr); +#else ihipCtx_t *srcCtx = ihipGetPrimaryCtx(srcPtrInfo->_appId); +#endif if (copyEngineCtx != srcCtx) { // Only checks peer list if contexts are different LockedAccessor_CtxCrit_t ctxCrit(srcCtx->criticalData()); - //tprintf(DB_SYNC, "srcCrit lock succeeded\n"); +#if DB_PEER_CTX + std::cerr << "checking peer : copyEngineCtx =" << copyEngineCtx << " srcCtx =" << srcCtx << " peerCnt=" + << ctxCrit->peerCnt() << "\n"; +#endif if (!ctxCrit->isPeerWatcher(copyEngineCtx)) { return false; }; diff --git a/hipamd/src/hip_hcc_internal.h b/hipamd/src/hip_hcc_internal.h index 4b7e533a4c..a48a243a3b 100644 --- a/hipamd/src/hip_hcc_internal.h +++ b/hipamd/src/hip_hcc_internal.h @@ -32,10 +32,19 @@ THE SOFTWARE. #include "env.h" -#if defined(__HCC__) && (__hcc_workweek__ < 16354) +#if (__hcc_workweek__ < 16354) #error("This version of HIP requires a newer version of HCC."); #endif +// Use the __appPtr field in the am memtracker to store the context. +// Requires a bug fix in HCC +#if defined(__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE) and (__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE != 0) +#define USE_APP_PTR_FOR_CTX 1 +#endif + + + + #define USE_IPC 1 //--- diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index a8324c5729..5a4b5f4b4e 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -61,7 +61,11 @@ int sharePtr(void *ptr, ihipCtx_t *ctx, bool shareWithAll, unsigned hipFlags) auto device = ctx->getWriteableDevice(); +#if USE_APP_PTR_FOR_CTX + hc::am_memtracker_update(ptr, device->_deviceId, hipFlags, ctx); +#else hc::am_memtracker_update(ptr, device->_deviceId, hipFlags); +#endif if (shareWithAll) { hsa_status_t s = hsa_amd_agents_allow_access(g_deviceCnt+1, g_allAgents, NULL, ptr); @@ -660,7 +664,11 @@ hipError_t hipHostRegister(void *hostPtr, size_t sizeBytes, unsigned int flags) vecAcc.push_back(ihipGetDevice(i)->_acc); } am_status = hc::am_memory_host_lock(device->_acc, hostPtr, sizeBytes, &vecAcc[0], vecAcc.size()); +#if USE_APP_PTR_FOR_CTX + hc::am_memtracker_update(hostPtr, device->_deviceId, flags, ctx); +#else hc::am_memtracker_update(hostPtr, device->_deviceId, flags); +#endif tprintf(DB_MEM, " %s registered ptr=%p and allowed access to %zu peers\n", __func__, hostPtr, vecAcc.size()); if(am_status == AM_SUCCESS){ From 4c7b2be1c2f3aec242c1f472b969d68e75a2191d Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 30 Oct 2017 16:58:03 +0000 Subject: [PATCH 3/3] Check for null copyEngine before looking at peers. --- hipamd/src/hip_hcc.cpp | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/hipamd/src/hip_hcc.cpp b/hipamd/src/hip_hcc.cpp index bcb56d5479..554ea90d15 100644 --- a/hipamd/src/hip_hcc.cpp +++ b/hipamd/src/hip_hcc.cpp @@ -466,16 +466,6 @@ bool ihipCtxCriticalBase_t::isPeerWatcher(const ihipCtx_t *peer) [=] (const ihipCtx_t *d) { return d->getDeviceNum() == peer->getDeviceNum(); }); return (match != std::end(_peers)); - -#if 0 - for (auto pi=_peers.begin(); pi != _peers.end(); pi++) { - if ((*pi)->getDeviceNum() == peer->getDeviceNum()) { - return true; - } - } - - return false; -#endif } @@ -1692,6 +1682,10 @@ const char *ihipErrorString(hipError_t hip_error) // So we check dstCtx's and srcCtx's peerList to see if the both include thisCtx. bool ihipStream_t::canSeeMemory(const ihipCtx_t *copyEngineCtx, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo) { + if (copyEngineCtx == nullptr) { + return false; + } + // Make sure this is a device-to-device copy with all memory available to the requested copy engine // // TODO - pointer-info stores a deviceID not a context,may have some unusual side-effects here: