diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h index 8eecd1650f..82eba49771 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h @@ -1278,7 +1278,11 @@ hipError_t hipDeviceEnablePeerAccess (int peerDeviceId, unsigned int flags); hipError_t hipDeviceDisablePeerAccess (int peerDeviceId); -#ifdef PEER_NON_UNIFIED +#ifndef USE_PEER_NON_UNIFIED +#define USE_PEER_NON_UNIFIED 1 +#endif + +#if USE_PEER_NON_UNIFIED==1 /** * @brief Copies memory from one device to memory on another device. * diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 15fd3a8237..74b0fcbd28 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -73,6 +73,8 @@ int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU ident int HIP_NUM_KERNELS_INFLIGHT = 128; int HIP_WAIT_MODE = 0; +int HIP_FORCE_P2P_HOST = 0; + @@ -540,7 +542,7 @@ void ihipCtxCriticalBase_t::recomputePeerAgents() template<> -bool ihipCtxCriticalBase_t::isPeer(const ihipCtx_t *peer) +bool ihipCtxCriticalBase_t::isPeerWatcher(const ihipCtx_t *peer) { auto match = std::find(_peers.begin(), _peers.end(), peer); return (match != std::end(_peers)); @@ -548,12 +550,14 @@ bool ihipCtxCriticalBase_t::isPeer(const ihipCtx_t *peer) template<> -bool ihipCtxCriticalBase_t::addPeer(ihipCtx_t *peer) +bool ihipCtxCriticalBase_t::addPeerWatcher(const ihipCtx_t *thisCtx, ihipCtx_t *peerWatcher) { - auto match = std::find(_peers.begin(), _peers.end(), peer); + auto match = std::find(_peers.begin(), _peers.end(), peerWatcher); if (match == std::end(_peers)) { // Not already a peer, let's update the list: - _peers.push_back(peer); + tprintf(DB_COPY, "addPeerWatcher. Allocations on %s now visible to peerWatcher %s.\n", + thisCtx->toString().c_str(), peerWatcher->toString().c_str()); + _peers.push_back(peerWatcher); recomputePeerAgents(); return true; } @@ -564,12 +568,14 @@ bool ihipCtxCriticalBase_t::addPeer(ihipCtx_t *peer) template<> -bool ihipCtxCriticalBase_t::removePeer(ihipCtx_t *peer) +bool ihipCtxCriticalBase_t::removePeerWatcher(const ihipCtx_t *thisCtx, ihipCtx_t *peerWatcher) { - auto match = std::find(_peers.begin(), _peers.end(), peer); + auto match = std::find(_peers.begin(), _peers.end(), peerWatcher); if (match != std::end(_peers)) { // Found a valid peer, let's remove it. - _peers.remove(peer); + tprintf(DB_COPY, "removePeerWatcher. Allocations on %s no longer visible to former peerWatcher %s.\n", + thisCtx->toString().c_str(), peerWatcher->toString().c_str()); + _peers.remove(peerWatcher); recomputePeerAgents(); return true; } else { @@ -579,16 +585,17 @@ bool ihipCtxCriticalBase_t::removePeer(ihipCtx_t *peer) template<> -void ihipCtxCriticalBase_t::resetPeers(ihipCtx_t *thisDevice) +void ihipCtxCriticalBase_t::resetPeerWatchers(ihipCtx_t *thisCtx) { + tprintf(DB_COPY, "resetPeerWatchers for context=%s\n", thisCtx->toString().c_str()); _peers.clear(); _peerCnt = 0; - addPeer(thisDevice); // peer-list always contains self agent. + addPeerWatcher(thisCtx, thisCtx); // peer-list always contains self agent. } template<> -void ihipCtxCriticalBase_t::printPeers(FILE *f) const +void ihipCtxCriticalBase_t::printPeerWatchers(FILE *f) const { for (auto iter = _peers.begin(); iter!=_peers.end(); iter++) { fprintf (f, "%s ", (*iter)->toString().c_str()); @@ -993,7 +1000,7 @@ void ihipCtx_t::locked_reset() // Reset peer list to just me: - crit->resetPeers(this); + crit->resetPeerWatchers(this); // Reset and release all memory stored in the tracker: // Reset will remove peer mapping so don't need to do this explicitly. @@ -1360,7 +1367,7 @@ void ihipInit() READ_ENV_I(release, HIP_WAIT_MODE, 0, "Force synchronization mode. 1= force yield, 2=force spin, 0=defaults specified in application"); - + READ_ENV_I(release, HIP_FORCE_P2P_HOST, 0, "Force use of host/staging copy for peer-to-peer copiecopies"); READ_ENV_I(release, HIP_NUM_KERNELS_INFLIGHT, 128, "Max number of inflight kernels per stream before active synchronization is forced."); // Some flags have both compile-time and runtime flags - generate a warning if user enables the runtime flag but the compile-time flag is disabled. @@ -1726,14 +1733,14 @@ void ihipSetTs(hipEvent_t e) // So we check dstCtx's and srcCtx's peerList to see if the both include thisCtx. bool ihipStream_t::canSeePeerMemory(const ihipCtx_t *thisCtx, ihipCtx_t *dstCtx, ihipCtx_t *srcCtx) { - tprintf (DB_COPY1, "Checking if direct copy can be used. thisCtx:%s; dstCtx:%s ; srcCtx:%s\n", + tprintf (DB_COPY, "Checking if direct copy can be used. thisCtx:%s; dstCtx:%s ; srcCtx:%s\n", thisCtx->toString().c_str(), dstCtx->toString().c_str(), srcCtx->toString().c_str()); // Use blocks to control scope of critical sections. { LockedAccessor_CtxCrit_t ctxCrit(dstCtx->criticalData()); tprintf(DB_SYNC, "dstCrit lock succeeded\n"); - if (!ctxCrit->isPeer(thisCtx)) { + if (!ctxCrit->isPeerWatcher(thisCtx)) { return false; }; } @@ -1741,7 +1748,7 @@ bool ihipStream_t::canSeePeerMemory(const ihipCtx_t *thisCtx, ihipCtx_t *dstCtx, { LockedAccessor_CtxCrit_t ctxCrit(srcCtx->criticalData()); tprintf(DB_SYNC, "srcCrit lock succeeded\n"); - if (!ctxCrit->isPeer(thisCtx)) { + if (!ctxCrit->isPeerWatcher(thisCtx)) { return false; }; } @@ -1832,13 +1839,13 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, if (hcCopyDir == hc::hcMemcpyDeviceToDevice) { if (!canSeePeerMemory(ctx, ihipGetPrimaryCtx(dstPtrInfo._appId), ihipGetPrimaryCtx(srcPtrInfo._appId))) { forceHostCopyEngine = true; - tprintf (DB_COPY1, "Forcing use of host copy engine.\n"); + tprintf (DB_COPY, "Forcing use of host copy engine.\n"); } else { - tprintf (DB_COPY1, "Will use SDMA engine on streamDevice=%s.\n", ctx->toString().c_str()); + tprintf (DB_COPY, "Will use SDMA engine on streamDevice=%s.\n", ctx->toString().c_str()); } }; - tprintf (DB_COPY1, "locked_copy dir=%s dst=%p src=%p sz=%zu\n", memcpyStr(kind), src, dst, sizeBytes); + tprintf (DB_COPY, "locked_copy dir=%s dst=%p src=%p sz=%zu\n", memcpyStr(kind), src, dst, sizeBytes); { LockedAccessor_StreamCrit_t crit (_criticalData); @@ -1859,12 +1866,12 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes const ihipCtx_t *ctx = this->getCtx(); if ((ctx == nullptr) || (ctx->getDevice() == nullptr)) { - tprintf (DB_COPY1, "locked_copyAsync bad ctx or device\n"); + tprintf (DB_COPY, "locked_copyAsync bad ctx or device\n"); throw ihipException(hipErrorInvalidDevice); } if (kind == hipMemcpyHostToHost) { - tprintf (DB_COPY1, "locked_copyAsync: H2H with memcpy"); + tprintf (DB_COPY, "locked_copyAsync: H2H with memcpy"); // TODO - consider if we want to perhaps use the GPU SDMA engines anyway, to avoid the host-side sync here and keep everything flowing on the GPU. /* As this is a CPU op, we need to wait until all @@ -1890,7 +1897,7 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes copyEngineCanSeeSrcAndDest = canSeePeerMemory(ctx, ihipGetPrimaryCtx(dstPtrInfo._appId), ihipGetPrimaryCtx(srcPtrInfo._appId)); } - tprintf (DB_COPY1, "locked_copyAsync: async memcpy dstTracked=%d srcTracked=%d copyEngineCanSeeSrcAndDest=%d\n", + tprintf (DB_COPY, "locked_copyAsync: async memcpy dstTracked=%d srcTracked=%d copyEngineCanSeeSrcAndDest=%d\n", dstTracked, srcTracked, copyEngineCanSeeSrcAndDest); @@ -1915,6 +1922,7 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes } else { // TODO - call copy_ext directly here? locked_copySync(dst, src, sizeBytes, kind); + //crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceHostCopyEngine); } } } diff --git a/projects/hip/src/hip_hcc.h b/projects/hip/src/hip_hcc.h index e40fa29f7b..ad22789fda 100644 --- a/projects/hip/src/hip_hcc.h +++ b/projects/hip/src/hip_hcc.h @@ -43,6 +43,7 @@ THE SOFTWARE. //static const int debug = 0; extern const int release; +// TODO - this blocks both kernels and memory ops. Perhaps should have separate env var for kernels? extern int HIP_LAUNCH_BLOCKING; extern int HIP_PRINT_ENV; @@ -225,9 +226,8 @@ extern void recordApiTrace(std::string *fullStr, const std::string &apiStr); #define DB_API 0 /* 0x01 - shortcut to enable HIP_TRACE_API on single switch */ #define DB_SYNC 1 /* 0x02 - trace synchronization pieces */ #define DB_MEM 2 /* 0x04 - trace memory allocation / deallocation */ -#define DB_COPY1 3 /* 0x08 - trace memory copy commands. . */ +#define DB_COPY 3 /* 0x08 - trace memory copy and peer commands. . */ #define DB_SIGNAL 4 /* 0x10 - trace signal pool commands */ -#define DB_COPY2 5 /* 0x20 - trace memory copy commands. Detailed. */ #define DB_MAX_FLAG 5 // When adding a new debug flag, also add to the char name table below. // @@ -242,9 +242,8 @@ static const DbName dbName [] = {KGRN, "api"}, // not used, {KYEL, "sync"}, {KCYN, "mem"}, - {KMAG, "copy1"}, + {KMAG, "copy"}, {KRED, "signal"}, - {KNRM, "copy2"}, }; @@ -596,11 +595,11 @@ public: // Peer Accessor classes: - bool isPeer(const ihipCtx_t *peer); // returns True if peer has access to memory physically located on this device. - bool addPeer(ihipCtx_t *peer); - bool removePeer(ihipCtx_t *peer); - void resetPeers(ihipCtx_t *thisDevice); - void printPeers(FILE *f) const; + bool isPeerWatcher(const ihipCtx_t *peer); // returns True if peer has access to memory physically located on this device. + bool addPeerWatcher(const ihipCtx_t *thisCtx, ihipCtx_t *peer); + bool removePeerWatcher(const ihipCtx_t *thisCtx, ihipCtx_t *peer); + void resetPeerWatchers(ihipCtx_t *thisDevice); + void printPeerWatchers(FILE *f) const; uint32_t peerCnt() const { return _peerCnt; }; hsa_agent_t *peerAgents() const { return _peerAgents; }; @@ -750,7 +749,7 @@ inline std::ostream& operator<<(std::ostream& os, const hipEvent_t& e) inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) { os << "ctx:" << static_cast (c) - << " dev:" << c->getDevice()->_deviceId; + << ".dev:" << c->getDevice()->_deviceId; return os; } diff --git a/projects/hip/src/hip_peer.cpp b/projects/hip/src/hip_peer.cpp index e66a0d2971..95ea4719a9 100644 --- a/projects/hip/src/hip_peer.cpp +++ b/projects/hip/src/hip_peer.cpp @@ -35,6 +35,33 @@ THE SOFTWARE. // public APIs are thin wrappers which call into this internal implementations. // TODO - actually not yet - currently the integer deviceId flavors just call the context APIs. need to fix. + + +hipError_t ihipDeviceCanAccessPeer (int* canAccessPeer, hipCtx_t thisCtx, hipCtx_t peerCtx) +{ + hipError_t err = hipSuccess; + + + if ((thisCtx != NULL) && (peerCtx != NULL)) { + if (thisCtx == peerCtx) { + *canAccessPeer = 0; + tprintf(DB_COPY, "Can't be peer to self. (this=%s, peer=%s)\n", + thisCtx->toString().c_str(), peerCtx->toString().c_str()); + } else { + *canAccessPeer = peerCtx->getDevice()->_acc.get_is_peer(thisCtx->getDevice()->_acc); + tprintf(DB_COPY, "deviceCanAccessPeer this=%s peer=%s canAccessPeer=%d\n", + thisCtx->toString().c_str(), peerCtx->toString().c_str(), *canAccessPeer); + } + + } else { + *canAccessPeer = 0; + err = hipErrorInvalidDevice; + } + + return err; +} + + /** * HCC returns 0 in *canAccessPeer ; Need to update this function when RT supports P2P */ @@ -43,23 +70,7 @@ hipError_t hipDeviceCanAccessPeer (int* canAccessPeer, hipCtx_t thisCtx, hipCtx_ { HIP_INIT_API(canAccessPeer, thisCtx, peerCtx); - hipError_t err = hipSuccess; - - - if ((thisCtx != NULL) && (peerCtx != NULL)) { - if (thisCtx == peerCtx) { - *canAccessPeer = 0; - } else { - *canAccessPeer = peerCtx->getDevice()->_acc.get_is_peer(thisCtx->getDevice()->_acc); - } - - } else { - *canAccessPeer = 0; - err = hipErrorInvalidDevice; - } - - - return ihipLogStatus(err); + return ihipLogStatus(ihipDeviceCanAccessPeer(canAccessPeer, thisCtx, peerCtx)); } @@ -80,8 +91,10 @@ hipError_t ihipDisablePeerAccess (hipCtx_t peerCtx) err = hipErrorInvalidDevice; // Can't disable peer access to self. } else { LockedAccessor_CtxCrit_t peerCrit(peerCtx->criticalData()); - bool changed = peerCrit->removePeer(thisCtx); + bool changed = peerCrit->removePeerWatcher(peerCtx, thisCtx); if (changed) { + tprintf(DB_COPY, "device %s disable access to memory allocated on peer:%s\n", + thisCtx->toString().c_str(), peerCtx->toString().c_str()); // Update the peers for all memory already saved in the tracker: am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(), peerCrit->peerAgents()); } else { @@ -112,8 +125,10 @@ hipError_t ihipEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags) } else if ((thisCtx != NULL) && (peerCtx != NULL)) { LockedAccessor_CtxCrit_t peerCrit(peerCtx->criticalData()); // Add thisCtx to peerCtx's access list so that new allocations on peer will be made visible to this device: - bool isNewPeer = peerCrit->addPeer(thisCtx); + bool isNewPeer = peerCrit->addPeerWatcher(peerCtx, thisCtx); if (isNewPeer) { + tprintf(DB_COPY, "device=%s can now see all memory allocated on peer=%s\n", + thisCtx->toString().c_str(), peerCtx->toString().c_str()); am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(), peerCrit->peerAgents()); } else { err = hipErrorPeerAccessAlreadyEnabled; @@ -158,7 +173,7 @@ hipError_t hipMemcpyPeerAsync (void* dst, hipCtx_t dstDevice, const void* src, h hipError_t hipDeviceCanAccessPeer (int* canAccessPeer, int deviceId, int peerDeviceId) { HIP_INIT_API(canAccessPeer, deviceId, peerDeviceId); - return hipDeviceCanAccessPeer(canAccessPeer, ihipGetPrimaryCtx(deviceId), ihipGetPrimaryCtx(peerDeviceId)); + return ihipDeviceCanAccessPeer(canAccessPeer, ihipGetPrimaryCtx(deviceId), ihipGetPrimaryCtx(peerDeviceId)); }