diff --git a/hipamd/include/hcc_detail/hip_hcc.h b/hipamd/include/hcc_detail/hip_hcc.h index 2f85f83851..6c061b01a9 100644 --- a/hipamd/include/hcc_detail/hip_hcc.h +++ b/hipamd/include/hcc_detail/hip_hcc.h @@ -416,6 +416,8 @@ private: // The unsigned return is hipMemcpyKind unsigned resolveMemcpyDirection(bool srcTracked, bool dstTracked, bool srcInDeviceMem, bool dstInDeviceMem); + bool canSeePeerMemory(const ihipCtx_t *thisCtx, ihipCtx_t *dstCtx, ihipCtx_t *srcCtx); + private: // Data // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t @@ -516,10 +518,11 @@ public: // Peer Accessor classes: - bool isPeer(const ihipCtx_t *peer); // returns Trus if peer has access to memory physically located on this device. + bool isPeer(const ihipCtx_t *peer); // returns True if peer has access to memory physically located on this device. bool addPeer(ihipCtx_t *peer); bool removePeer(ihipCtx_t *peer); void resetPeers(ihipCtx_t *thisDevice); + void printPeers(FILE *f) const; uint32_t peerCnt() const { return _peerCnt; }; hsa_agent_t *peerAgents() const { return _peerAgents; }; @@ -535,6 +538,7 @@ private: //--- Peer Tracker: // These reflect the currently Enabled set of peers for this GPU: // Enabled peers have permissions to access the memory physically allocated on this device. + // Note the peers always contain the self agent for easy interfacing with HSA APIs. std::list _peers; // list of enabled peer devices. uint32_t _peerCnt; // number of enabled peers hsa_agent_t *_peerAgents; // efficient packed array of enabled agents (to use for allocations.) @@ -578,6 +582,8 @@ public: // Functions: // TODO - review uses of getWriteableDevice(), can these be converted to getDevice() ihipDevice_t *getWriteableDevice() const { return _device; }; + std::string toString() const; + public: // Data // The NULL stream is used if no other stream is specified. // Default stream has special synchronization properties with other streams. @@ -663,5 +669,12 @@ inline std::ostream& operator<<(std::ostream& os, const hipEvent_t& e) return os; } +inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) +{ + os << "ctx:" << static_cast (c) + << " dev:" << c->getDevice()->_deviceId; + return os; +} + #endif diff --git a/hipamd/src/hip_hcc.cpp b/hipamd/src/hip_hcc.cpp index 8f15b25a60..1058d96412 100644 --- a/hipamd/src/hip_hcc.cpp +++ b/hipamd/src/hip_hcc.cpp @@ -381,6 +381,18 @@ void ihipCtxCriticalBase_t::resetPeers(ihipCtx_t *thisDevice) } +template<> +void ihipCtxCriticalBase_t::printPeers(FILE *f) const +{ + for (auto iter = _peers.begin(); iter!=_peers.end(); iter++) { + fprintf (f, "%s ", (*iter)->toString().c_str()); + }; +} + + + + + template<> void ihipCtxCriticalBase_t::addStream(ihipStream_t *stream) { @@ -784,6 +796,13 @@ void ihipCtx_t::locked_reset() }; +//--- +std::string ihipCtx_t::toString() const +{ + std::ostringstream ss; + ss << this; + return ss.str(); +}; //---- @@ -1328,8 +1347,36 @@ void ihipSetTs(hipEvent_t e) } +// Returns true if thisCtx can see the memory allocated on dstCtx and srcCtx. +// The peer-list for a context controls which contexts have access to the memory allocated on that context. +// So we check dstCtx's and srcCtx's peerList to see if the booth include thisCtx. +bool ihipStream_t::canSeePeerMemory(const ihipCtx_t *thisCtx, ihipCtx_t *dstCtx, ihipCtx_t *srcCtx) +{ + tprintf (DB_COPY1, "Checking if direct copy can be used. thisCtx:%s; dstCtx:%s ; srcCtx:%s\n", + thisCtx->toString().c_str(), dstCtx->toString().c_str(), srcCtx->toString().c_str()); + + // Use blocks to control scope of critical sections. + { + LockedAccessor_CtxCrit_t ctxCrit(dstCtx->criticalData()); + if (!ctxCrit->isPeer(thisCtx)) { + return false; + }; + } + + { + LockedAccessor_CtxCrit_t ctxCrit(srcCtx->criticalData()); + if (!ctxCrit->isPeer(thisCtx)) { + return false; + }; + } + + return true; +}; + + // Resolve hipMemcpyDefault to a known type. +// TODO - review why is this so complicated, does this need srcTracked and dstTracked? unsigned ihipStream_t::resolveMemcpyDirection(bool srcTracked, bool dstTracked, bool srcInDeviceMem, bool dstInDeviceMem) { hipMemcpyKind kind = hipMemcpyDefault; @@ -1358,6 +1405,7 @@ unsigned ihipStream_t::resolveMemcpyDirection(bool srcTracked, bool dstTracked, } +// TODO - remove kind parm from here or use it below? void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn) { ihipCtx_t *ctx = this->getCtx(); @@ -1367,7 +1415,38 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const throw ihipException(hipErrorInvalidDevice); } - crit->_av.copy(src, dst, sizeBytes); + hc::accelerator acc; + hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); + hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); + bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS); + bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS); + + if (kind == hipMemcpyDefault) { + kind = resolveMemcpyDirection(srcTracked, dstTracked, srcPtrInfo._isInDeviceMem, dstPtrInfo._isInDeviceMem); + } + hc::hcCommandKind hcCopyDir; + switch (kind) { + case hipMemcpyHostToHost: hcCopyDir = hc::hcMemcpyHostToHost; break; + case hipMemcpyHostToDevice: hcCopyDir = hc::hcMemcpyHostToDevice; break; + case hipMemcpyDeviceToHost: hcCopyDir = hc::hcMemcpyDeviceToHost; break; + case hipMemcpyDeviceToDevice: hcCopyDir = hc::hcMemcpyDeviceToDevice; break; + }; + + + // If this is P2P accessi, we need to check to see if the copy agent (specified by the stream where the copy is enqueued) + // has peer access enabled to both the source and dest. If this is true, then the copy agent can see both pointers + // and we can perform the access with the copy engine from the current stream. If not true, then we will copy through the host. (forceHostCopyEngine=true). + bool forceHostCopyEngine = false; + if (hcCopyDir == hc::hcMemcpyDeviceToDevice) { + if (!canSeePeerMemory(ctx, ihipGetPrimaryCtx(dstPtrInfo._appId), ihipGetPrimaryCtx(srcPtrInfo._appId))) { + forceHostCopyEngine = true; + tprintf (DB_COPY1, "Forcing use of host copy engine.\n"); + } else { + tprintf (DB_COPY1, "Will use SDMA engine on streamDevice=%s.\n", ctx->toString().c_str()); + } + }; + + crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceHostCopyEngine); } @@ -1410,16 +1489,23 @@ void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsig bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS); - // "tracked" really indicates if the pointer's virtual address is available in the GPU address space. - // If both pointers are not tracked, we need to fall back to a sync copy. - if (!dstTracked || !srcTracked) { - trueAsync = false; + bool copyEngineCanSeeSrcAndDest = true; + if (kind == hipMemcpyDeviceToDevice) { + copyEngineCanSeeSrcAndDest = canSeePeerMemory(ctx, ihipGetPrimaryCtx(dstPtrInfo._appId), ihipGetPrimaryCtx(srcPtrInfo._appId)); } + + // "tracked" really indicates if the pointer's virtual address is available in the GPU address space. + // If both pointers are not tracked, we need to fall back to a sync copy. + if (!dstTracked || !srcTracked || !copyEngineCanSeeSrcAndDest) { + trueAsync = false; + } + + if (trueAsync == true) { - // Perform a syncrhonous copy: + // Perform a synchronous copy: try { crit->_av.copy_async(src, dst, sizeBytes); } catch (Kalmar::runtime_exception) { @@ -1432,10 +1518,8 @@ void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsig this->wait(crit); } } else { - // Perform a syncrhonous copy: + // Perform a synchronous copy: if (kind == hipMemcpyDefault) { - bool srcInDeviceMem = (srcTracked && srcPtrInfo._isInDeviceMem); - bool dstInDeviceMem = (dstTracked && dstPtrInfo._isInDeviceMem); kind = resolveMemcpyDirection(srcTracked, dstTracked, srcPtrInfo._isInDeviceMem, dstPtrInfo._isInDeviceMem); } copySync(crit, dst, src, sizeBytes, kind); @@ -1481,3 +1565,4 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a } //// TODO - add identifier numbers for streams and devices to help with debugging. +//TODO - add a contect sequence number for debug. Print operator<< ctx:0.1 (device.ctx) diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index 668852bbfe..b973ba977b 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -104,7 +104,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) if (sizeBytes == 0) { *ptr = NULL; - return ihipLogStatus(hip_status); + return ihipLogStatus(hipSuccess); } auto ctx = ihipGetTlsDefaultCtx(); diff --git a/hipamd/tests/src/hipPeerToPeer_simple.cpp b/hipamd/tests/src/hipPeerToPeer_simple.cpp index 71d073b1b2..de89b36f46 100644 --- a/hipamd/tests/src/hipPeerToPeer_simple.cpp +++ b/hipamd/tests/src/hipPeerToPeer_simple.cpp @@ -33,6 +33,15 @@ int p_peerDevice = -1; // explicly specify which peer to use, else use p_gpuDe int g_currentDevice; int g_peerDevice; +void help(char *argv[]) +{ + printf ("usage: %s [OPTIONS]\n", argv[0]); + printf (" --memcpyWithPeer : Perform memcpy with peer.\n"); + printf (" --mirrorPeersi : Mirror memory onto both default device and peerdevice. If 0, memory is mapped only on the default device.\n"); + printf (" --peerDevice N : Set peer device.\n"); +}; + + void parseMyArguments(int argc, char *argv[]) { int more_argc = HipTest::parseStandardArguments(argc, argv, false); @@ -40,7 +49,10 @@ void parseMyArguments(int argc, char *argv[]) for (int i = 1; i < more_argc; i++) { const char *arg = argv[i]; - if (!strcmp(arg, "--memcpyWithPeer")) { + if (!strcmp(arg, "--help")) { + help(argv); + exit(-1); + } else if (!strcmp(arg, "--memcpyWithPeer")) { p_memcpyWithPeer = true; } else if (!strcmp(arg, "--mirrorPeers")) { p_mirrorPeers = true; @@ -90,10 +102,12 @@ void enablePeerFirst() setupPeerTests(); + // Always enable g_currentDevice to see the allocations on peerDevice. HIPCHECK(hipSetDevice(g_currentDevice)); HIPCHECK(hipDeviceEnablePeerAccess(g_peerDevice, 0)); if (p_mirrorPeers) { + // Mirror peers allows the peer device to see the allocations on currentDevice. int canAccessPeer; HIPCHECK(hipDeviceCanAccessPeer(&canAccessPeer, g_peerDevice, g_currentDevice)); assert(canAccessPeer); @@ -122,6 +136,8 @@ void enablePeerFirst() // Device0 push to device1, using P2P: + // NOTE : if p_mirrorPeers=0 and p_memcpyWithPeer=1, then peer device does not have mapping for A_d1 and we need to use a + // a host staging copy for the P2P access. HIPCHECK (hipSetDevice(p_memcpyWithPeer ? g_peerDevice : g_currentDevice)); HIPCHECK (hipMemcpy(A_d1, A_d0, Nbytes, hipMemcpyDefault)); // This is P2P copy.