Fix bug with peer-to-peer combined with context API
- Store the context inside the memory tracker rather than relying on the int deviceID, which
was always mapped to the primary context.
- isPeerWatcher is now based on device IDs rather than on specific peer contexts.
[ROCm/clr commit: a417241507]
This commit is contained in:
@@ -47,6 +47,9 @@ THE SOFTWARE.
|
||||
#include "trace_helper.h"
|
||||
#include "env.h"
|
||||
|
||||
//TODO - create a stream-based debug interface as an additional option for tprintf
|
||||
#define DB_PEER_CTX 0
|
||||
|
||||
|
||||
//=================================================================================================
|
||||
//Global variables:
|
||||
@@ -459,8 +462,20 @@ void ihipCtxCriticalBase_t<CtxMutex>::recomputePeerAgents()
|
||||
template<>
|
||||
bool ihipCtxCriticalBase_t<CtxMutex>::isPeerWatcher(const ihipCtx_t *peer)
|
||||
{
|
||||
auto match = std::find(_peers.begin(), _peers.end(), peer);
|
||||
auto match = std::find_if(_peers.begin(), _peers.end(),
|
||||
[=] (const ihipCtx_t *d) { return d->getDeviceNum() == peer->getDeviceNum(); });
|
||||
|
||||
return (match != std::end(_peers));
|
||||
|
||||
#if 0
|
||||
for (auto pi=_peers.begin(); pi != _peers.end(); pi++) {
|
||||
if ((*pi)->getDeviceNum() == peer->getDeviceNum()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -1677,18 +1692,24 @@ const char *ihipErrorString(hipError_t hip_error)
|
||||
// So we check dstCtx's and srcCtx's peerList to see if they both include thisCtx.
|
||||
bool ihipStream_t::canSeeMemory(const ihipCtx_t *copyEngineCtx, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo)
|
||||
{
|
||||
|
||||
// Make sure this is a device-to-device copy with all memory available to the requested copy engine
|
||||
//
|
||||
// TODO - pointer-info stores a deviceID, not a context; this may have some unusual side-effects here:
|
||||
if (dstPtrInfo->_sizeBytes == 0) {
|
||||
return false;
|
||||
} else {
|
||||
#if USE_APP_PTR_FOR_CTX
|
||||
ihipCtx_t *dstCtx = static_cast<ihipCtx_t*> (dstPtrInfo->_appPtr);
|
||||
#else
|
||||
ihipCtx_t *dstCtx = ihipGetPrimaryCtx(dstPtrInfo->_appId);
|
||||
#endif
|
||||
if (copyEngineCtx != dstCtx) {
|
||||
// Only checks peer list if contexts are different
|
||||
LockedAccessor_CtxCrit_t ctxCrit(dstCtx->criticalData());
|
||||
//tprintf(DB_SYNC, "dstCrit lock succeeded\n");
|
||||
#if DB_PEER_CTX
|
||||
std::cerr << "checking peer : copyEngineCtx =" << copyEngineCtx << " dstCtx =" << dstCtx << " peerCnt="
|
||||
<< ctxCrit->peerCnt() << "\n";
|
||||
#endif
|
||||
if (!ctxCrit->isPeerWatcher(copyEngineCtx)) {
|
||||
return false;
|
||||
};
|
||||
@@ -1696,16 +1717,22 @@ bool ihipStream_t::canSeeMemory(const ihipCtx_t *copyEngineCtx, const hc::AmPoin
|
||||
}
|
||||
|
||||
|
||||
|
||||
// TODO - pointer-info stores a deviceID, not a context; this may have some unusual side-effects here:
|
||||
if (srcPtrInfo->_sizeBytes == 0) {
|
||||
return false;
|
||||
} else {
|
||||
#if USE_APP_PTR_FOR_CTX
|
||||
ihipCtx_t *srcCtx = static_cast<ihipCtx_t*> (srcPtrInfo->_appPtr);
|
||||
#else
|
||||
ihipCtx_t *srcCtx = ihipGetPrimaryCtx(srcPtrInfo->_appId);
|
||||
#endif
|
||||
if (copyEngineCtx != srcCtx) {
|
||||
// Only checks peer list if contexts are different
|
||||
LockedAccessor_CtxCrit_t ctxCrit(srcCtx->criticalData());
|
||||
//tprintf(DB_SYNC, "srcCrit lock succeeded\n");
|
||||
#if DB_PEER_CTX
|
||||
std::cerr << "checking peer : copyEngineCtx =" << copyEngineCtx << " srcCtx =" << srcCtx << " peerCnt="
|
||||
<< ctxCrit->peerCnt() << "\n";
|
||||
#endif
|
||||
if (!ctxCrit->isPeerWatcher(copyEngineCtx)) {
|
||||
return false;
|
||||
};
|
||||
|
||||
@@ -32,10 +32,19 @@ THE SOFTWARE.
|
||||
#include "env.h"
|
||||
|
||||
|
||||
#if defined(__HCC__) && (__hcc_workweek__ < 16354)
|
||||
#if (__hcc_workweek__ < 16354)
|
||||
#error("This version of HIP requires a newer version of HCC.");
|
||||
#endif
|
||||
|
||||
// Use the __appPtr field in the am memtracker to store the context.
|
||||
// Requires a bug fix in HCC
|
||||
#if defined(__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE) and (__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE != 0)
|
||||
#define USE_APP_PTR_FOR_CTX 1
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
#define USE_IPC 1
|
||||
|
||||
//---
|
||||
|
||||
@@ -61,7 +61,11 @@ int sharePtr(void *ptr, ihipCtx_t *ctx, bool shareWithAll, unsigned hipFlags)
|
||||
|
||||
auto device = ctx->getWriteableDevice();
|
||||
|
||||
#if USE_APP_PTR_FOR_CTX
|
||||
hc::am_memtracker_update(ptr, device->_deviceId, hipFlags, ctx);
|
||||
#else
|
||||
hc::am_memtracker_update(ptr, device->_deviceId, hipFlags);
|
||||
#endif
|
||||
|
||||
if (shareWithAll) {
|
||||
hsa_status_t s = hsa_amd_agents_allow_access(g_deviceCnt+1, g_allAgents, NULL, ptr);
|
||||
@@ -660,7 +664,11 @@ hipError_t hipHostRegister(void *hostPtr, size_t sizeBytes, unsigned int flags)
|
||||
vecAcc.push_back(ihipGetDevice(i)->_acc);
|
||||
}
|
||||
am_status = hc::am_memory_host_lock(device->_acc, hostPtr, sizeBytes, &vecAcc[0], vecAcc.size());
|
||||
#if USE_APP_PTR_FOR_CTX
|
||||
hc::am_memtracker_update(hostPtr, device->_deviceId, flags, ctx);
|
||||
#else
|
||||
hc::am_memtracker_update(hostPtr, device->_deviceId, flags);
|
||||
#endif
|
||||
|
||||
tprintf(DB_MEM, " %s registered ptr=%p and allowed access to %zu peers\n", __func__, hostPtr, vecAcc.size());
|
||||
if(am_status == AM_SUCCESS){
|
||||
|
||||
Reference in a new issue
Block a user