Peer-to-Peer improvements.
- Bug fix for peer visibility. Now contexts correctly detect when they can use SDMA for P2P vs staging buffers. - Interface to new HCC copy_ext function. - Improve context and peer print /debug options. - Add comments and usage to hipPeerToPeer_simple test.
Этот коммит содержится в:
@@ -416,6 +416,8 @@ private:
|
||||
// The unsigned return is hipMemcpyKind
|
||||
unsigned resolveMemcpyDirection(bool srcTracked, bool dstTracked, bool srcInDeviceMem, bool dstInDeviceMem);
|
||||
|
||||
bool canSeePeerMemory(const ihipCtx_t *thisCtx, ihipCtx_t *dstCtx, ihipCtx_t *srcCtx);
|
||||
|
||||
|
||||
private: // Data
|
||||
// Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t
|
||||
@@ -516,10 +518,11 @@ public:
|
||||
|
||||
|
||||
// Peer Accessor classes:
|
||||
bool isPeer(const ihipCtx_t *peer); // returns Trus if peer has access to memory physically located on this device.
|
||||
bool isPeer(const ihipCtx_t *peer); // returns True if peer has access to memory physically located on this device.
|
||||
bool addPeer(ihipCtx_t *peer);
|
||||
bool removePeer(ihipCtx_t *peer);
|
||||
void resetPeers(ihipCtx_t *thisDevice);
|
||||
void printPeers(FILE *f) const;
|
||||
|
||||
uint32_t peerCnt() const { return _peerCnt; };
|
||||
hsa_agent_t *peerAgents() const { return _peerAgents; };
|
||||
@@ -535,6 +538,7 @@ private:
|
||||
//--- Peer Tracker:
|
||||
// These reflect the currently Enabled set of peers for this GPU:
|
||||
// Enabled peers have permissions to access the memory physically allocated on this device.
|
||||
// Note the peers always contain the self agent for easy interfacing with HSA APIs.
|
||||
std::list<ihipCtx_t*> _peers; // list of enabled peer devices.
|
||||
uint32_t _peerCnt; // number of enabled peers
|
||||
hsa_agent_t *_peerAgents; // efficient packed array of enabled agents (to use for allocations.)
|
||||
@@ -578,6 +582,8 @@ public: // Functions:
|
||||
// TODO - review uses of getWriteableDevice(), can these be converted to getDevice()
|
||||
ihipDevice_t *getWriteableDevice() const { return _device; };
|
||||
|
||||
std::string toString() const;
|
||||
|
||||
public: // Data
|
||||
// The NULL stream is used if no other stream is specified.
|
||||
// Default stream has special synchronization properties with other streams.
|
||||
@@ -663,5 +669,12 @@ inline std::ostream& operator<<(std::ostream& os, const hipEvent_t& e)
|
||||
return os;
|
||||
}
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c)
|
||||
{
|
||||
os << "ctx:" << static_cast<const void*> (c)
|
||||
<< " dev:" << c->getDevice()->_deviceId;
|
||||
return os;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -381,6 +381,18 @@ void ihipCtxCriticalBase_t<CtxMutex>::resetPeers(ihipCtx_t *thisDevice)
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
void ihipCtxCriticalBase_t<CtxMutex>::printPeers(FILE *f) const
|
||||
{
|
||||
for (auto iter = _peers.begin(); iter!=_peers.end(); iter++) {
|
||||
fprintf (f, "%s ", (*iter)->toString().c_str());
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
template<>
|
||||
void ihipCtxCriticalBase_t<CtxMutex>::addStream(ihipStream_t *stream)
|
||||
{
|
||||
@@ -784,6 +796,13 @@ void ihipCtx_t::locked_reset()
|
||||
};
|
||||
|
||||
|
||||
//---
|
||||
std::string ihipCtx_t::toString() const
|
||||
{
|
||||
std::ostringstream ss;
|
||||
ss << this;
|
||||
return ss.str();
|
||||
};
|
||||
|
||||
//----
|
||||
|
||||
@@ -1328,8 +1347,36 @@ void ihipSetTs(hipEvent_t e)
|
||||
}
|
||||
|
||||
|
||||
// Returns true if thisCtx can see the memory allocated on dstCtx and srcCtx.
|
||||
// The peer-list for a context controls which contexts have access to the memory allocated on that context.
|
||||
// So we check dstCtx's and srcCtx's peerList to see if the booth include thisCtx.
|
||||
bool ihipStream_t::canSeePeerMemory(const ihipCtx_t *thisCtx, ihipCtx_t *dstCtx, ihipCtx_t *srcCtx)
|
||||
{
|
||||
tprintf (DB_COPY1, "Checking if direct copy can be used. thisCtx:%s; dstCtx:%s ; srcCtx:%s\n",
|
||||
thisCtx->toString().c_str(), dstCtx->toString().c_str(), srcCtx->toString().c_str());
|
||||
|
||||
// Use blocks to control scope of critical sections.
|
||||
{
|
||||
LockedAccessor_CtxCrit_t ctxCrit(dstCtx->criticalData());
|
||||
if (!ctxCrit->isPeer(thisCtx)) {
|
||||
return false;
|
||||
};
|
||||
}
|
||||
|
||||
{
|
||||
LockedAccessor_CtxCrit_t ctxCrit(srcCtx->criticalData());
|
||||
if (!ctxCrit->isPeer(thisCtx)) {
|
||||
return false;
|
||||
};
|
||||
}
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
|
||||
|
||||
// Resolve hipMemcpyDefault to a known type.
|
||||
// TODO - review why is this so complicated, does this need srcTracked and dstTracked?
|
||||
unsigned ihipStream_t::resolveMemcpyDirection(bool srcTracked, bool dstTracked, bool srcInDeviceMem, bool dstInDeviceMem)
|
||||
{
|
||||
hipMemcpyKind kind = hipMemcpyDefault;
|
||||
@@ -1358,6 +1405,7 @@ unsigned ihipStream_t::resolveMemcpyDirection(bool srcTracked, bool dstTracked,
|
||||
}
|
||||
|
||||
|
||||
// TODO - remove kind parm from here or use it below?
|
||||
void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn)
|
||||
{
|
||||
ihipCtx_t *ctx = this->getCtx();
|
||||
@@ -1367,7 +1415,38 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
|
||||
throw ihipException(hipErrorInvalidDevice);
|
||||
}
|
||||
|
||||
crit->_av.copy(src, dst, sizeBytes);
|
||||
hc::accelerator acc;
|
||||
hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0);
|
||||
hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0);
|
||||
bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS);
|
||||
bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS);
|
||||
|
||||
if (kind == hipMemcpyDefault) {
|
||||
kind = resolveMemcpyDirection(srcTracked, dstTracked, srcPtrInfo._isInDeviceMem, dstPtrInfo._isInDeviceMem);
|
||||
}
|
||||
hc::hcCommandKind hcCopyDir;
|
||||
switch (kind) {
|
||||
case hipMemcpyHostToHost: hcCopyDir = hc::hcMemcpyHostToHost; break;
|
||||
case hipMemcpyHostToDevice: hcCopyDir = hc::hcMemcpyHostToDevice; break;
|
||||
case hipMemcpyDeviceToHost: hcCopyDir = hc::hcMemcpyDeviceToHost; break;
|
||||
case hipMemcpyDeviceToDevice: hcCopyDir = hc::hcMemcpyDeviceToDevice; break;
|
||||
};
|
||||
|
||||
|
||||
// If this is P2P accessi, we need to check to see if the copy agent (specified by the stream where the copy is enqueued)
|
||||
// has peer access enabled to both the source and dest. If this is true, then the copy agent can see both pointers
|
||||
// and we can perform the access with the copy engine from the current stream. If not true, then we will copy through the host. (forceHostCopyEngine=true).
|
||||
bool forceHostCopyEngine = false;
|
||||
if (hcCopyDir == hc::hcMemcpyDeviceToDevice) {
|
||||
if (!canSeePeerMemory(ctx, ihipGetPrimaryCtx(dstPtrInfo._appId), ihipGetPrimaryCtx(srcPtrInfo._appId))) {
|
||||
forceHostCopyEngine = true;
|
||||
tprintf (DB_COPY1, "Forcing use of host copy engine.\n");
|
||||
} else {
|
||||
tprintf (DB_COPY1, "Will use SDMA engine on streamDevice=%s.\n", ctx->toString().c_str());
|
||||
}
|
||||
};
|
||||
|
||||
crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceHostCopyEngine);
|
||||
}
|
||||
|
||||
|
||||
@@ -1410,16 +1489,23 @@ void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsig
|
||||
bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS);
|
||||
|
||||
|
||||
// "tracked" really indicates if the pointer's virtual address is available in the GPU address space.
|
||||
// If both pointers are not tracked, we need to fall back to a sync copy.
|
||||
if (!dstTracked || !srcTracked) {
|
||||
trueAsync = false;
|
||||
bool copyEngineCanSeeSrcAndDest = true;
|
||||
if (kind == hipMemcpyDeviceToDevice) {
|
||||
copyEngineCanSeeSrcAndDest = canSeePeerMemory(ctx, ihipGetPrimaryCtx(dstPtrInfo._appId), ihipGetPrimaryCtx(srcPtrInfo._appId));
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// "tracked" really indicates if the pointer's virtual address is available in the GPU address space.
|
||||
// If both pointers are not tracked, we need to fall back to a sync copy.
|
||||
if (!dstTracked || !srcTracked || !copyEngineCanSeeSrcAndDest) {
|
||||
trueAsync = false;
|
||||
}
|
||||
|
||||
|
||||
if (trueAsync == true) {
|
||||
// Perform a syncrhonous copy:
|
||||
// Perform a synchronous copy:
|
||||
try {
|
||||
crit->_av.copy_async(src, dst, sizeBytes);
|
||||
} catch (Kalmar::runtime_exception) {
|
||||
@@ -1432,10 +1518,8 @@ void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsig
|
||||
this->wait(crit);
|
||||
}
|
||||
} else {
|
||||
// Perform a syncrhonous copy:
|
||||
// Perform a synchronous copy:
|
||||
if (kind == hipMemcpyDefault) {
|
||||
bool srcInDeviceMem = (srcTracked && srcPtrInfo._isInDeviceMem);
|
||||
bool dstInDeviceMem = (dstTracked && dstPtrInfo._isInDeviceMem);
|
||||
kind = resolveMemcpyDirection(srcTracked, dstTracked, srcPtrInfo._isInDeviceMem, dstPtrInfo._isInDeviceMem);
|
||||
}
|
||||
copySync(crit, dst, src, sizeBytes, kind);
|
||||
@@ -1481,3 +1565,4 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a
|
||||
}
|
||||
|
||||
//// TODO - add identifier numbers for streams and devices to help with debugging.
|
||||
//TODO - add a contect sequence number for debug. Print operator<< ctx:0.1 (device.ctx)
|
||||
|
||||
@@ -104,7 +104,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)
|
||||
if (sizeBytes == 0)
|
||||
{
|
||||
*ptr = NULL;
|
||||
return ihipLogStatus(hip_status);
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
|
||||
@@ -33,6 +33,15 @@ int p_peerDevice = -1; // explicly specify which peer to use, else use p_gpuDe
|
||||
int g_currentDevice;
|
||||
int g_peerDevice;
|
||||
|
||||
void help(char *argv[])
|
||||
{
|
||||
printf ("usage: %s [OPTIONS]\n", argv[0]);
|
||||
printf (" --memcpyWithPeer : Perform memcpy with peer.\n");
|
||||
printf (" --mirrorPeersi : Mirror memory onto both default device and peerdevice. If 0, memory is mapped only on the default device.\n");
|
||||
printf (" --peerDevice N : Set peer device.\n");
|
||||
};
|
||||
|
||||
|
||||
void parseMyArguments(int argc, char *argv[])
|
||||
{
|
||||
int more_argc = HipTest::parseStandardArguments(argc, argv, false);
|
||||
@@ -40,7 +49,10 @@ void parseMyArguments(int argc, char *argv[])
|
||||
for (int i = 1; i < more_argc; i++) {
|
||||
const char *arg = argv[i];
|
||||
|
||||
if (!strcmp(arg, "--memcpyWithPeer")) {
|
||||
if (!strcmp(arg, "--help")) {
|
||||
help(argv);
|
||||
exit(-1);
|
||||
} else if (!strcmp(arg, "--memcpyWithPeer")) {
|
||||
p_memcpyWithPeer = true;
|
||||
} else if (!strcmp(arg, "--mirrorPeers")) {
|
||||
p_mirrorPeers = true;
|
||||
@@ -90,10 +102,12 @@ void enablePeerFirst()
|
||||
|
||||
setupPeerTests();
|
||||
|
||||
// Always enable g_currentDevice to see the allocations on peerDevice.
|
||||
HIPCHECK(hipSetDevice(g_currentDevice));
|
||||
HIPCHECK(hipDeviceEnablePeerAccess(g_peerDevice, 0));
|
||||
|
||||
if (p_mirrorPeers) {
|
||||
// Mirror peers allows the peer device to see the allocations on currentDevice.
|
||||
int canAccessPeer;
|
||||
HIPCHECK(hipDeviceCanAccessPeer(&canAccessPeer, g_peerDevice, g_currentDevice));
|
||||
assert(canAccessPeer);
|
||||
@@ -122,6 +136,8 @@ void enablePeerFirst()
|
||||
|
||||
|
||||
// Device0 push to device1, using P2P:
|
||||
// NOTE : if p_mirrorPeers=0 and p_memcpyWithPeer=1, then peer device does not have mapping for A_d1 and we need to use a
|
||||
// a host staging copy for the P2P access.
|
||||
HIPCHECK (hipSetDevice(p_memcpyWithPeer ? g_peerDevice : g_currentDevice));
|
||||
HIPCHECK (hipMemcpy(A_d1, A_d0, Nbytes, hipMemcpyDefault)); // This is P2P copy.
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user