Context management related changes in HIP.
-
-Contexts across threads are listed under device
-Device reset cleans up all contexts and re-initializes _primaryCtx
Change-Id: Ie1cfbb26d43a8dc6869be3e6ebaf7344ce374643
[ROCm/clr commit: c837b8d713]
Этот коммит содержится в:
@@ -58,9 +58,15 @@ hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device)
|
||||
HIP_INIT_API(ctx, flags, device); // FIXME - review if we want to init
|
||||
hipError_t e = hipSuccess;
|
||||
auto deviceHandle = ihipGetDevice(device);
|
||||
*ctx = new ihipCtx_t(deviceHandle, g_deviceCnt, flags);
|
||||
ihipSetTlsDefaultCtx(*ctx);
|
||||
tls_ctxStack.push(*ctx);
|
||||
{
|
||||
// Obtain mutex access to the device critical data, release by destructor
|
||||
LockedAccessor_DeviceCrit_t deviceCrit(deviceHandle->criticalData());
|
||||
auto ictx = new ihipCtx_t(deviceHandle, g_deviceCnt, flags);
|
||||
*ctx = ictx;
|
||||
ihipSetTlsDefaultCtx(*ctx);
|
||||
tls_ctxStack.push(*ctx);
|
||||
deviceCrit->addContext(ictx);
|
||||
}
|
||||
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
@@ -125,6 +131,11 @@ hipError_t hipCtxDestroy(hipCtx_t ctx)
|
||||
//need to destroy the ctx associated with calling thread
|
||||
tls_ctxStack.pop();
|
||||
}
|
||||
{
|
||||
auto deviceHandle = ctx->getWriteableDevice();
|
||||
deviceHandle->locked_removeContext(ctx);
|
||||
ctx->locked_reset();
|
||||
}
|
||||
delete ctx; //As per CUDA docs , attempting to access ctx from those threads which has this ctx as current, will result in the error HIP_ERROR_CONTEXT_IS_DESTROYED.
|
||||
}
|
||||
|
||||
|
||||
@@ -166,11 +166,16 @@ hipError_t hipDeviceReset(void)
|
||||
// This function currently does a user-level cleanup of known resources.
|
||||
// It could benefit from KFD support to perform a more "nuclear" clean that would include any associated kernel resources and page table entries.
|
||||
|
||||
|
||||
#if 0
|
||||
if (ctx) {
|
||||
// Release ctx resources (streams and memory):
|
||||
ctx->locked_reset();
|
||||
}
|
||||
#endif
|
||||
if (ctx) {
|
||||
ihipDevice_t *deviceHandle = ctx->getWriteableDevice();
|
||||
deviceHandle->locked_reset();
|
||||
}
|
||||
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
|
||||
@@ -522,6 +522,14 @@ void ihipCtxCriticalBase_t<CtxMutex>::addStream(ihipStream_t *stream)
|
||||
_streams.push_back(stream);
|
||||
tprintf(DB_SYNC, " addStream: %s\n", ToString(stream).c_str());
|
||||
}
|
||||
|
||||
template<>
|
||||
void ihipDeviceCriticalBase_t<DeviceMutex>::addContext(ihipCtx_t *ctx)
|
||||
{
|
||||
_ctxs.push_back(ctx);
|
||||
tprintf(DB_SYNC, " addContext: %s\n", ToString(ctx).c_str());
|
||||
}
|
||||
|
||||
//=============================================================================
|
||||
|
||||
//=================================================================================================
|
||||
@@ -530,7 +538,8 @@ void ihipCtxCriticalBase_t<CtxMutex>::addStream(ihipStream_t *stream)
|
||||
ihipDevice_t::ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator &acc) :
|
||||
_deviceId(deviceId),
|
||||
_acc(acc),
|
||||
_state(0)
|
||||
_state(0),
|
||||
_criticalData(this)
|
||||
{
|
||||
hsa_agent_t *agent = static_cast<hsa_agent_t*> (acc.get_hsa_agent());
|
||||
if (agent) {
|
||||
@@ -557,7 +566,49 @@ ihipDevice_t::~ihipDevice_t()
|
||||
_primaryCtx = NULL;
|
||||
}
|
||||
|
||||
void ihipDevice_t::locked_removeContext(ihipCtx_t *c)
|
||||
{
|
||||
LockedAccessor_DeviceCrit_t crit(_criticalData);
|
||||
|
||||
crit->ctxs().remove(c);
|
||||
tprintf(DB_SYNC, " locked_removeContext: %s\n", ToString(c).c_str());
|
||||
}
|
||||
|
||||
|
||||
void ihipDevice_t::locked_reset()
|
||||
{
|
||||
// Obtain mutex access to the device critical data, release by destructor
|
||||
LockedAccessor_DeviceCrit_t crit(_criticalData);
|
||||
|
||||
|
||||
//---
|
||||
//Wait for pending activity to complete? TODO - check if this is required behavior:
|
||||
tprintf(DB_SYNC, "locked_reset waiting for activity to complete.\n");
|
||||
|
||||
// Reset and remove streams:
|
||||
// Delete all created streams including the default one.
|
||||
for (auto ctxI=crit->const_ctxs().begin(); ctxI!=crit->const_ctxs().end(); ctxI++) {
|
||||
ihipCtx_t *ctx = *ctxI;
|
||||
(*ctxI)->locked_reset();
|
||||
tprintf(DB_SYNC, " ctx cleanup %s\n", ToString(ctx).c_str());
|
||||
|
||||
delete ctx;
|
||||
}
|
||||
// Clear the list.
|
||||
crit->ctxs().clear();
|
||||
|
||||
|
||||
//reset _primaryCtx
|
||||
_primaryCtx->locked_reset();
|
||||
tprintf(DB_SYNC, " _primaryCtx cleanup %s\n", ToString(_primaryCtx).c_str());
|
||||
// Reset and release all memory stored in the tracker:
|
||||
// Reset will remove peer mapping so don't need to do this explicitly.
|
||||
// FIXME - This is clearly a non-const action! Is this a context reset or a device reset - maybe should reference count?
|
||||
|
||||
_state = 0;
|
||||
am_memtracker_reset(_acc);
|
||||
|
||||
};
|
||||
|
||||
#define ErrorCheck(x) error_check(x, __LINE__, __FILE__)
|
||||
|
||||
@@ -861,8 +912,14 @@ ihipCtx_t::ihipCtx_t(ihipDevice_t *device, unsigned deviceCnt, unsigned flags) :
|
||||
_device(device),
|
||||
_criticalData(this, deviceCnt)
|
||||
{
|
||||
locked_reset();
|
||||
//locked_reset();
|
||||
LockedAccessor_CtxCrit_t crit(_criticalData);
|
||||
_defaultStream = new ihipStream_t(this, getDevice()->_acc.get_default_view(), hipStreamDefault);
|
||||
crit->addStream(_defaultStream);
|
||||
|
||||
|
||||
// Reset peer list to just me:
|
||||
crit->resetPeerWatchers(this);
|
||||
tprintf(DB_SYNC, "created ctx with defaultStream=%p (%s)\n", _defaultStream, ToString(_defaultStream).c_str());
|
||||
};
|
||||
|
||||
@@ -905,7 +962,7 @@ void ihipCtx_t::locked_reset()
|
||||
_defaultStream = new ihipStream_t(this, getDevice()->_acc.get_default_view(), hipStreamDefault);
|
||||
crit->addStream(_defaultStream);
|
||||
|
||||
|
||||
#if 0
|
||||
// Reset peer list to just me:
|
||||
crit->resetPeerWatchers(this);
|
||||
|
||||
@@ -915,7 +972,7 @@ void ihipCtx_t::locked_reset()
|
||||
ihipDevice_t *device = getWriteableDevice();
|
||||
device->_state = 0;
|
||||
am_memtracker_reset(device->_acc);
|
||||
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
|
||||
@@ -143,6 +143,8 @@ extern const char *API_COLOR_END;
|
||||
|
||||
#define CTX_THREAD_SAFE 1
|
||||
|
||||
#define DEVICE_THREAD_SAFE 1
|
||||
|
||||
|
||||
// Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set.
|
||||
// May be set to 0 to remove debug if checks - possible code size and performance difference?
|
||||
@@ -371,6 +373,13 @@ typedef FakeMutex StreamMutex;
|
||||
typedef std::mutex CtxMutex;
|
||||
#else
|
||||
typedef FakeMutex CtxMutex;
|
||||
#warning "Ctx thread-safe disabled"
|
||||
#endif
|
||||
|
||||
#if DEVICE_THREAD_SAFE
|
||||
typedef std::mutex DeviceMutex;
|
||||
#else
|
||||
typedef FakeMutex DeviceMutex;
|
||||
#warning "Device thread-safe disabled"
|
||||
#endif
|
||||
|
||||
@@ -595,7 +604,40 @@ struct ihipEvent_t {
|
||||
|
||||
|
||||
|
||||
//=============================================================================
|
||||
//class ihipDeviceCriticalBase_t
|
||||
template <typename MUTEX_TYPE>
|
||||
class ihipDeviceCriticalBase_t : LockedBase<MUTEX_TYPE>
|
||||
{
|
||||
public:
|
||||
ihipDeviceCriticalBase_t(ihipDevice_t *parentDevice) :
|
||||
_parent(parentDevice)
|
||||
{
|
||||
};
|
||||
|
||||
~ihipDeviceCriticalBase_t() {
|
||||
|
||||
}
|
||||
|
||||
// Contexts:
|
||||
void addContext(ihipCtx_t *ctx);
|
||||
void removeContext(ihipCtx_t *ctx);
|
||||
std::list<ihipCtx_t*> &ctxs() { return _ctxs; };
|
||||
const std::list<ihipCtx_t*> &const_ctxs() const { return _ctxs; };
|
||||
int getcount() {return _ctxCount;};
|
||||
friend class LockedAccessor<ihipDeviceCriticalBase_t>;
|
||||
private:
|
||||
ihipDevice_t *_parent;
|
||||
|
||||
//--- Context Tracker:
|
||||
std::list< ihipCtx_t* > _ctxs; // contexts associated with this device across all threads.
|
||||
|
||||
int _ctxCount;
|
||||
};
|
||||
|
||||
typedef ihipDeviceCriticalBase_t<DeviceMutex> ihipDeviceCritical_t;
|
||||
|
||||
typedef LockedAccessor<ihipDeviceCritical_t> LockedAccessor_DeviceCrit_t;
|
||||
|
||||
//----
|
||||
// Properties of the HIP device.
|
||||
@@ -608,7 +650,9 @@ public:
|
||||
|
||||
// Accessors:
|
||||
ihipCtx_t *getPrimaryCtx() const { return _primaryCtx; };
|
||||
|
||||
void locked_removeContext(ihipCtx_t *c);
|
||||
void locked_reset();
|
||||
ihipDeviceCritical_t &criticalData() { return _criticalData; };
|
||||
public:
|
||||
unsigned _deviceId; // device ID
|
||||
|
||||
@@ -628,6 +672,8 @@ public:
|
||||
|
||||
private:
|
||||
hipError_t initProperties(hipDeviceProp_t* prop);
|
||||
private:
|
||||
ihipDeviceCritical_t _criticalData;
|
||||
};
|
||||
//=============================================================================
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user