Context management related changes in HIP.

-
-Contexts across threads are listed under device
-Device reset cleans up all contexts and re-initializes _primaryCtx

Change-Id: Ie1cfbb26d43a8dc6869be3e6ebaf7344ce374643


[ROCm/clr commit: c837b8d713]
Этот коммит содержится в:
Rahul Garg
2017-02-27 15:17:11 +05:30
родитель 37ab88ae1e
Коммит b4d190a445
4 изменённых файлов: 128 добавлений и 9 удалений
+14 -3
Просмотреть файл
@@ -58,9 +58,15 @@ hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device)
HIP_INIT_API(ctx, flags, device); // FIXME - review if we want to init
hipError_t e = hipSuccess;
auto deviceHandle = ihipGetDevice(device);
*ctx = new ihipCtx_t(deviceHandle, g_deviceCnt, flags);
ihipSetTlsDefaultCtx(*ctx);
tls_ctxStack.push(*ctx);
{
// Obtain mutex access to the device critical data, release by destructor
LockedAccessor_DeviceCrit_t deviceCrit(deviceHandle->criticalData());
auto ictx = new ihipCtx_t(deviceHandle, g_deviceCnt, flags);
*ctx = ictx;
ihipSetTlsDefaultCtx(*ctx);
tls_ctxStack.push(*ctx);
deviceCrit->addContext(ictx);
}
return ihipLogStatus(e);
}
@@ -125,6 +131,11 @@ hipError_t hipCtxDestroy(hipCtx_t ctx)
//need to destroy the ctx associated with calling thread
tls_ctxStack.pop();
}
{
auto deviceHandle = ctx->getWriteableDevice();
deviceHandle->locked_removeContext(ctx);
ctx->locked_reset();
}
delete ctx; //As per CUDA docs , attempting to access ctx from those threads which has this ctx as current, will result in the error HIP_ERROR_CONTEXT_IS_DESTROYED.
}
+6 -1
Просмотреть файл
@@ -166,11 +166,16 @@ hipError_t hipDeviceReset(void)
// This function currently does a user-level cleanup of known resources.
// It could benefit from KFD support to perform a more "nuclear" clean that would include any associated kernel resources and page table entries.
#if 0
if (ctx) {
// Release ctx resources (streams and memory):
ctx->locked_reset();
}
#endif
if (ctx) {
ihipDevice_t *deviceHandle = ctx->getWriteableDevice();
deviceHandle->locked_reset();
}
return ihipLogStatus(hipSuccess);
}
+61 -4
Просмотреть файл
@@ -522,6 +522,14 @@ void ihipCtxCriticalBase_t<CtxMutex>::addStream(ihipStream_t *stream)
_streams.push_back(stream);
tprintf(DB_SYNC, " addStream: %s\n", ToString(stream).c_str());
}
template<>
void ihipDeviceCriticalBase_t<DeviceMutex>::addContext(ihipCtx_t *ctx)
{
_ctxs.push_back(ctx);
tprintf(DB_SYNC, " addContext: %s\n", ToString(ctx).c_str());
}
//=============================================================================
//=================================================================================================
@@ -530,7 +538,8 @@ void ihipCtxCriticalBase_t<CtxMutex>::addStream(ihipStream_t *stream)
ihipDevice_t::ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator &acc) :
_deviceId(deviceId),
_acc(acc),
_state(0)
_state(0),
_criticalData(this)
{
hsa_agent_t *agent = static_cast<hsa_agent_t*> (acc.get_hsa_agent());
if (agent) {
@@ -557,7 +566,49 @@ ihipDevice_t::~ihipDevice_t()
_primaryCtx = NULL;
}
void ihipDevice_t::locked_removeContext(ihipCtx_t *c)
{
LockedAccessor_DeviceCrit_t crit(_criticalData);
crit->ctxs().remove(c);
tprintf(DB_SYNC, " locked_removeContext: %s\n", ToString(c).c_str());
}
void ihipDevice_t::locked_reset()
{
// Obtain mutex access to the device critical data, release by destructor
LockedAccessor_DeviceCrit_t crit(_criticalData);
//---
//Wait for pending activity to complete? TODO - check if this is required behavior:
tprintf(DB_SYNC, "locked_reset waiting for activity to complete.\n");
// Reset and remove streams:
// Delete all created streams including the default one.
for (auto ctxI=crit->const_ctxs().begin(); ctxI!=crit->const_ctxs().end(); ctxI++) {
ihipCtx_t *ctx = *ctxI;
(*ctxI)->locked_reset();
tprintf(DB_SYNC, " ctx cleanup %s\n", ToString(ctx).c_str());
delete ctx;
}
// Clear the list.
crit->ctxs().clear();
//reset _primaryCtx
_primaryCtx->locked_reset();
tprintf(DB_SYNC, " _primaryCtx cleanup %s\n", ToString(_primaryCtx).c_str());
// Reset and release all memory stored in the tracker:
// Reset will remove peer mapping so don't need to do this explicitly.
// FIXME - This is clearly a non-const action! Is this a context reset or a device reset - maybe should reference count?
_state = 0;
am_memtracker_reset(_acc);
};
#define ErrorCheck(x) error_check(x, __LINE__, __FILE__)
@@ -861,8 +912,14 @@ ihipCtx_t::ihipCtx_t(ihipDevice_t *device, unsigned deviceCnt, unsigned flags) :
_device(device),
_criticalData(this, deviceCnt)
{
locked_reset();
//locked_reset();
LockedAccessor_CtxCrit_t crit(_criticalData);
_defaultStream = new ihipStream_t(this, getDevice()->_acc.get_default_view(), hipStreamDefault);
crit->addStream(_defaultStream);
// Reset peer list to just me:
crit->resetPeerWatchers(this);
tprintf(DB_SYNC, "created ctx with defaultStream=%p (%s)\n", _defaultStream, ToString(_defaultStream).c_str());
};
@@ -905,7 +962,7 @@ void ihipCtx_t::locked_reset()
_defaultStream = new ihipStream_t(this, getDevice()->_acc.get_default_view(), hipStreamDefault);
crit->addStream(_defaultStream);
#if 0
// Reset peer list to just me:
crit->resetPeerWatchers(this);
@@ -915,7 +972,7 @@ void ihipCtx_t::locked_reset()
ihipDevice_t *device = getWriteableDevice();
device->_state = 0;
am_memtracker_reset(device->_acc);
#endif
};
+47 -1
Просмотреть файл
@@ -143,6 +143,8 @@ extern const char *API_COLOR_END;
#define CTX_THREAD_SAFE 1
#define DEVICE_THREAD_SAFE 1
// Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set.
// May be set to 0 to remove debug if checks - possible code size and performance difference?
@@ -371,6 +373,13 @@ typedef FakeMutex StreamMutex;
typedef std::mutex CtxMutex;
#else
typedef FakeMutex CtxMutex;
#warning "Ctx thread-safe disabled"
#endif
#if DEVICE_THREAD_SAFE
typedef std::mutex DeviceMutex;
#else
typedef FakeMutex DeviceMutex;
#warning "Device thread-safe disabled"
#endif
@@ -595,7 +604,40 @@ struct ihipEvent_t {
//=============================================================================
//class ihipDeviceCriticalBase_t
template <typename MUTEX_TYPE>
class ihipDeviceCriticalBase_t : LockedBase<MUTEX_TYPE>
{
public:
ihipDeviceCriticalBase_t(ihipDevice_t *parentDevice) :
_parent(parentDevice)
{
};
~ihipDeviceCriticalBase_t() {
}
// Contexts:
void addContext(ihipCtx_t *ctx);
void removeContext(ihipCtx_t *ctx);
std::list<ihipCtx_t*> &ctxs() { return _ctxs; };
const std::list<ihipCtx_t*> &const_ctxs() const { return _ctxs; };
int getcount() {return _ctxCount;};
friend class LockedAccessor<ihipDeviceCriticalBase_t>;
private:
ihipDevice_t *_parent;
//--- Context Tracker:
std::list< ihipCtx_t* > _ctxs; // contexts associated with this device across all threads.
int _ctxCount;
};
typedef ihipDeviceCriticalBase_t<DeviceMutex> ihipDeviceCritical_t;
typedef LockedAccessor<ihipDeviceCritical_t> LockedAccessor_DeviceCrit_t;
//----
// Properties of the HIP device.
@@ -608,7 +650,9 @@ public:
// Accessors:
ihipCtx_t *getPrimaryCtx() const { return _primaryCtx; };
void locked_removeContext(ihipCtx_t *c);
void locked_reset();
ihipDeviceCritical_t &criticalData() { return _criticalData; };
public:
unsigned _deviceId; // device ID
@@ -628,6 +672,8 @@ public:
private:
hipError_t initProperties(hipDeviceProp_t* prop);
private:
ihipDeviceCritical_t _criticalData;
};
//=============================================================================