diff --git a/projects/clr/hipamd/src/hip_context.cpp b/projects/clr/hipamd/src/hip_context.cpp index 6c862b114b..835cf6e795 100644 --- a/projects/clr/hipamd/src/hip_context.cpp +++ b/projects/clr/hipamd/src/hip_context.cpp @@ -58,9 +58,15 @@ hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device) HIP_INIT_API(ctx, flags, device); // FIXME - review if we want to init hipError_t e = hipSuccess; auto deviceHandle = ihipGetDevice(device); - *ctx = new ihipCtx_t(deviceHandle, g_deviceCnt, flags); - ihipSetTlsDefaultCtx(*ctx); - tls_ctxStack.push(*ctx); + { + // Obtain mutex access to the device critical data, release by destructor + LockedAccessor_DeviceCrit_t deviceCrit(deviceHandle->criticalData()); + auto ictx = new ihipCtx_t(deviceHandle, g_deviceCnt, flags); + *ctx = ictx; + ihipSetTlsDefaultCtx(*ctx); + tls_ctxStack.push(*ctx); + deviceCrit->addContext(ictx); + } return ihipLogStatus(e); } @@ -125,6 +131,11 @@ hipError_t hipCtxDestroy(hipCtx_t ctx) //need to destroy the ctx associated with calling thread tls_ctxStack.pop(); } + { + auto deviceHandle = ctx->getWriteableDevice(); + deviceHandle->locked_removeContext(ctx); + ctx->locked_reset(); + } delete ctx; //As per CUDA docs , attempting to access ctx from those threads which has this ctx as current, will result in the error HIP_ERROR_CONTEXT_IS_DESTROYED. } diff --git a/projects/clr/hipamd/src/hip_device.cpp b/projects/clr/hipamd/src/hip_device.cpp index 3cefca82e7..d3c68e6fdf 100644 --- a/projects/clr/hipamd/src/hip_device.cpp +++ b/projects/clr/hipamd/src/hip_device.cpp @@ -166,11 +166,16 @@ hipError_t hipDeviceReset(void) // This function currently does a user-level cleanup of known resources. // It could benefit from KFD support to perform a more "nuclear" clean that would include any associated kernel resources and page table entries. - +#if 0 if (ctx) { // Release ctx resources (streams and memory): ctx->locked_reset(); } +#endif + if (ctx) { + ihipDevice_t *deviceHandle = ctx->getWriteableDevice(); + deviceHandle->locked_reset(); + } return ihipLogStatus(hipSuccess); } diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index efc0265ba8..a294cc0097 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -522,6 +522,14 @@ void ihipCtxCriticalBase_t::addStream(ihipStream_t *stream) _streams.push_back(stream); tprintf(DB_SYNC, " addStream: %s\n", ToString(stream).c_str()); } + +template<> +void ihipDeviceCriticalBase_t::addContext(ihipCtx_t *ctx) +{ + _ctxs.push_back(ctx); + tprintf(DB_SYNC, " addContext: %s\n", ToString(ctx).c_str()); +} + //============================================================================= //================================================================================================= @@ -530,7 +538,8 @@ void ihipCtxCriticalBase_t::addStream(ihipStream_t *stream) ihipDevice_t::ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator &acc) : _deviceId(deviceId), _acc(acc), - _state(0) + _state(0), + _criticalData(this) { hsa_agent_t *agent = static_cast (acc.get_hsa_agent()); if (agent) { @@ -557,7 +566,49 @@ ihipDevice_t::~ihipDevice_t() _primaryCtx = NULL; } +void ihipDevice_t::locked_removeContext(ihipCtx_t *c) +{ + LockedAccessor_DeviceCrit_t crit(_criticalData); + crit->ctxs().remove(c); + tprintf(DB_SYNC, " locked_removeContext: %s\n", ToString(c).c_str()); +} + + +void ihipDevice_t::locked_reset() +{ + // Obtain mutex access to the device critical data, release by destructor + LockedAccessor_DeviceCrit_t crit(_criticalData); + + + //--- + //Wait for pending activity to complete? TODO - check if this is required behavior: + tprintf(DB_SYNC, "locked_reset waiting for activity to complete.\n"); + + // Reset and remove streams: + // Delete all created streams including the default one. + for (auto ctxI=crit->const_ctxs().begin(); ctxI!=crit->const_ctxs().end(); ctxI++) { + ihipCtx_t *ctx = *ctxI; + (*ctxI)->locked_reset(); + tprintf(DB_SYNC, " ctx cleanup %s\n", ToString(ctx).c_str()); + + delete ctx; + } + // Clear the list. + crit->ctxs().clear(); + + + //reset _primaryCtx + _primaryCtx->locked_reset(); + tprintf(DB_SYNC, " _primaryCtx cleanup %s\n", ToString(_primaryCtx).c_str()); + // Reset and release all memory stored in the tracker: + // Reset will remove peer mapping so don't need to do this explicitly. + // FIXME - This is clearly a non-const action! Is this a context reset or a device reset - maybe should reference count? + + _state = 0; + am_memtracker_reset(_acc); + +}; #define ErrorCheck(x) error_check(x, __LINE__, __FILE__) @@ -861,8 +912,14 @@ ihipCtx_t::ihipCtx_t(ihipDevice_t *device, unsigned deviceCnt, unsigned flags) : _device(device), _criticalData(this, deviceCnt) { - locked_reset(); + //locked_reset(); + LockedAccessor_CtxCrit_t crit(_criticalData); + _defaultStream = new ihipStream_t(this, getDevice()->_acc.get_default_view(), hipStreamDefault); + crit->addStream(_defaultStream); + + // Reset peer list to just me: + crit->resetPeerWatchers(this); tprintf(DB_SYNC, "created ctx with defaultStream=%p (%s)\n", _defaultStream, ToString(_defaultStream).c_str()); }; @@ -905,7 +962,7 @@ void ihipCtx_t::locked_reset() _defaultStream = new ihipStream_t(this, getDevice()->_acc.get_default_view(), hipStreamDefault); crit->addStream(_defaultStream); - +#if 0 // Reset peer list to just me: crit->resetPeerWatchers(this); @@ -915,7 +972,7 @@ void ihipCtx_t::locked_reset() ihipDevice_t *device = getWriteableDevice(); device->_state = 0; am_memtracker_reset(device->_acc); - +#endif }; diff --git a/projects/clr/hipamd/src/hip_hcc.h b/projects/clr/hipamd/src/hip_hcc.h index 9ebac73cad..105eef6bb8 100644 --- a/projects/clr/hipamd/src/hip_hcc.h +++ b/projects/clr/hipamd/src/hip_hcc.h @@ -143,6 +143,8 @@ extern const char *API_COLOR_END; #define CTX_THREAD_SAFE 1 +#define DEVICE_THREAD_SAFE 1 + // Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set. // May be set to 0 to remove debug if checks - possible code size and performance difference? @@ -371,6 +373,13 @@ typedef FakeMutex StreamMutex; typedef std::mutex CtxMutex; #else typedef FakeMutex CtxMutex; +#warning "Ctx thread-safe disabled" +#endif + +#if DEVICE_THREAD_SAFE +typedef std::mutex DeviceMutex; +#else +typedef FakeMutex DeviceMutex; #warning "Device thread-safe disabled" #endif @@ -595,7 +604,40 @@ struct ihipEvent_t { +//============================================================================= +//class ihipDeviceCriticalBase_t +template +class ihipDeviceCriticalBase_t : LockedBase +{ +public: + ihipDeviceCriticalBase_t(ihipDevice_t *parentDevice) : + _parent(parentDevice) + { + }; + ~ihipDeviceCriticalBase_t() { + + } + + // Contexts: + void addContext(ihipCtx_t *ctx); + void removeContext(ihipCtx_t *ctx); + std::list &ctxs() { return _ctxs; }; + const std::list &const_ctxs() const { return _ctxs; }; + int getcount() {return _ctxCount;}; + friend class LockedAccessor; +private: + ihipDevice_t *_parent; + + //--- Context Tracker: + std::list< ihipCtx_t* > _ctxs; // contexts associated with this device across all threads. + + int _ctxCount; +}; + +typedef ihipDeviceCriticalBase_t ihipDeviceCritical_t; + +typedef LockedAccessor LockedAccessor_DeviceCrit_t; //---- // Properties of the HIP device. @@ -608,7 +650,9 @@ public: // Accessors: ihipCtx_t *getPrimaryCtx() const { return _primaryCtx; }; - + void locked_removeContext(ihipCtx_t *c); + void locked_reset(); + ihipDeviceCritical_t &criticalData() { return _criticalData; }; public: unsigned _deviceId; // device ID @@ -628,6 +672,8 @@ public: private: hipError_t initProperties(hipDeviceProp_t* prop); +private: + ihipDeviceCritical_t _criticalData; }; //=============================================================================