diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h index 933cff16e2..dcff9bd61e 100644 --- a/include/hcc_detail/hip_hcc.h +++ b/include/hcc_detail/hip_hcc.h @@ -22,7 +22,7 @@ THE SOFTWARE. #include #include "hip/hcc_detail/hip_util.h" -#include "hip/hcc_detail/staging_buffer.h" +#include "hip/hcc_detail/unpinned_copy_engine.h" #if defined(__HCC__) && (__hcc_workweek__ < 16186) @@ -69,7 +69,7 @@ extern int HIP_DISABLE_HW_COPY_DEP; //--- //Extern tls extern thread_local int tls_defaultDeviceId; -extern thread_local ihipCtx_t *tls_defaultCtx; +extern thread_local ihipCtx_t *tls_defaultCtx; extern thread_local hipError_t tls_lastHipError; @@ -101,11 +101,11 @@ class ihipCtx_t; #define CTX_THREAD_SAFE 1 -// If FORCE_COPY_DEP=1 , HIP runtime will add +// If FORCE_COPY_DEP=1 , HIP runtime will add // synchronization for copy commands in the same stream, regardless of command type. // If FORCE_COPY_DEP=0 data copies of the same kind (H2H, H2D, D2H, D2D) are assumed to be implicitly ordered. -// ROCR runtime implementation currently provides this guarantee when using SDMA queues but not -// when using shader queues. +// ROCR runtime implementation currently provides this guarantee when using SDMA queues but not +// when using shader queues. // TODO - measure if this matters for performance, in particular for back-to-back small copies. // If not, we can simplify the copy dependency tracking by collapsing to a single Copy type, and always forcing dependencies for copy commands. #define FORCE_SAMEDIR_COPY_DEP 1 @@ -121,7 +121,7 @@ class ihipCtx_t; // 0x2 = prints a simple message with function name + return code when function exits. // 0x3 = print both. // Must be enabled at runtime with HIP_TRACE_API -#define COMPILE_HIP_TRACE_API 0x3 +#define COMPILE_HIP_TRACE_API 0x3 // Compile code that generates trace markers for CodeXL ATP at HIP function begin/end. @@ -141,13 +141,13 @@ class ihipCtx_t; #if COMPILE_HIP_ATP_MARKER #include "CXLActivityLogger.h" #define SCOPED_MARKER(markerName,group,userString) amdtScopedMarker(markerName, group, userString) -#else +#else // Swallow scoped markers: -#define SCOPED_MARKER(markerName,group,userString) +#define SCOPED_MARKER(markerName,group,userString) #endif -#if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1) +#if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1) #define API_TRACE(...)\ {\ if (HIP_ATP_MARKER || (COMPILE_HIP_DB && HIP_TRACE_API)) {\ @@ -198,7 +198,7 @@ class ihipCtx_t; static const char *dbName [] = { - KNRM "hip-api", // not used, + KNRM "hip-api", // not used, KYEL "hip-sync", KCYN "hip-mem", KMAG "hip-copy1", @@ -214,9 +214,9 @@ static const char *dbName [] = fprintf (stderr, "%s", KNRM); \ }\ } -#else +#else /* Compile to empty code */ -#define tprintf(trace_level, ...) +#define tprintf(trace_level, ...) #endif @@ -228,7 +228,7 @@ class ihipException : public std::exception public: ihipException(hipError_t e) : _code(e) {}; - hipError_t _code; + hipError_t _code; }; @@ -246,7 +246,7 @@ const hipStream_t hipStreamNull = 0x0; enum ihipCommand_t { ihipCommandCopyH2H, - ihipCommandCopyH2D, + ihipCommandCopyH2D, ihipCommandCopyD2H, ihipCommandCopyD2D, ihipCommandCopyP2P, @@ -311,7 +311,7 @@ template class LockedAccessor { public: - LockedAccessor(T &criticalData, bool autoUnlock=true) : + LockedAccessor(T &criticalData, bool autoUnlock=true) : _criticalData(&criticalData), _autoUnlock(autoUnlock) @@ -319,14 +319,14 @@ public: _criticalData->_mutex.lock(); }; - ~LockedAccessor() + ~LockedAccessor() { if (_autoUnlock) { _criticalData->_mutex.unlock(); } } - void unlock() + void unlock() { _criticalData->_mutex.unlock(); } @@ -343,7 +343,7 @@ private: template struct LockedBase { - // Experts-only interface for explicit locking. + // Experts-only interface for explicit locking. // Most uses should use the lock-accessor. void lock() { _mutex.lock(); } void unlock() { _mutex.unlock(); } @@ -352,8 +352,8 @@ struct LockedBase { }; -template -class ihipStreamCriticalBase_t : public LockedBase +template +class ihipStreamCriticalBase_t : public LockedBase { public: ihipStreamCriticalBase_t() : @@ -389,15 +389,15 @@ public: int _signalCursor; SIGSEQNUM _oldest_live_sig_id; // oldest live seq_id, anything < this can be allocated. std::deque _signalPool; // Pool of signals for use by this stream. - uint32_t _signalCnt; // Count of inflight commands using signals from the signal pool. - // Each copy may use 1-2 signals depending on command transitions: + uint32_t _signalCnt; // Count of inflight commands using signals from the signal pool. + // Each copy may use 1-2 signals depending on command transitions: // 2 are required if a barrier packet is inserted. uint32_t _kernelCnt; // Count of inflight kernels in this stream. Reset at ::wait(). SIGSEQNUM _streamSigId; // Monotonically increasing unique signal id. }; -typedef ihipStreamCriticalBase_t ihipStreamCritical_t; +typedef ihipStreamCriticalBase_t ihipStreamCritical_t; typedef LockedAccessor LockedAccessor_StreamCrit_t; @@ -517,10 +517,10 @@ public: hsa_agent_t _hsaAgent; // hsa agent handle //! Number of compute units supported by the device: - unsigned _computeUnits; + unsigned _computeUnits; hipDeviceProp_t _props; // saved device properties. - - StagingBuffer *_stagingBuffer[2]; // one buffer for each direction. + + UnpinnedCopyEngine *_stagingBuffer[2]; // one buffer for each direction. int _isLargeBar; ihipCtx_t *_primaryCtx; @@ -538,8 +538,8 @@ template class ihipCtxCriticalBase_t : LockedBase { public: - ihipCtxCriticalBase_t(unsigned deviceCnt) : - _peerCnt(0) + ihipCtxCriticalBase_t(unsigned deviceCnt) : + _peerCnt(0) { _peerAgents = new hsa_agent_t[deviceCnt]; }; @@ -580,12 +580,12 @@ private: // Enabled peers have permissions to access the memory physically allocated on this device. std::list _peers; // list of enabled peer devices. uint32_t _peerCnt; // number of enabled peers - hsa_agent_t *_peerAgents; // efficient packed array of enabled agents (to use for allocations.) + hsa_agent_t *_peerAgents; // efficient packed array of enabled agents (to use for allocations.) private: void recomputePeerAgents(); }; // Note Mutex type Real/Fake selected based on CtxMutex -typedef ihipCtxCriticalBase_t ihipCtxCritical_t; +typedef ihipCtxCriticalBase_t ihipCtxCritical_t; // This type is used by functions that need access to the critical device structures. typedef LockedAccessor LockedAccessor_CtxCrit_t; @@ -594,19 +594,19 @@ typedef LockedAccessor LockedAccessor_CtxCrit_t; //============================================================================= //class ihipCtx_t: -// A HIP CTX (context) points at one of the existing devices and contains the streams, +// A HIP CTX (context) points at one of the existing devices and contains the streams, // peer-to-peer mappings, creation flags. Multiple contexts can point to the same // device. // class ihipCtx_t { public: // Functions: - ihipCtx_t(ihipDevice_t *device, unsigned deviceCnt, unsigned flags); // note: calls constructor for _criticalData + ihipCtx_t(ihipDevice_t *device, unsigned deviceCnt, unsigned flags); // note: calls constructor for _criticalData ~ihipCtx_t(); // Functions which read or write the critical data are named locked_. // ihipCtx_t does not use recursive locks so the ihip implementation must avoid calling a locked_ function from within a locked_ function. - // External functions which call several locked_ functions will acquire and release the lock for each function. if this occurs in + // External functions which call several locked_ functions will acquire and release the lock for each function. if this occurs in // performance-sensitive code we may want to refactor by adding non-locked functions and creating a new locked_ member function to call them all. void locked_addStream(ihipStream_t *s); void locked_removeStream(ihipStream_t *s); @@ -668,7 +668,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t); inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s) { os << "stream#"; - os << s.getDevice()->_deviceId;; + os << s.getDevice()->_deviceId;; os << '.'; os << s._id; return os; diff --git a/include/hcc_detail/staging_buffer.h b/include/hcc_detail/unpinned_copy_engine.h similarity index 93% rename from include/hcc_detail/staging_buffer.h rename to include/hcc_detail/unpinned_copy_engine.h index fe28a93e16..2dd7e15d28 100644 --- a/include/hcc_detail/staging_buffer.h +++ b/include/hcc_detail/unpinned_copy_engine.h @@ -37,12 +37,12 @@ THE SOFTWARE. // engine. This routine is under development. // // Staging buffer provides thread-safe access via a mutex. -struct StagingBuffer { +struct UnpinnedCopyEngine { static const int _max_buffers = 4; - StagingBuffer(hsa_agent_t hsaAgent,hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers,int thresholdH2D_directStaging,int thresholdH2D_stagingPinInPlace,int thresholdD2H) ; - ~StagingBuffer(); + UnpinnedCopyEngine(hsa_agent_t hsaAgent,hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers,int thresholdH2D_directStaging,int thresholdH2D_stagingPinInPlace,int thresholdD2H) ; + ~UnpinnedCopyEngine(); void CopyHostToDevice(int tempIndex,int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); void CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 82375c7754..0d85815d28 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -85,7 +85,7 @@ int HIP_DISABLE_HW_COPY_DEP = 0; std::once_flag hip_initialized; -// Array of pointers to devices. +// Array of pointers to devices. ihipDevice_t **g_deviceArray; @@ -106,7 +106,7 @@ thread_local int tls_defaultDeviceId = 0; // This is the implicit context used by all HIP commands. // It can be set by hipSetDevice or by the CTX manipulation commands: -thread_local ihipCtx_t *tls_defaultCtx; +thread_local ihipCtx_t *tls_defaultCtx; thread_local hipError_t tls_lastHipError = hipSuccess; @@ -550,7 +550,7 @@ void ihipCtxCriticalBase_t::addStream(ihipStream_t *stream) // ihipDevice_t //================================================================================================= ihipDevice_t::ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator &acc) : - _deviceId(deviceId), + _deviceId(deviceId), _acc(acc) { hsa_agent_t *agent = static_cast (acc.get_hsa_agent()); @@ -567,8 +567,8 @@ ihipDevice_t::ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerato initProperties(&_props); - _stagingBuffer[0] = new StagingBuffer(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS,HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING,HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE,HIP_D2H_MEM_TRANSFER_THRESHOLD); - _stagingBuffer[1] = new StagingBuffer(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS,HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING,HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE,HIP_D2H_MEM_TRANSFER_THRESHOLD); + _stagingBuffer[0] = new UnpinnedCopyEngine(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS,HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING,HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE,HIP_D2H_MEM_TRANSFER_THRESHOLD); + _stagingBuffer[1] = new UnpinnedCopyEngine(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS,HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING,HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE,HIP_D2H_MEM_TRANSFER_THRESHOLD); _primaryCtx = new ihipCtx_t(this, deviceCnt, hipDeviceMapHost); } @@ -1832,5 +1832,5 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a //// TODO - add identifier numbers for streams and devices to help with debugging. #if ONE_OBJECT_FILE -#include "staging_buffer.cpp" +#include "unpinned_copy_engine.cpp" #endif diff --git a/src/staging_buffer.cpp b/src/unpinned_copy_engine.cpp similarity index 95% rename from src/staging_buffer.cpp rename to src/unpinned_copy_engine.cpp index 9ce5722559..5501c66f9d 100644 --- a/src/staging_buffer.cpp +++ b/src/unpinned_copy_engine.cpp @@ -21,7 +21,7 @@ THE SOFTWARE. #include "hsa_ext_amd.h" -#include "hcc_detail/staging_buffer.h" +#include "hcc_detail/unpinned_copy_engine.h" #ifdef HIP_HCC #include "hcc_detail/hip_runtime.h" @@ -62,7 +62,7 @@ hsa_status_t findGlobalPool(hsa_amd_memory_pool_t pool, void* data) { } //------------------------------------------------------------------------------------------------- -StagingBuffer::StagingBuffer(hsa_agent_t hsaAgent, hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers, int thresholdH2DDirectStaging,int thresholdH2DStagingPinInPlace,int thresholdD2H) : +UnpinnedCopyEngine::UnpinnedCopyEngine(hsa_agent_t hsaAgent, hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers, int thresholdH2DDirectStaging,int thresholdH2DStagingPinInPlace,int thresholdD2H) : _hsaAgent(hsaAgent), _cpuAgent(cpuAgent), _bufferSize(bufferSize), @@ -93,7 +93,7 @@ StagingBuffer::StagingBuffer(hsa_agent_t hsaAgent, hsa_agent_t cpuAgent, size_t //--- -StagingBuffer::~StagingBuffer() +UnpinnedCopyEngine::~UnpinnedCopyEngine() { for (int i=0; i<_numBuffers; i++) { if (_pinnedStagingBuffer[i]) { @@ -112,7 +112,7 @@ StagingBuffer::~StagingBuffer() //IN: dst - dest pointer - must be accessible from host CPU. //IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _hsaAgent) //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. -void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +void UnpinnedCopyEngine::CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { std::lock_guard l (_copy_lock); @@ -166,7 +166,7 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_ //IN: dst - dest pointer - must be accessible from host CPU. //IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _hsaAgent) //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. -void StagingBuffer::CopyHostToDevice(int tempIndex,int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +void UnpinnedCopyEngine::CopyHostToDevice(int tempIndex,int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { if((tempIndex==1)&&(isLargeBar)&&(sizeBytes < _hipH2DTransferThresholdDirectOrStaging)){ memcpy(dst,src,sizeBytes); @@ -227,7 +227,7 @@ void StagingBuffer::CopyHostToDevice(int tempIndex,int isLargeBar,void* dst, con } -void StagingBuffer::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +void UnpinnedCopyEngine::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { std::lock_guard l (_copy_lock); @@ -272,7 +272,7 @@ void StagingBuffer::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_ //IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsaAgent). //IN: src - src pointer for copy. Must be accessible from host CPU. //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. -void StagingBuffer::CopyDeviceToHost(int tempIndex,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +void UnpinnedCopyEngine::CopyDeviceToHost(int tempIndex,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { if((tempIndex==1) && (sizeBytes> _hipD2HTransferThreshold)){ CopyDeviceToHostPinInPlace(dst, src, sizeBytes, waitFor); @@ -339,7 +339,7 @@ void StagingBuffer::CopyDeviceToHost(int tempIndex,void* dst, const void* src, s //IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsaAgent). //IN: src - src pointer for copy. Must be accessible from host CPU. //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. -void StagingBuffer::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* src, hsa_agent_t srcAgent, size_t sizeBytes, hsa_signal_t *waitFor) +void UnpinnedCopyEngine::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* src, hsa_agent_t srcAgent, size_t sizeBytes, hsa_signal_t *waitFor) { std::lock_guard l (_copy_lock);