Changed StagingBuffer class to UnpinnedCopyEngine
Change-Id: I1e212bfc8030dcf225ecf78fd7b23fda9b1de92f
This commit is contained in:
@@ -22,7 +22,7 @@ THE SOFTWARE.
|
||||
|
||||
#include <hc.hpp>
|
||||
#include "hip/hcc_detail/hip_util.h"
|
||||
#include "hip/hcc_detail/staging_buffer.h"
|
||||
#include "hip/hcc_detail/unpinned_copy_engine.h"
|
||||
|
||||
|
||||
#if defined(__HCC__) && (__hcc_workweek__ < 16186)
|
||||
@@ -69,7 +69,7 @@ extern int HIP_DISABLE_HW_COPY_DEP;
|
||||
//---
|
||||
//Extern tls
|
||||
extern thread_local int tls_defaultDeviceId;
|
||||
extern thread_local ihipCtx_t *tls_defaultCtx;
|
||||
extern thread_local ihipCtx_t *tls_defaultCtx;
|
||||
|
||||
extern thread_local hipError_t tls_lastHipError;
|
||||
|
||||
@@ -101,11 +101,11 @@ class ihipCtx_t;
|
||||
|
||||
#define CTX_THREAD_SAFE 1
|
||||
|
||||
// If FORCE_SAMEDIR_COPY_DEP=1, HIP runtime will add
|
||||
// If FORCE_SAMEDIR_COPY_DEP=1, HIP runtime will add
|
||||
// synchronization for copy commands in the same stream, regardless of command type.
|
||||
// If FORCE_SAMEDIR_COPY_DEP=0, data copies of the same kind (H2H, H2D, D2H, D2D) are assumed to be implicitly ordered.
|
||||
// ROCR runtime implementation currently provides this guarantee when using SDMA queues but not
|
||||
// when using shader queues.
|
||||
// ROCR runtime implementation currently provides this guarantee when using SDMA queues but not
|
||||
// when using shader queues.
|
||||
// TODO - measure if this matters for performance, in particular for back-to-back small copies.
|
||||
// If not, we can simplify the copy dependency tracking by collapsing to a single Copy type, and always forcing dependencies for copy commands.
|
||||
#define FORCE_SAMEDIR_COPY_DEP 1
|
||||
@@ -121,7 +121,7 @@ class ihipCtx_t;
|
||||
// 0x2 = prints a simple message with function name + return code when function exits.
|
||||
// 0x3 = print both.
|
||||
// Must be enabled at runtime with HIP_TRACE_API
|
||||
#define COMPILE_HIP_TRACE_API 0x3
|
||||
#define COMPILE_HIP_TRACE_API 0x3
|
||||
|
||||
|
||||
// Compile code that generates trace markers for CodeXL ATP at HIP function begin/end.
|
||||
@@ -141,13 +141,13 @@ class ihipCtx_t;
|
||||
#if COMPILE_HIP_ATP_MARKER
|
||||
#include "CXLActivityLogger.h"
|
||||
#define SCOPED_MARKER(markerName,group,userString) amdtScopedMarker(markerName, group, userString)
|
||||
#else
|
||||
#else
|
||||
// Swallow scoped markers:
|
||||
#define SCOPED_MARKER(markerName,group,userString)
|
||||
#define SCOPED_MARKER(markerName,group,userString)
|
||||
#endif
|
||||
|
||||
|
||||
#if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
|
||||
#if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
|
||||
#define API_TRACE(...)\
|
||||
{\
|
||||
if (HIP_ATP_MARKER || (COMPILE_HIP_DB && HIP_TRACE_API)) {\
|
||||
@@ -198,7 +198,7 @@ class ihipCtx_t;
|
||||
|
||||
static const char *dbName [] =
|
||||
{
|
||||
KNRM "hip-api", // not used,
|
||||
KNRM "hip-api", // not used,
|
||||
KYEL "hip-sync",
|
||||
KCYN "hip-mem",
|
||||
KMAG "hip-copy1",
|
||||
@@ -214,9 +214,9 @@ static const char *dbName [] =
|
||||
fprintf (stderr, "%s", KNRM); \
|
||||
}\
|
||||
}
|
||||
#else
|
||||
#else
|
||||
/* Compile to empty code */
|
||||
#define tprintf(trace_level, ...)
|
||||
#define tprintf(trace_level, ...)
|
||||
#endif
|
||||
|
||||
|
||||
@@ -228,7 +228,7 @@ class ihipException : public std::exception
|
||||
public:
|
||||
ihipException(hipError_t e) : _code(e) {};
|
||||
|
||||
hipError_t _code;
|
||||
hipError_t _code;
|
||||
};
|
||||
|
||||
|
||||
@@ -246,7 +246,7 @@ const hipStream_t hipStreamNull = 0x0;
|
||||
|
||||
enum ihipCommand_t {
|
||||
ihipCommandCopyH2H,
|
||||
ihipCommandCopyH2D,
|
||||
ihipCommandCopyH2D,
|
||||
ihipCommandCopyD2H,
|
||||
ihipCommandCopyD2D,
|
||||
ihipCommandCopyP2P,
|
||||
@@ -311,7 +311,7 @@ template<typename T>
|
||||
class LockedAccessor
|
||||
{
|
||||
public:
|
||||
LockedAccessor(T &criticalData, bool autoUnlock=true) :
|
||||
LockedAccessor(T &criticalData, bool autoUnlock=true) :
|
||||
_criticalData(&criticalData),
|
||||
_autoUnlock(autoUnlock)
|
||||
|
||||
@@ -319,14 +319,14 @@ public:
|
||||
_criticalData->_mutex.lock();
|
||||
};
|
||||
|
||||
~LockedAccessor()
|
||||
~LockedAccessor()
|
||||
{
|
||||
if (_autoUnlock) {
|
||||
_criticalData->_mutex.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
void unlock()
|
||||
void unlock()
|
||||
{
|
||||
_criticalData->_mutex.unlock();
|
||||
}
|
||||
@@ -343,7 +343,7 @@ private:
|
||||
template <typename MUTEX_TYPE>
|
||||
struct LockedBase {
|
||||
|
||||
// Experts-only interface for explicit locking.
|
||||
// Experts-only interface for explicit locking.
|
||||
// Most uses should use the lock-accessor.
|
||||
void lock() { _mutex.lock(); }
|
||||
void unlock() { _mutex.unlock(); }
|
||||
@@ -352,8 +352,8 @@ struct LockedBase {
|
||||
};
|
||||
|
||||
|
||||
template <typename MUTEX_TYPE>
|
||||
class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE>
|
||||
template <typename MUTEX_TYPE>
|
||||
class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE>
|
||||
{
|
||||
public:
|
||||
ihipStreamCriticalBase_t() :
|
||||
@@ -389,15 +389,15 @@ public:
|
||||
int _signalCursor;
|
||||
SIGSEQNUM _oldest_live_sig_id; // oldest live seq_id, anything < this can be allocated.
|
||||
std::deque<ihipSignal_t> _signalPool; // Pool of signals for use by this stream.
|
||||
uint32_t _signalCnt; // Count of inflight commands using signals from the signal pool.
|
||||
// Each copy may use 1-2 signals depending on command transitions:
|
||||
uint32_t _signalCnt; // Count of inflight commands using signals from the signal pool.
|
||||
// Each copy may use 1-2 signals depending on command transitions:
|
||||
// 2 are required if a barrier packet is inserted.
|
||||
uint32_t _kernelCnt; // Count of inflight kernels in this stream. Reset at ::wait().
|
||||
SIGSEQNUM _streamSigId; // Monotonically increasing unique signal id.
|
||||
};
|
||||
|
||||
|
||||
typedef ihipStreamCriticalBase_t<StreamMutex> ihipStreamCritical_t;
|
||||
typedef ihipStreamCriticalBase_t<StreamMutex> ihipStreamCritical_t;
|
||||
typedef LockedAccessor<ihipStreamCritical_t> LockedAccessor_StreamCrit_t;
|
||||
|
||||
|
||||
@@ -517,10 +517,10 @@ public:
|
||||
hsa_agent_t _hsaAgent; // hsa agent handle
|
||||
|
||||
//! Number of compute units supported by the device:
|
||||
unsigned _computeUnits;
|
||||
unsigned _computeUnits;
|
||||
hipDeviceProp_t _props; // saved device properties.
|
||||
|
||||
StagingBuffer *_stagingBuffer[2]; // one buffer for each direction.
|
||||
|
||||
UnpinnedCopyEngine *_stagingBuffer[2]; // one buffer for each direction.
|
||||
int _isLargeBar;
|
||||
|
||||
ihipCtx_t *_primaryCtx;
|
||||
@@ -538,8 +538,8 @@ template <typename MUTEX_TYPE>
|
||||
class ihipCtxCriticalBase_t : LockedBase<MUTEX_TYPE>
|
||||
{
|
||||
public:
|
||||
ihipCtxCriticalBase_t(unsigned deviceCnt) :
|
||||
_peerCnt(0)
|
||||
ihipCtxCriticalBase_t(unsigned deviceCnt) :
|
||||
_peerCnt(0)
|
||||
{
|
||||
_peerAgents = new hsa_agent_t[deviceCnt];
|
||||
};
|
||||
@@ -580,12 +580,12 @@ private:
|
||||
// Enabled peers have permissions to access the memory physically allocated on this device.
|
||||
std::list<ihipCtx_t*> _peers; // list of enabled peer devices.
|
||||
uint32_t _peerCnt; // number of enabled peers
|
||||
hsa_agent_t *_peerAgents; // efficient packed array of enabled agents (to use for allocations.)
|
||||
hsa_agent_t *_peerAgents; // efficient packed array of enabled agents (to use for allocations.)
|
||||
private:
|
||||
void recomputePeerAgents();
|
||||
};
|
||||
// Note Mutex type Real/Fake selected based on CtxMutex
|
||||
typedef ihipCtxCriticalBase_t<CtxMutex> ihipCtxCritical_t;
|
||||
typedef ihipCtxCriticalBase_t<CtxMutex> ihipCtxCritical_t;
|
||||
|
||||
// This type is used by functions that need access to the critical device structures.
|
||||
typedef LockedAccessor<ihipCtxCritical_t> LockedAccessor_CtxCrit_t;
|
||||
@@ -594,19 +594,19 @@ typedef LockedAccessor<ihipCtxCritical_t> LockedAccessor_CtxCrit_t;
|
||||
|
||||
//=============================================================================
|
||||
//class ihipCtx_t:
|
||||
// A HIP CTX (context) points at one of the existing devices and contains the streams,
|
||||
// A HIP CTX (context) points at one of the existing devices and contains the streams,
|
||||
// peer-to-peer mappings, creation flags. Multiple contexts can point to the same
|
||||
// device.
|
||||
//
|
||||
class ihipCtx_t
|
||||
{
|
||||
public: // Functions:
|
||||
ihipCtx_t(ihipDevice_t *device, unsigned deviceCnt, unsigned flags); // note: calls constructor for _criticalData
|
||||
ihipCtx_t(ihipDevice_t *device, unsigned deviceCnt, unsigned flags); // note: calls constructor for _criticalData
|
||||
~ihipCtx_t();
|
||||
|
||||
// Functions which read or write the critical data are named locked_.
|
||||
// ihipCtx_t does not use recursive locks so the ihip implementation must avoid calling a locked_ function from within a locked_ function.
|
||||
// External functions which call several locked_ functions will acquire and release the lock for each function. If this occurs in
|
||||
// External functions which call several locked_ functions will acquire and release the lock for each function. If this occurs in
|
||||
// performance-sensitive code we may want to refactor by adding non-locked functions and creating a new locked_ member function to call them all.
|
||||
void locked_addStream(ihipStream_t *s);
|
||||
void locked_removeStream(ihipStream_t *s);
|
||||
@@ -668,7 +668,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t);
|
||||
inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s)
|
||||
{
|
||||
os << "stream#";
|
||||
os << s.getDevice()->_deviceId;;
|
||||
os << s.getDevice()->_deviceId;;
|
||||
os << '.';
|
||||
os << s._id;
|
||||
return os;
|
||||
|
||||
@@ -37,12 +37,12 @@ THE SOFTWARE.
|
||||
// engine. This routine is under development.
|
||||
//
|
||||
// Staging buffer provides thread-safe access via a mutex.
|
||||
struct StagingBuffer {
|
||||
struct UnpinnedCopyEngine {
|
||||
|
||||
static const int _max_buffers = 4;
|
||||
|
||||
StagingBuffer(hsa_agent_t hsaAgent,hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers,int thresholdH2D_directStaging,int thresholdH2D_stagingPinInPlace,int thresholdD2H) ;
|
||||
~StagingBuffer();
|
||||
UnpinnedCopyEngine(hsa_agent_t hsaAgent,hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers,int thresholdH2D_directStaging,int thresholdH2D_stagingPinInPlace,int thresholdD2H) ;
|
||||
~UnpinnedCopyEngine();
|
||||
|
||||
void CopyHostToDevice(int tempIndex,int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor);
|
||||
void CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor);
|
||||
+6
-6
@@ -85,7 +85,7 @@ int HIP_DISABLE_HW_COPY_DEP = 0;
|
||||
|
||||
std::once_flag hip_initialized;
|
||||
|
||||
// Array of pointers to devices.
|
||||
// Array of pointers to devices.
|
||||
ihipDevice_t **g_deviceArray;
|
||||
|
||||
|
||||
@@ -106,7 +106,7 @@ thread_local int tls_defaultDeviceId = 0;
|
||||
|
||||
// This is the implicit context used by all HIP commands.
|
||||
// It can be set by hipSetDevice or by the CTX manipulation commands:
|
||||
thread_local ihipCtx_t *tls_defaultCtx;
|
||||
thread_local ihipCtx_t *tls_defaultCtx;
|
||||
|
||||
thread_local hipError_t tls_lastHipError = hipSuccess;
|
||||
|
||||
@@ -550,7 +550,7 @@ void ihipCtxCriticalBase_t<CtxMutex>::addStream(ihipStream_t *stream)
|
||||
// ihipDevice_t
|
||||
//=================================================================================================
|
||||
ihipDevice_t::ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator &acc) :
|
||||
_deviceId(deviceId),
|
||||
_deviceId(deviceId),
|
||||
_acc(acc)
|
||||
{
|
||||
hsa_agent_t *agent = static_cast<hsa_agent_t*> (acc.get_hsa_agent());
|
||||
@@ -567,8 +567,8 @@ ihipDevice_t::ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerato
|
||||
|
||||
initProperties(&_props);
|
||||
|
||||
_stagingBuffer[0] = new StagingBuffer(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS,HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING,HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE,HIP_D2H_MEM_TRANSFER_THRESHOLD);
|
||||
_stagingBuffer[1] = new StagingBuffer(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS,HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING,HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE,HIP_D2H_MEM_TRANSFER_THRESHOLD);
|
||||
_stagingBuffer[0] = new UnpinnedCopyEngine(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS,HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING,HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE,HIP_D2H_MEM_TRANSFER_THRESHOLD);
|
||||
_stagingBuffer[1] = new UnpinnedCopyEngine(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS,HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING,HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE,HIP_D2H_MEM_TRANSFER_THRESHOLD);
|
||||
|
||||
_primaryCtx = new ihipCtx_t(this, deviceCnt, hipDeviceMapHost);
|
||||
}
|
||||
@@ -1832,5 +1832,5 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a
|
||||
//// TODO - add identifier numbers for streams and devices to help with debugging.
|
||||
|
||||
#if ONE_OBJECT_FILE
|
||||
#include "staging_buffer.cpp"
|
||||
#include "unpinned_copy_engine.cpp"
|
||||
#endif
|
||||
|
||||
@@ -21,7 +21,7 @@ THE SOFTWARE.
|
||||
|
||||
#include "hsa_ext_amd.h"
|
||||
|
||||
#include "hcc_detail/staging_buffer.h"
|
||||
#include "hcc_detail/unpinned_copy_engine.h"
|
||||
|
||||
#ifdef HIP_HCC
|
||||
#include "hcc_detail/hip_runtime.h"
|
||||
@@ -62,7 +62,7 @@ hsa_status_t findGlobalPool(hsa_amd_memory_pool_t pool, void* data) {
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------------------------------------------
|
||||
StagingBuffer::StagingBuffer(hsa_agent_t hsaAgent, hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers, int thresholdH2DDirectStaging,int thresholdH2DStagingPinInPlace,int thresholdD2H) :
|
||||
UnpinnedCopyEngine::UnpinnedCopyEngine(hsa_agent_t hsaAgent, hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers, int thresholdH2DDirectStaging,int thresholdH2DStagingPinInPlace,int thresholdD2H) :
|
||||
_hsaAgent(hsaAgent),
|
||||
_cpuAgent(cpuAgent),
|
||||
_bufferSize(bufferSize),
|
||||
@@ -93,7 +93,7 @@ StagingBuffer::StagingBuffer(hsa_agent_t hsaAgent, hsa_agent_t cpuAgent, size_t
|
||||
|
||||
|
||||
//---
|
||||
StagingBuffer::~StagingBuffer()
|
||||
UnpinnedCopyEngine::~UnpinnedCopyEngine()
|
||||
{
|
||||
for (int i=0; i<_numBuffers; i++) {
|
||||
if (_pinnedStagingBuffer[i]) {
|
||||
@@ -112,7 +112,7 @@ StagingBuffer::~StagingBuffer()
|
||||
//IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsaAgent).
|
||||
//IN: src - src pointer for copy. Must be accessible from host CPU.
|
||||
//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency.
|
||||
void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
|
||||
void UnpinnedCopyEngine::CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
|
||||
{
|
||||
std::lock_guard<std::mutex> l (_copy_lock);
|
||||
|
||||
@@ -166,7 +166,7 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_
|
||||
//IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsaAgent).
|
||||
//IN: src - src pointer for copy. Must be accessible from host CPU.
|
||||
//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency.
|
||||
void StagingBuffer::CopyHostToDevice(int tempIndex,int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
|
||||
void UnpinnedCopyEngine::CopyHostToDevice(int tempIndex,int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
|
||||
{
|
||||
if((tempIndex==1)&&(isLargeBar)&&(sizeBytes < _hipH2DTransferThresholdDirectOrStaging)){
|
||||
memcpy(dst,src,sizeBytes);
|
||||
@@ -227,7 +227,7 @@ void StagingBuffer::CopyHostToDevice(int tempIndex,int isLargeBar,void* dst, con
|
||||
}
|
||||
|
||||
|
||||
void StagingBuffer::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
|
||||
void UnpinnedCopyEngine::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
|
||||
{
|
||||
std::lock_guard<std::mutex> l (_copy_lock);
|
||||
|
||||
@@ -272,7 +272,7 @@ void StagingBuffer::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_
|
||||
//IN: dst - dest pointer - must be accessible from host CPU.
|
||||
//IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _hsaAgent)
|
||||
//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency.
|
||||
void StagingBuffer::CopyDeviceToHost(int tempIndex,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
|
||||
void UnpinnedCopyEngine::CopyDeviceToHost(int tempIndex,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
|
||||
{
|
||||
if((tempIndex==1) && (sizeBytes> _hipD2HTransferThreshold)){
|
||||
CopyDeviceToHostPinInPlace(dst, src, sizeBytes, waitFor);
|
||||
@@ -339,7 +339,7 @@ void StagingBuffer::CopyDeviceToHost(int tempIndex,void* dst, const void* src, s
|
||||
//IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsaAgent).
|
||||
//IN: src - src pointer for copy. Must be accessible from host CPU.
|
||||
//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency.
|
||||
void StagingBuffer::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* src, hsa_agent_t srcAgent, size_t sizeBytes, hsa_signal_t *waitFor)
|
||||
void UnpinnedCopyEngine::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* src, hsa_agent_t srcAgent, size_t sizeBytes, hsa_signal_t *waitFor)
|
||||
{
|
||||
std::lock_guard<std::mutex> l (_copy_lock);
|
||||
|
||||
مرجع در شماره جدید
Block a user