Changed StagingBuffer class to UnpinnedCopyEngine

Change-Id: I1e212bfc8030dcf225ecf78fd7b23fda9b1de92f
This commit is contained in:
Rahul Garg
2016-08-09 21:29:42 +05:30
والد 023b1ecf33
کامیت 2ac93c340d
4فایلهای تغییر یافته به همراه51 افزوده شده و 51 حذف شده
+34 -34
مشاهده پرونده
@@ -22,7 +22,7 @@ THE SOFTWARE.
#include <hc.hpp>
#include "hip/hcc_detail/hip_util.h"
#include "hip/hcc_detail/staging_buffer.h"
#include "hip/hcc_detail/unpinned_copy_engine.h"
#if defined(__HCC__) && (__hcc_workweek__ < 16186)
@@ -69,7 +69,7 @@ extern int HIP_DISABLE_HW_COPY_DEP;
//---
//Extern tls
extern thread_local int tls_defaultDeviceId;
extern thread_local ihipCtx_t *tls_defaultCtx;
extern thread_local ihipCtx_t *tls_defaultCtx;
extern thread_local hipError_t tls_lastHipError;
@@ -101,11 +101,11 @@ class ihipCtx_t;
#define CTX_THREAD_SAFE 1
// If FORCE_SAMEDIR_COPY_DEP=1, HIP runtime will add
// If FORCE_SAMEDIR_COPY_DEP=1, HIP runtime will add
// synchronization for copy commands in the same stream, regardless of command type.
// If FORCE_SAMEDIR_COPY_DEP=0, data copies of the same kind (H2H, H2D, D2H, D2D) are assumed to be implicitly ordered.
// ROCR runtime implementation currently provides this guarantee when using SDMA queues but not
// when using shader queues.
// ROCR runtime implementation currently provides this guarantee when using SDMA queues but not
// when using shader queues.
// TODO - measure if this matters for performance, in particular for back-to-back small copies.
// If not, we can simplify the copy dependency tracking by collapsing to a single Copy type, and always forcing dependencies for copy commands.
#define FORCE_SAMEDIR_COPY_DEP 1
@@ -121,7 +121,7 @@ class ihipCtx_t;
// 0x2 = prints a simple message with function name + return code when function exits.
// 0x3 = print both.
// Must be enabled at runtime with HIP_TRACE_API
#define COMPILE_HIP_TRACE_API 0x3
#define COMPILE_HIP_TRACE_API 0x3
// Compile code that generates trace markers for CodeXL ATP at HIP function begin/end.
@@ -141,13 +141,13 @@ class ihipCtx_t;
#if COMPILE_HIP_ATP_MARKER
#include "CXLActivityLogger.h"
#define SCOPED_MARKER(markerName,group,userString) amdtScopedMarker(markerName, group, userString)
#else
#else
// Swallow scoped markers:
#define SCOPED_MARKER(markerName,group,userString)
#define SCOPED_MARKER(markerName,group,userString)
#endif
#if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
#if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
#define API_TRACE(...)\
{\
if (HIP_ATP_MARKER || (COMPILE_HIP_DB && HIP_TRACE_API)) {\
@@ -198,7 +198,7 @@ class ihipCtx_t;
static const char *dbName [] =
{
KNRM "hip-api", // not used,
KNRM "hip-api", // not used,
KYEL "hip-sync",
KCYN "hip-mem",
KMAG "hip-copy1",
@@ -214,9 +214,9 @@ static const char *dbName [] =
fprintf (stderr, "%s", KNRM); \
}\
}
#else
#else
/* Compile to empty code */
#define tprintf(trace_level, ...)
#define tprintf(trace_level, ...)
#endif
@@ -228,7 +228,7 @@ class ihipException : public std::exception
public:
ihipException(hipError_t e) : _code(e) {};
hipError_t _code;
hipError_t _code;
};
@@ -246,7 +246,7 @@ const hipStream_t hipStreamNull = 0x0;
enum ihipCommand_t {
ihipCommandCopyH2H,
ihipCommandCopyH2D,
ihipCommandCopyH2D,
ihipCommandCopyD2H,
ihipCommandCopyD2D,
ihipCommandCopyP2P,
@@ -311,7 +311,7 @@ template<typename T>
class LockedAccessor
{
public:
LockedAccessor(T &criticalData, bool autoUnlock=true) :
LockedAccessor(T &criticalData, bool autoUnlock=true) :
_criticalData(&criticalData),
_autoUnlock(autoUnlock)
@@ -319,14 +319,14 @@ public:
_criticalData->_mutex.lock();
};
~LockedAccessor()
~LockedAccessor()
{
if (_autoUnlock) {
_criticalData->_mutex.unlock();
}
}
void unlock()
void unlock()
{
_criticalData->_mutex.unlock();
}
@@ -343,7 +343,7 @@ private:
template <typename MUTEX_TYPE>
struct LockedBase {
// Experts-only interface for explicit locking.
// Experts-only interface for explicit locking.
// Most uses should use the lock-accessor.
void lock() { _mutex.lock(); }
void unlock() { _mutex.unlock(); }
@@ -352,8 +352,8 @@ struct LockedBase {
};
template <typename MUTEX_TYPE>
class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE>
template <typename MUTEX_TYPE>
class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE>
{
public:
ihipStreamCriticalBase_t() :
@@ -389,15 +389,15 @@ public:
int _signalCursor;
SIGSEQNUM _oldest_live_sig_id; // oldest live seq_id, anything < this can be allocated.
std::deque<ihipSignal_t> _signalPool; // Pool of signals for use by this stream.
uint32_t _signalCnt; // Count of inflight commands using signals from the signal pool.
// Each copy may use 1-2 signals depending on command transitions:
uint32_t _signalCnt; // Count of inflight commands using signals from the signal pool.
// Each copy may use 1-2 signals depending on command transitions:
// 2 are required if a barrier packet is inserted.
uint32_t _kernelCnt; // Count of inflight kernels in this stream. Reset at ::wait().
SIGSEQNUM _streamSigId; // Monotonically increasing unique signal id.
};
typedef ihipStreamCriticalBase_t<StreamMutex> ihipStreamCritical_t;
typedef ihipStreamCriticalBase_t<StreamMutex> ihipStreamCritical_t;
typedef LockedAccessor<ihipStreamCritical_t> LockedAccessor_StreamCrit_t;
@@ -517,10 +517,10 @@ public:
hsa_agent_t _hsaAgent; // hsa agent handle
//! Number of compute units supported by the device:
unsigned _computeUnits;
unsigned _computeUnits;
hipDeviceProp_t _props; // saved device properties.
StagingBuffer *_stagingBuffer[2]; // one buffer for each direction.
UnpinnedCopyEngine *_stagingBuffer[2]; // one buffer for each direction.
int _isLargeBar;
ihipCtx_t *_primaryCtx;
@@ -538,8 +538,8 @@ template <typename MUTEX_TYPE>
class ihipCtxCriticalBase_t : LockedBase<MUTEX_TYPE>
{
public:
ihipCtxCriticalBase_t(unsigned deviceCnt) :
_peerCnt(0)
ihipCtxCriticalBase_t(unsigned deviceCnt) :
_peerCnt(0)
{
_peerAgents = new hsa_agent_t[deviceCnt];
};
@@ -580,12 +580,12 @@ private:
// Enabled peers have permissions to access the memory physically allocated on this device.
std::list<ihipCtx_t*> _peers; // list of enabled peer devices.
uint32_t _peerCnt; // number of enabled peers
hsa_agent_t *_peerAgents; // efficient packed array of enabled agents (to use for allocations.)
hsa_agent_t *_peerAgents; // efficient packed array of enabled agents (to use for allocations.)
private:
void recomputePeerAgents();
};
// Note Mutex type Real/Fake selected based on CtxMutex
typedef ihipCtxCriticalBase_t<CtxMutex> ihipCtxCritical_t;
typedef ihipCtxCriticalBase_t<CtxMutex> ihipCtxCritical_t;
// This type is used by functions that need access to the critical device structures.
typedef LockedAccessor<ihipCtxCritical_t> LockedAccessor_CtxCrit_t;
@@ -594,19 +594,19 @@ typedef LockedAccessor<ihipCtxCritical_t> LockedAccessor_CtxCrit_t;
//=============================================================================
//class ihipCtx_t:
// A HIP CTX (context) points at one of the existing devices and contains the streams,
// A HIP CTX (context) points at one of the existing devices and contains the streams,
// peer-to-peer mappings, creation flags. Multiple contexts can point to the same
// device.
//
class ihipCtx_t
{
public: // Functions:
ihipCtx_t(ihipDevice_t *device, unsigned deviceCnt, unsigned flags); // note: calls constructor for _criticalData
ihipCtx_t(ihipDevice_t *device, unsigned deviceCnt, unsigned flags); // note: calls constructor for _criticalData
~ihipCtx_t();
// Functions which read or write the critical data are named locked_.
// ihipCtx_t does not use recursive locks so the ihip implementation must avoid calling a locked_ function from within a locked_ function.
// External functions which call several locked_ functions will acquire and release the lock for each function. If this occurs in
// External functions which call several locked_ functions will acquire and release the lock for each function. If this occurs in
// performance-sensitive code we may want to refactor by adding non-locked functions and creating a new locked_ member function to call them all.
void locked_addStream(ihipStream_t *s);
void locked_removeStream(ihipStream_t *s);
@@ -668,7 +668,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t);
inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s)
{
os << "stream#";
os << s.getDevice()->_deviceId;;
os << s.getDevice()->_deviceId;;
os << '.';
os << s._id;
return os;
@@ -37,12 +37,12 @@ THE SOFTWARE.
// engine. This routine is under development.
//
// Staging buffer provides thread-safe access via a mutex.
struct StagingBuffer {
struct UnpinnedCopyEngine {
static const int _max_buffers = 4;
StagingBuffer(hsa_agent_t hsaAgent,hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers,int thresholdH2D_directStaging,int thresholdH2D_stagingPinInPlace,int thresholdD2H) ;
~StagingBuffer();
UnpinnedCopyEngine(hsa_agent_t hsaAgent,hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers,int thresholdH2D_directStaging,int thresholdH2D_stagingPinInPlace,int thresholdD2H) ;
~UnpinnedCopyEngine();
void CopyHostToDevice(int tempIndex,int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor);
void CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor);
+6 -6
مشاهده پرونده
@@ -85,7 +85,7 @@ int HIP_DISABLE_HW_COPY_DEP = 0;
std::once_flag hip_initialized;
// Array of pointers to devices.
// Array of pointers to devices.
ihipDevice_t **g_deviceArray;
@@ -106,7 +106,7 @@ thread_local int tls_defaultDeviceId = 0;
// This is the implicit context used by all HIP commands.
// It can be set by hipSetDevice or by the CTX manipulation commands:
thread_local ihipCtx_t *tls_defaultCtx;
thread_local ihipCtx_t *tls_defaultCtx;
thread_local hipError_t tls_lastHipError = hipSuccess;
@@ -550,7 +550,7 @@ void ihipCtxCriticalBase_t<CtxMutex>::addStream(ihipStream_t *stream)
// ihipDevice_t
//=================================================================================================
ihipDevice_t::ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator &acc) :
_deviceId(deviceId),
_deviceId(deviceId),
_acc(acc)
{
hsa_agent_t *agent = static_cast<hsa_agent_t*> (acc.get_hsa_agent());
@@ -567,8 +567,8 @@ ihipDevice_t::ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerato
initProperties(&_props);
_stagingBuffer[0] = new StagingBuffer(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS,HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING,HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE,HIP_D2H_MEM_TRANSFER_THRESHOLD);
_stagingBuffer[1] = new StagingBuffer(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS,HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING,HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE,HIP_D2H_MEM_TRANSFER_THRESHOLD);
_stagingBuffer[0] = new UnpinnedCopyEngine(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS,HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING,HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE,HIP_D2H_MEM_TRANSFER_THRESHOLD);
_stagingBuffer[1] = new UnpinnedCopyEngine(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS,HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING,HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE,HIP_D2H_MEM_TRANSFER_THRESHOLD);
_primaryCtx = new ihipCtx_t(this, deviceCnt, hipDeviceMapHost);
}
@@ -1832,5 +1832,5 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a
//// TODO - add identifier numbers for streams and devices to help with debugging.
#if ONE_OBJECT_FILE
#include "staging_buffer.cpp"
#include "unpinned_copy_engine.cpp"
#endif
@@ -21,7 +21,7 @@ THE SOFTWARE.
#include "hsa_ext_amd.h"
#include "hcc_detail/staging_buffer.h"
#include "hcc_detail/unpinned_copy_engine.h"
#ifdef HIP_HCC
#include "hcc_detail/hip_runtime.h"
@@ -62,7 +62,7 @@ hsa_status_t findGlobalPool(hsa_amd_memory_pool_t pool, void* data) {
}
//-------------------------------------------------------------------------------------------------
StagingBuffer::StagingBuffer(hsa_agent_t hsaAgent, hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers, int thresholdH2DDirectStaging,int thresholdH2DStagingPinInPlace,int thresholdD2H) :
UnpinnedCopyEngine::UnpinnedCopyEngine(hsa_agent_t hsaAgent, hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers, int thresholdH2DDirectStaging,int thresholdH2DStagingPinInPlace,int thresholdD2H) :
_hsaAgent(hsaAgent),
_cpuAgent(cpuAgent),
_bufferSize(bufferSize),
@@ -93,7 +93,7 @@ StagingBuffer::StagingBuffer(hsa_agent_t hsaAgent, hsa_agent_t cpuAgent, size_t
//---
StagingBuffer::~StagingBuffer()
UnpinnedCopyEngine::~UnpinnedCopyEngine()
{
for (int i=0; i<_numBuffers; i++) {
if (_pinnedStagingBuffer[i]) {
@@ -112,7 +112,7 @@ StagingBuffer::~StagingBuffer()
//IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsaAgent).
//IN: src - src pointer for copy. Must be accessible from host CPU.
//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency.
void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
void UnpinnedCopyEngine::CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
{
std::lock_guard<std::mutex> l (_copy_lock);
@@ -166,7 +166,7 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_
//IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsaAgent).
//IN: src - src pointer for copy. Must be accessible from host CPU.
//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency.
void StagingBuffer::CopyHostToDevice(int tempIndex,int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
void UnpinnedCopyEngine::CopyHostToDevice(int tempIndex,int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
{
if((tempIndex==1)&&(isLargeBar)&&(sizeBytes < _hipH2DTransferThresholdDirectOrStaging)){
memcpy(dst,src,sizeBytes);
@@ -227,7 +227,7 @@ void StagingBuffer::CopyHostToDevice(int tempIndex,int isLargeBar,void* dst, con
}
void StagingBuffer::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
void UnpinnedCopyEngine::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
{
std::lock_guard<std::mutex> l (_copy_lock);
@@ -272,7 +272,7 @@ void StagingBuffer::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_
//IN: dst - dest pointer - must be accessible from host CPU.
//IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _hsaAgent).
//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency.
void StagingBuffer::CopyDeviceToHost(int tempIndex,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
void UnpinnedCopyEngine::CopyDeviceToHost(int tempIndex,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
{
if((tempIndex==1) && (sizeBytes> _hipD2HTransferThreshold)){
CopyDeviceToHostPinInPlace(dst, src, sizeBytes, waitFor);
@@ -339,7 +339,7 @@ void StagingBuffer::CopyDeviceToHost(int tempIndex,void* dst, const void* src, s
//IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsaAgent).
//IN: src - src pointer for copy. Must be accessible from host CPU.
//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency.
void StagingBuffer::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* src, hsa_agent_t srcAgent, size_t sizeBytes, hsa_signal_t *waitFor)
void UnpinnedCopyEngine::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* src, hsa_agent_t srcAgent, size_t sizeBytes, hsa_signal_t *waitFor)
{
std::lock_guard<std::mutex> l (_copy_lock);