Refactor staging buffer and sync copies.

- refactor staging buffer to operate on hsa* data structures not
  hc::accelerator.
- use hsa_memory_allocate to allocate staging buffers rather than
  am_alloc.
- Refactor device reset with single member function.  Don't reallocate
  staging buffers on reset.
- Properly track dependencies based on command type.  Add new deps for
  H2D and D2D rather than overloading H2D.
Этот коммит содержится в:
Ben Sander
2016-03-17 20:09:10 -05:00
родитель 1b7cc7d921
Коммит 3b45e064f9
3 изменённых файлов: 57 добавлений и 51 удалений
+1 -2
Просмотреть файл
@@ -8,7 +8,7 @@ struct StagingBuffer {
static const int _max_buffers = 4;
StagingBuffer(hc::accelerator &acc, size_t bufferSize, int numBuffers) ;
StagingBuffer(hsa_agent_t hsaAgent, hsa_region_t systemRegion, size_t bufferSize, int numBuffers) ;
~StagingBuffer();
void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor);
@@ -19,7 +19,6 @@ struct StagingBuffer {
private:
hc::accelerator &_acc;
hsa_agent_t _hsa_agent;
size_t _bufferSize; // Size of the buffers.
int _numBuffers;
+51 -40
Просмотреть файл
@@ -85,11 +85,13 @@ int HIP_DISABLE_BIDIR_MEMCPY = 0;
// Stream functions will acquire a mutex before entering critical sections.
#define STREAM_THREAD_SAFE 1
// If FORCE_SAMEDIR_COPY_DEP=1 , HIP runtime will add
// synchronization for sequential commands in the same stream.
// If FORCE_SAMEDIR_COPY_DEP=0 data copies in the same direction are assumed to be correctly ordered.
// If FORCE_COPY_DEP=1 , HIP runtime will add
// synchronization for copy commands in the same stream, regardless of command type.
// If FORCE_COPY_DEP=0 data copies of the same kind (H2H, H2D, D2H, D2D) are assumed to be implicitly ordered.
// ROCR runtime implementation currently provides this guarantee when using SDMA queues but not
// when using shader queues.
// TODO - measure if this matters for performance, in particular for back-to-back small copies.
// If not, we can simplify the copy dependency tracking by collapsing to a single Copy type, and always forcing dependencies for copy commands.
#define FORCE_SAMEDIR_COPY_DEP 1
@@ -161,13 +163,15 @@ struct ihipDevice_t;
enum ihipCommand_t {
ihipCommandKernel,
ihipCommandCopyH2D,
ihipCommandCopyH2H,
ihipCommandCopyH2D,
ihipCommandCopyD2H,
ihipCommandCopyD2D,
ihipCommandKernel,
};
const char* ihipCommandName[] = {
"Kernel", "CopyH2D", "CopyD2H"
"CopyH2H", "CopyH2D", "CopyD2H", "CopyD2D", "Kernel"
};
@@ -227,7 +231,6 @@ public:
inline void postKernelCommand(hc::completion_future &kernel_future);
inline int preCopyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType);
inline void postCopyCommand();
inline void reclaimSignals_ts(SIGSEQNUM sigNum);
inline void wait();
@@ -629,11 +632,6 @@ int ihipStream_t::preCopyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSigna
}
//---
inline void ihipStream_t::postCopyCommand()
{
//_mutex.unlock();
}
//=================================================================================================
@@ -642,8 +640,12 @@ inline void ihipStream_t::postCopyCommand()
//Device may be reset multiple times, and may be reset after init.
void ihipDevice_t::reset()
{
_staging_buffer[0] = new StagingBuffer(_acc, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS);
_staging_buffer[1] = new StagingBuffer(_acc, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS);
// Reset and remove streams:
_streams.clear();
// Reset and release all memory stored in the tracker:
am_memtracker_reset(_acc);
};
@@ -672,7 +674,11 @@ void ihipDevice_t::init(unsigned device_index, hc::accelerator acc)
hsa_signal_create(0, 0, NULL, &_copy_signal);
this->reset();
hsa_region_t *pinnedHostRegion;
pinnedHostRegion = static_cast<hsa_region_t*>(_acc.get_hsa_am_system_region());
_staging_buffer[0] = new StagingBuffer(_hsa_agent, *pinnedHostRegion, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS);
_staging_buffer[1] = new StagingBuffer(_hsa_agent, *pinnedHostRegion, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS);
};
@@ -686,6 +692,7 @@ ihipDevice_t::~ihipDevice_t()
for (int i=0; i<2; i++) {
if (_staging_buffer[i]) {
delete _staging_buffer[i];
_staging_buffer[i] = NULL;
}
}
hsa_signal_destroy(_copy_signal);
@@ -1343,21 +1350,17 @@ hipError_t hipDeviceReset(void)
// It could benefit from KFD support to perform a more "nuclear" clean that would include any associated kernel resources and page table entries.
//---
//Wait for pending activity to complete?
//TODO - check if this is required behavior:
for (auto streamI=device->_streams.begin(); streamI!=device->_streams.end(); streamI++) {
ihipStream_t *stream = *streamI;
stream->wait();
}
// Reset and remove streams:
device->_streams.clear();
if (device) {
am_memtracker_reset(device->_acc);
device->reset(); // re-allocate required resources.
//---
//Wait for pending activity to complete?
//TODO - check if this is required behavior:
for (auto streamI=device->_streams.begin(); streamI!=device->_streams.end(); streamI++) {
ihipStream_t *stream = *streamI;
stream->wait();
}
// Release device resources (streams and memory):
device->reset();
}
return ihipLogStatus(hipSuccess);
@@ -2249,10 +2252,10 @@ void ihipStream_t::ihipSyncCopy(void* dst, const void* src, size_t sizeBytes, hi
}
hsa_signal_t depSignal;
int depSignalCnt = this->preCopyCommand(NULL, &depSignal, ihipCommandCopyH2D);
if ((kind == hipMemcpyHostToDevice) && (srcNotTracked)) {
int depSignalCnt = this->preCopyCommand(NULL, &depSignal, ihipCommandCopyH2D);
if (HIP_STAGING_BUFFERS) {
std::lock_guard<std::mutex> l (device->_copy_lock[0]);
tprintf(DB_COPY1, "D2H && dstNotTracked: staged copy H2D dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
@@ -2263,7 +2266,7 @@ void ihipStream_t::ihipSyncCopy(void* dst, const void* src, size_t sizeBytes, hi
device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL);
}
// The copy waits for inputs and then completes before returning.
// The copy waits for inputs and then completes before returning so can reset queue to empty:
this->resetToEmpty();
} else {
// TODO - remove, slow path.
@@ -2271,17 +2274,23 @@ void ihipStream_t::ihipSyncCopy(void* dst, const void* src, size_t sizeBytes, hi
hc::am_copy(dst, src, sizeBytes);
}
} else if ((kind == hipMemcpyDeviceToHost) && (dstNotTracked)) {
int depSignalCnt = this->preCopyCommand(NULL, &depSignal, ihipCommandCopyD2H);
if (HIP_STAGING_BUFFERS) {
tprintf(DB_COPY1, "D2H && dstNotTracked: staged copy D2H dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
std::lock_guard<std::mutex> l (device->_copy_lock[HIP_DISABLE_BIDIR_MEMCPY ? 0:1]);
//printf ("staged-copy- read dep signals\n");
device->_staging_buffer[1]->CopyDeviceToHost(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL);
// The copy waits for inputs and then completes before returning so can reset queue to empty:
this->resetToEmpty();
} else {
// TODO - remove, slow path.
tprintf(DB_COPY1, "D2H && dstNotTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
hc::am_copy(dst, src, sizeBytes);
}
} else if (kind == hipMemcpyHostToHost) { // TODO-refactor.
} else if (kind == hipMemcpyHostToHost) {
int depSignalCnt = this->preCopyCommand(NULL, &depSignal, ihipCommandCopyH2H);
if (depSignalCnt) {
// host waits before doing host memory copy.
@@ -2291,14 +2300,16 @@ void ihipStream_t::ihipSyncCopy(void* dst, const void* src, size_t sizeBytes, hi
memcpy(dst, src, sizeBytes);
} else {
ihipCommand_t copyType;
if ((kind == hipMemcpyHostToDevice) || (kind == hipMemcpyDeviceToDevice)) {
copyType = ihipCommandCopyH2D;
} else if (kind == hipMemcpyDeviceToHost) {
copyType = ihipCommandCopyD2H;
} else {
throw ihipException(hipErrorInvalidMemcpyDirection);
}
// If not special case - these can all be handled by the hsa async copy:
ihipCommand_t commandType;
switch (kind) {
case hipMemcpyHostToHost : commandType = ihipCommandCopyH2H; break;
case hipMemcpyHostToDevice : commandType = ihipCommandCopyH2D; break;
case hipMemcpyDeviceToHost : commandType = ihipCommandCopyD2H; break;
case hipMemcpyDeviceToDevice : commandType = ihipCommandCopyD2D; break;
default: throw ihipException(hipErrorInvalidMemcpyDirection);
};
int depSignalCnt = this->preCopyCommand(NULL, &depSignal, commandType);
device->_copy_lock[HIP_DISABLE_BIDIR_MEMCPY? 0:1].lock();
+5 -9
Просмотреть файл
@@ -15,20 +15,16 @@
#endif
//-------------------------------------------------------------------------------------------------
StagingBuffer::StagingBuffer(hc::accelerator &acc, size_t bufferSize, int numBuffers) :
_acc(acc),
StagingBuffer::StagingBuffer(hsa_agent_t hsaAgent, hsa_region_t systemRegion, size_t bufferSize, int numBuffers) :
_hsa_agent(hsaAgent),
_bufferSize(bufferSize),
_numBuffers(numBuffers > _max_buffers ? _max_buffers : numBuffers)
{
hsa_agent_t *agentPtr = static_cast<hsa_agent_t*> (acc.get_hsa_agent());
_hsa_agent = *agentPtr;
for (int i=0; i<_numBuffers; i++) {
// TODO - experiment with alignment here.
_pinnedStagingBuffer[i] = hc::am_alloc(_bufferSize, _acc, amHostPinned);
if (_pinnedStagingBuffer[i] == NULL) {
hsa_status_t s1 = hsa_memory_allocate(systemRegion, _bufferSize, (void**) (&_pinnedStagingBuffer[i]) );
if ((s1 != HSA_STATUS_SUCCESS) || (_pinnedStagingBuffer[i] == NULL)) {
THROW_ERROR(hipErrorMemoryAllocation);
}
hsa_signal_create(0, 0, NULL, &_completion_signal[i]);