From 6472c8b9152d7ba76b9edd3d5e10d3f33c3fb5cf Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 19 Mar 2016 02:44:26 -0500 Subject: [PATCH] Fix copy and sync bugs. Remove extra sync in default stream. - NULL stream was waiting for itself to be empty before each command. - Force "blocking" streams to wait for NULL to empty. This was missing before. - async copy was disabling itself via trueAsync=false for common cases. Refactor: - rename _null_stream to _default_stream. - move some null sync function to defaultSync, move to dev member func. --- src/hip_hcc.cpp | 187 ++++++++++++++++++++++++----------------- src/staging_buffer.cpp | 5 +- 2 files changed, 112 insertions(+), 80 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 37892b777e..a59c53f7c9 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -44,6 +44,7 @@ THE SOFTWARE. #include "hsa_ext_amd.h" // HIP includes: +#define HIP_HCC #include "hcc_detail/staging_buffer.h" @@ -314,7 +315,7 @@ struct ihipDevice_t // The NULL stream is used if no other stream is specified. // NULL has special synchronization properties with other streams. - ihipStream_t *_null_stream; + ihipStream_t *_default_stream; std::list _streams; // streams associated with this device. @@ -327,6 +328,8 @@ public: void init(unsigned device_index, hc::accelerator acc); hipError_t getProperties(hipDeviceProp_t* prop); + inline void syncDefaultStream(bool waitOnSelf); + ~ihipDevice_t(); }; @@ -363,10 +366,10 @@ INLINE bool ihipIsValidDevice(unsigned deviceIndex); //--- ihipSignal_t::ihipSignal_t() : _sig_id(0) { - if (hsa_signal_create(0/*value*/, 0, NULL, &_hsa_signal) != HSA_STATUS_SUCCESS) { - throw ihipException(hipErrorOutOfResources); -} -tprintf (DB_SIGNAL, " allocated hsa_signal=%lu\n", (_hsa_signal.handle)); + if (hsa_signal_create(0/*value*/, 0, NULL, &_hsa_signal) != HSA_STATUS_SUCCESS) { + throw ihipException(hipErrorOutOfResources); + } + //tprintf (DB_SIGNAL, " allocated hsa_signal=%lu\n", (_hsa_signal.handle)); } //--- @@ -420,13 +423,15 @@ void ihipStream_t::reclaimSignals_ts(SIGSEQNUM sigNum) //--- void ihipStream_t::waitCopy(ihipSignal_t *signal) { - hsa_signal_wait_acquire(_last_copy_signal->_hsa_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_acquire(signal->_hsa_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - SIGSEQNUM sigNum = _last_copy_signal->_sig_id; + SIGSEQNUM sigNum = signal->_sig_id; - tprintf(DB_SIGNAL, "reclaim signal #%lu\n", sigNum); - // Mark all signals older and including this one as available for - _oldest_live_sig_id = sigNum+1; + tprintf(DB_SIGNAL, "waitCopy reclaim signal #%lu\n", sigNum); + // Mark all signals older and including this one as available for reclaim + if (sigNum > _oldest_live_sig_id) { + _oldest_live_sig_id = sigNum+1; // TODO, +1 here seems dangerous. + } } @@ -438,9 +443,9 @@ void ihipStream_t::wait(bool assertQueueEmpty) if (! assertQueueEmpty) { tprintf (DB_SYNC, "stream %p wait for queue-empty and lastCopy:#%lu...\n", this, _last_copy_signal ? _last_copy_signal->_sig_id: 0x0 ); _av.wait(); - if (_last_copy_signal) { - this->waitCopy(_last_copy_signal); - } + } + if (_last_copy_signal) { + this->waitCopy(_last_copy_signal); } // Reset the stream to "empty" - next command will not set up an inpute dependency on any older signal. @@ -474,8 +479,13 @@ ihipSignal_t *ihipStream_t::allocSignal() } if (_signalPool[thisCursor]._sig_id < _oldest_live_sig_id) { + SIGSEQNUM oldSigId = _signalPool[thisCursor]._sig_id; _signalPool[thisCursor]._index = thisCursor; _signalPool[thisCursor]._sig_id = ++_stream_sig_id; // allocate it. + tprintf(DB_SIGNAL, "allocatSignal #%lu at pos:%i (old sigId:%lu < oldest_live:%lu)\n", + _signalPool[thisCursor]._sig_id, + thisCursor, oldSigId, _oldest_live_sig_id); + return &_signalPool[thisCursor]; @@ -487,6 +497,9 @@ ihipSignal_t *ihipStream_t::allocSignal() // Have to grow the pool: _signalCursor = _signalPool.size(); // set to the beginning of the new entries: + if (_signalCursor > 10000) { + fprintf (stderr, "warning: signal pool size=%d, may indicate runaway number of inflight commands\n", _signalCursor); + } _signalPool.resize(_signalPool.size() * 2); tprintf (DB_SIGNAL, "grow signal pool to %zu entries, cursor=%d\n", _signalPool.size(), _signalCursor); return allocSignal(); // try again, @@ -661,9 +674,9 @@ void ihipDevice_t::init(unsigned device_index, hc::accelerator acc) getProperties(&_props); - _null_stream = new ihipStream_t(device_index, acc.get_default_view(), hipStreamDefault); - this->_streams.push_back(_null_stream); - tprintf(DB_SYNC, "created device with null_stream=%p\n", _null_stream); + _default_stream = new ihipStream_t(device_index, acc.get_default_view(), hipStreamDefault); + this->_streams.push_back(_default_stream); + tprintf(DB_SYNC, "created device with default_stream=%p\n", _default_stream); hsa_region_t *pinnedHostRegion; @@ -676,9 +689,9 @@ void ihipDevice_t::init(unsigned device_index, hc::accelerator acc) ihipDevice_t::~ihipDevice_t() { - if (_null_stream) { - delete _null_stream; - _null_stream = NULL; + if (_default_stream) { + delete _default_stream; + _default_stream = NULL; } for (int i=0; i<2; i++) { @@ -905,6 +918,32 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) return e; } + +// Implement "default" stream syncronization +// This waits for all other streams to drain before continuing. +// If waitOnSelf is set, this additionally waits for the default stream to empty. +void ihipDevice_t::syncDefaultStream(bool waitOnSelf) +{ + tprintf(DB_SYNC, "syncDefaultStream\n"); + + for (auto streamI=_streams.begin(); streamI!=_streams.end(); streamI++) { + ihipStream_t *stream = *streamI; + + // Don't wait for streams that have "opted-out" of syncing with NULL stream. + // And - don't wait for the NULL stream + if (!(stream->_flags & hipStreamNonBlocking)) { + + if (waitOnSelf || (stream != _default_stream)) { + // TODO-hcc - use blocking or active wait here? + // TODO-sync - cudaDeviceBlockingSync + stream->wait(); + } + } + } +} + + + #define ihipLogStatus(_hip_status) \ ({\ tls_lastHipError = _hip_status;\ @@ -1000,7 +1039,7 @@ void ihipInit() //-- READ HIP_PRINT_ENV env first, since it has impact on later env var reading READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." ); - READ_ENV_I(release, HIP_DB, 0, "Print various debug info. Bitmasl, see hip_hcc.cpp for more information."); + READ_ENV_I(release, HIP_DB, 0, "Print various debug info. Bitmask, see hip_hcc.cpp for more information."); if ((HIP_DB & DB_API) && (HIP_TRACE_API == 0)) { // Set HIP_TRACE_API before we read it, so it is printed correctly. HIP_TRACE_API = 1; @@ -1010,7 +1049,7 @@ void ihipInit() READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. Print function name and return code to stderr as program executes."); READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)" ); READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction. 0=use hsa_memory_copy."); - READ_ENV_I(release, HIP_PININPLACE, 0, "For unpinned transfers, pin the memory in-place in chunks before doing the copy"); + READ_ENV_I(release, HIP_PININPLACE, 0, "For unpinned transfers, pin the memory in-place in chunks before doing the copy. Under development."); READ_ENV_I(release, HIP_STREAM_SIGNALS, 0, "Number of signals to allocate when new stream is created (signal pool will grow on demand)"); READ_ENV_I(release, HIP_VISIBLE_DEVICES, CUDA_VISIBLE_DEVICES, "Only devices whose index is present in the secquence are visible to HIP applications and they are enumerated in the order of secquence" ); @@ -1113,19 +1152,6 @@ static inline void ihipWaitAllStreams(ihipDevice_t *device) -inline void ihipWaitNullStream(ihipDevice_t *device) -{ - tprintf(DB_SYNC, "waitNullStream\n"); - - for (auto streamI=device->_streams.begin(); streamI!=device->_streams.end(); streamI++) { - ihipStream_t *stream = *streamI; - if (!(stream->_flags & hipStreamNonBlocking)) { - // TODO-hcc - use blocking or active wait here? - // TODO-sync - cudaDeviceBlockingSync - stream->wait(); - } - } -} //--- @@ -1137,10 +1163,16 @@ inline hipStream_t ihipSyncAndResolveStream(hipStream_t stream) { if (stream == hipStreamNull ) { ihipDevice_t *device = ihipGetTlsDefaultDevice(); - ihipWaitNullStream(device); + device->syncDefaultStream(false); - return device->_null_stream; + return device->_default_stream; } else { + // Have to wait for legacy default stream to be empty: + if (!(stream->_flags & hipStreamNonBlocking)) { + tprintf(DB_SYNC, "stream %p wait default stream\n", stream); + stream->getDevice()->_default_stream->wait(); + } + return stream; } } @@ -1584,7 +1616,7 @@ hipError_t hipStreamSynchronize(hipStream_t stream) if (stream == NULL) { ihipDevice_t *device = ihipGetTlsDefaultDevice(); - ihipWaitNullStream(device); + device->syncDefaultStream(true/*waitOnSelf*/); } else { stream->wait(); e = hipSuccess; @@ -1608,7 +1640,7 @@ hipError_t hipStreamDestroy(hipStream_t stream) //--- Drain the stream: if (stream == NULL) { ihipDevice_t *device = ihipGetTlsDefaultDevice(); - ihipWaitNullStream(device); + device->syncDefaultStream(true/*waitOnSelf*/); } else { stream->wait(); e = hipSuccess; @@ -1690,8 +1722,7 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) // TODO-HCC fix this - is CUDA this conservative or still uses device timestamps? // TODO-HCC can we use barrier or event marker to implement better solution? ihipDevice_t *device = ihipGetTlsDefaultDevice(); - ihipWaitNullStream(device); - + device->syncDefaultStream(true); eh->_timestamp = hc::get_system_ticks(); eh->_state = hipEventStatusRecorded; @@ -1741,7 +1772,7 @@ hipError_t hipEventSynchronize(hipEvent_t event) return ihipLogStatus(hipSuccess); } else if (eh->_stream == NULL) { ihipDevice_t *device = ihipGetTlsDefaultDevice(); - ihipWaitNullStream(device); + device->syncDefaultStream(true); return ihipLogStatus(hipSuccess); } else { #if __hcc_workweek__ >= 16033 @@ -2197,7 +2228,7 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou auto device = ihipGetTlsDefaultDevice(); //hsa_signal_t depSignal; - //int depSignalCnt = device._null_stream->preCopyCommand(NULL, &depSignal, ihipCommandCopyH2D); + //int depSignalCnt = device._default_stream->preCopyCommand(NULL, &depSignal, ihipCommandCopyH2D); assert(0); // Need to properly synchronize the copy - do something with depSignal if != NULL. device->_acc.memcpy_symbol(symbolName, (void*) src,count, offset); @@ -2311,11 +2342,10 @@ void ihipStream_t::ihipSyncCopy(void* dst, const void* src, size_t sizeBytes, hi // This is sync copy, so let's wait for copy right here: if (hsa_status == HSA_STATUS_SUCCESS) { - hsa_signal_wait_relaxed(copyCompleteSignal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + waitCopy(ihipSignal); // wait for copy, and return to pool. } else { throw ihipException(hipErrorInvalidValue); } - } } @@ -2349,7 +2379,8 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind /** - * @result #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidMemcpyDirection, #hipErrorInvalidValue + * @result #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidMemcpyDirection, + * @result #hipErrorInvalidValue : If dst==NULL or src==NULL, or other bad argument. * @warning on HCC hipMemcpyAsync does not support overlapped H2D and D2H copies. * @warning on HCC hipMemcpyAsync requires that any host pointers are pinned (ie via the hipMallocHost call). */ @@ -2364,7 +2395,9 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp bool trueAsync = true; - if (stream) { + if ((dst == NULL) || (src == NULL)) { + e= hipErrorInvalidValue; + } else if (stream) { ihipDevice_t *device = stream->getDevice(); if (device == NULL) { @@ -2373,6 +2406,7 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp } else if (kind == hipMemcpyHostToHost) { tprintf (DB_COPY2, "H2H copy with memcpy"); + // TODO - consider if we want to perhaps use the GPU SDMA engines anyway, to avoid the host-side sync here and keep everything flowing on the GPU. /* As this is a CPU op, we need to wait until all the commands in current stream are finished. */ @@ -2384,36 +2418,37 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp hc::accelerator acc; hc::AmPointerInfo dstAm(NULL, NULL, 0, acc, 0, 0); hc::AmPointerInfo srcAm(NULL, NULL, 0, acc, 0, 0); - am_status_t statDst = hc::am_memtracker_getinfo(&dstAm, dst); - am_status_t statSrc = hc::am_memtracker_getinfo(&srcAm, src); + bool dstTracked = (hc::am_memtracker_getinfo(&dstAm, dst) == AM_SUCCESS); + bool srcTracked = (hc::am_memtracker_getinfo(&srcAm, src) == AM_SUCCESS); - if(dstAm._appAllocationFlags != 1 || srcAm._appAllocationFlags != 1){ + bool dstInDeviceMem = (dstTracked && dstAm._isInDeviceMem); + bool srcInDeviceMem = (srcTracked && srcAm._isInDeviceMem); + + // "tracked" really indicates if the pointer's virtual address is available in the GPU address space. + // If both pointers are not tracked, we need to fall back to a sync copy. + if (!dstTracked || !srcTracked) { trueAsync = false; + } + + if (kind == hipMemcpyDefault) { + if (!dstInDeviceMem && !srcInDeviceMem) { + kind = hipMemcpyHostToHost; + } else if (dstInDeviceMem && !srcInDeviceMem) { + kind = hipMemcpyHostToDevice; + } else if (!dstInDeviceMem && srcInDeviceMem) { + kind = hipMemcpyDeviceToHost; + } else if (dstInDeviceMem && srcInDeviceMem) { + kind = hipMemcpyDeviceToHost; + } + + // If we still couldn't determine direction, flag error here: + if (kind == hipMemcpyDefault) { + return hipErrorInvalidMemcpyDirection; + } } - if (kind == hipMemcpyDefault) { - if(statDst == AM_SUCCESS && statSrc == AM_SUCCESS){ - if(dstAm._devicePointer != NULL){ - if(srcAm._devicePointer != NULL){ - kind = hipMemcpyDeviceToDevice; - } - if(srcAm._hostPointer != NULL){ - kind = hipMemcpyHostToDevice; - } - } - if(dstAm._hostPointer != NULL){ - if(srcAm._devicePointer != NULL){ - kind = hipMemcpyDeviceToHost; - } - if(srcAm._hostPointer != NULL){ - kind = hipMemcpyHostToHost; - } - } - } - else{ - return hipErrorInvalidMemcpyDirection; - } - } + + ihipSignal_t *ihip_signal = stream->allocSignal(); hsa_signal_store_relaxed(ihip_signal->_hsa_signal, 1); @@ -2693,7 +2728,7 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a if (stream == hipStreamNull ) { ihipDevice_t *device = ihipGetTlsDefaultDevice(); - stream = device->_null_stream; + stream = device->_default_stream; } *av = &(stream->_av); @@ -2703,11 +2738,11 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a } // TODO - review signal / error reporting code. -// TODO - describe naming convention. ihip _. No accessors. +// TODO - describe naming convention. ihip _. No accessors. No early returns from functions. Set status to success at top, only set error codes in implementation. No tabs. +// Caps convention _ or camelCase // TODO - describe MT strategy // -// -// +//// TODO - add identifier numbers for streams and devices to help with debugging. #if ONE_OBJECT_FILE #include "staging_buffer.cpp" diff --git a/src/staging_buffer.cpp b/src/staging_buffer.cpp index 1c799c50ea..dc2f5d715a 100644 --- a/src/staging_buffer.cpp +++ b/src/staging_buffer.cpp @@ -4,14 +4,11 @@ #include "hcc_detail/staging_buffer.h" -#ifndef tprintf -#define tprintf(trace_level, ...) -#endif - #ifdef HIP_HCC #define THROW_ERROR(e) throw ihipException(e) #else #define THROW_ERROR(e) throw +#define tprintf(trace_level, ...) #endif //-------------------------------------------------------------------------------------------------