diff --git a/projects/hip/include/hcc_detail/hip_hcc.h b/projects/hip/include/hcc_detail/hip_hcc.h index d97a41ef59..794b099a7a 100644 --- a/projects/hip/include/hcc_detail/hip_hcc.h +++ b/projects/hip/include/hcc_detail/hip_hcc.h @@ -530,12 +530,16 @@ public: bool removePeer(ihipDevice_t *peer); void resetPeers(ihipDevice_t *thisDevice); + + void addStream(ihipStream_t *stream); + uint32_t peerCnt() const { return _peerCnt; }; hsa_agent_t *peerAgents() const { return _peerAgents; }; private: - std::list _streams; // streams associated with this device. + //std::list< std::shared_ptr > _streams; // streams associated with this device. TODO - convert to shared_ptr. + std::list< ihipStream_t* > _streams; // streams associated with this device. ihipStream_t::SeqNum_t _stream_id; // These reflect the currently Enabled set of peers for this GPU: diff --git a/projects/hip/src/hip_device.cpp b/projects/hip/src/hip_device.cpp index f70ab1d33a..86e8563272 100644 --- a/projects/hip/src/hip_device.cpp +++ b/projects/hip/src/hip_device.cpp @@ -172,12 +172,6 @@ hipError_t hipDeviceReset(void) if (device) { - //--- - //Wait for pending activity to complete? TODO - check if this is required behavior: - //TODO, also we have small window between wait and reset. - - device->locked_waitAllStreams(); - // Release device resources (streams and memory): device->locked_reset(); } diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 3fba257698..75b786793d 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -151,9 +151,11 @@ void ihipStream_t::locked_reclaimSignals(SIGSEQNUM sigNum) //--- void ihipStream_t::waitCopy(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *signal) { + SIGSEQNUM sigNum = signal->_sig_id; + tprintf(DB_SYNC, "waitCopy signal:#%lu\n", sigNum); + hsa_signal_wait_acquire(signal->_hsa_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - SIGSEQNUM sigNum = signal->_sig_id; tprintf(DB_SIGNAL, "waitCopy reclaim signal #%lu\n", sigNum); // Mark all signals older and including this one as available for reclaim @@ -252,6 +254,14 @@ void ihipDeviceCriticalBase_t::resetPeers(ihipDevice_t *thisDevice) addPeer(thisDevice); // peer-list always contains self agent. } + +template<> +void ihipDeviceCriticalBase_t::addStream(ihipStream_t *stream) +{ + _streams.push_back(stream); + stream->_id = incStreamId(); +} + //------------------------------------------------------------------------------------------------- //--- @@ -461,9 +471,29 @@ void ihipDevice_t::locked_reset() // Obtain mutex access to the device critical data, release by destructor LockedAccessor_DeviceCrit_t crit(_criticalData); + + //--- + //Wait for pending activity to complete? TODO - check if this is required behavior: + tprintf(DB_SYNC, "locked_reset waiting for activity to complete.\n"); + // Reset and remove streams: + // Delete all created streams including the default one. + for (auto streamI=crit->const_streams().begin(); streamI!=crit->const_streams().end(); streamI++) { + ihipStream_t *stream = *streamI; + (*streamI)->locked_wait(); + tprintf(DB_SYNC, " delete stream=%p\n", stream); + + delete stream; + } + // Clear the list. crit->streams().clear(); + + // Create a fresh default stream and add it: + _default_stream = new ihipStream_t(_device_index, _acc.get_default_view(), hipStreamDefault); + crit->addStream(_default_stream); + + // This resest peer list to just me: crit->resetPeers(this); @@ -499,10 +529,7 @@ void ihipDevice_t::init(unsigned device_index, unsigned deviceCnt, hc::accelerat locked_reset(); - _default_stream = new ihipStream_t(device_index, acc.get_default_view(), hipStreamDefault); - locked_addStream(_default_stream); - tprintf(DB_SYNC, "created device with default_stream=%p\n", _default_stream); hsa_region_t *pinnedHostRegion; @@ -783,8 +810,7 @@ void ihipDevice_t::locked_addStream(ihipStream_t *s) { LockedAccessor_DeviceCrit_t crit(_criticalData); - crit->streams().push_back(s); - s->_id = crit->incStreamId(); + crit->addStream(s); } //--- @@ -914,7 +940,7 @@ void ihipInit() READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." ); READ_ENV_I(release, HIP_DB, 0, "Print various debug info. Bitmask, see hip_hcc.cpp for more information."); - if ((HIP_DB & DB_API) && (HIP_TRACE_API == 0)) { + if ((HIP_DB & (1< _bufferSize) ? _bufferSize : bytesRemaining0; + // Wait to make sure we are not overwriting a buffer before it has been drained: + hsa_signal_wait_acquire(_completion_signal2[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + tprintf (DB_COPY2, "P2P: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], srcAgent, srcp0, srcAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); @@ -311,7 +314,7 @@ void StagingBuffer::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* } tprintf (DB_COPY2, "P2P: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to device:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); - memcpy(dstp1, _pinnedStagingBuffer[bufferIndex], theseBytes); + hsa_signal_store_relaxed(_completion_signal2[bufferIndex], 1); hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp1, dstAgent, _pinnedStagingBuffer[bufferIndex], dstAgent /*not used*/, theseBytes, hostWait ? 0:1, hostWait ? NULL : &_completion_signal[bufferIndex], _completion_signal2[bufferIndex]); diff --git a/projects/hip/tests/src/hipPeerToPeer_simple.cpp b/projects/hip/tests/src/hipPeerToPeer_simple.cpp index e5b61714b2..71d073b1b2 100644 --- a/projects/hip/tests/src/hipPeerToPeer_simple.cpp +++ b/projects/hip/tests/src/hipPeerToPeer_simple.cpp @@ -113,37 +113,22 @@ void enablePeerFirst() HIPCHECK (hipSetDevice(g_currentDevice)); HIPCHECK (hipMalloc(&A_d0, Nbytes) ); HIPCHECK (hipMemset(A_d0, memsetval, Nbytes) ); - // TODO - remove me: - HIPCHECK (hipDeviceSynchronize()); // allocate and initialize memory on peer device HIPCHECK (hipSetDevice(g_peerDevice)); HIPCHECK (hipMalloc(&A_d1, Nbytes) ); HIPCHECK (hipMemset(A_d1, 0x13, Nbytes) ); - // TODO - remove me: - HIPCHECK (hipDeviceSynchronize()); // Device0 push to device1, using P2P: HIPCHECK (hipSetDevice(p_memcpyWithPeer ? g_peerDevice : g_currentDevice)); HIPCHECK (hipMemcpy(A_d1, A_d0, Nbytes, hipMemcpyDefault)); // This is P2P copy. - // TODO - remove me: - if (1) { - HIPCHECK (hipSetDevice(g_currentDevice)); - HIPCHECK (hipDeviceSynchronize()); - HIPCHECK (hipSetDevice(g_peerDevice)); - HIPCHECK (hipDeviceSynchronize()); - } - // Copy data back to host: HIPCHECK (hipSetDevice(g_peerDevice)); HIPCHECK (hipMemcpy(A_h, A_d1, Nbytes, hipMemcpyDeviceToHost)); - // TODO - remove me: - HIPCHECK (hipDeviceSynchronize()); - // Check host data: for (int i=0; i