From 69aed2c6ea263cbbf52b2a9e27b55dc62e4dbcc1 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 26 Jul 2016 09:22:59 -0500 Subject: [PATCH] Added re-fix for memcpy kernel sync 1. The patch uses HIP signal pools to sync between copy and kernel commands 2. The hsa_signal_create is removed 3. Left the redundant enqueueBarrier method just in case Change-Id: I3dff3e8ee57fff3cd49bec802ff735ed128e5ca1 [ROCm/hip commit: 4bdf26a82efb863d23e5ffe8f62e00050f34374e] --- projects/hip/include/hcc_detail/hip_hcc.h | 2 +- projects/hip/src/hip_hcc.cpp | 28 +++++++++-------------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/projects/hip/include/hcc_detail/hip_hcc.h b/projects/hip/include/hcc_detail/hip_hcc.h index 635b65f384..cf6c705082 100644 --- a/projects/hip/include/hcc_detail/hip_hcc.h +++ b/projects/hip/include/hcc_detail/hip_hcc.h @@ -444,7 +444,7 @@ private: std::vector _depFutures; private: - void enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal); + void enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal, ihipSignal_t *completionSignal); void enqueueBarrier(hsa_queue_t* queue, hsa_signal_t *depSignal); void waitCopy(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *signal); diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 70890d9374..46f0b2674f 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -338,7 +338,7 @@ ihipSignal_t *ihipStream_t::allocSignal(LockedAccessor_StreamCrit_t &crit) //--- -void ihipStream_t::enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal) +void ihipStream_t::enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal, ihipSignal_t *completionSignal) { // Obtain the write index for the command queue @@ -356,9 +356,9 @@ void ihipStream_t::enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal) //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; barrier->header = header; - barrier->dep_signal[0] = depSignal->_hsa_signal; + barrier->dep_signal[0].handle = depSignal ? depSignal->_hsa_signal.handle: 0; - barrier->completion_signal.handle = 0; + barrier->completion_signal.handle = completionSignal ? completionSignal->_hsa_signal.handle : 0; // TODO - check queue overflow, return error: // Increment write index and ring doorbell to dispatch the kernel @@ -390,10 +390,7 @@ void ihipStream_t::enqueueBarrier(hsa_queue_t* queue, hsa_signal_t *depSignal) barrier->dep_signal[3].handle = 0; barrier->dep_signal[4].handle = 0; - hsa_signal_t signal; - hsa_signal_create(1, 0, NULL, &signal); - *depSignal = signal; - barrier->completion_signal = signal; + barrier->completion_signal = *depSignal; // TODO - check queue overflow, return error: // Increment write index and ring doorbell to dispatch the kernel @@ -421,7 +418,7 @@ bool ihipStream_t::lockopen_preKernelCommand() hsa_queue_t * q = (hsa_queue_t*)_av.get_hsa_queue(); if (HIP_DISABLE_HW_KERNEL_DEP == 0) { - this->enqueueBarrier(q, crit->_last_copy_signal); + this->enqueueBarrier(q, crit->_last_copy_signal, NULL); tprintf (DB_SYNC, "stream %p switch %s to %s (barrier pkt inserted with wait on #%lu)\n", this, ihipCommandName[crit->_last_command_type], ihipCommandName[ihipCommandKernel], crit->_last_copy_signal->_sig_id) @@ -472,7 +469,10 @@ int ihipStream_t::preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t tprintf (DB_SYNC, "stream %p switch %s to %s (async copy dep on prev kernel)\n", this, ihipCommandName[crit->_last_command_type], ihipCommandName[copyType]); needSync = 1; - this->enqueueBarrier(static_cast(_av.get_hsa_queue()), waitSignal); + ihipSignal_t *depSignal = allocSignal(crit); + hsa_signal_store_relaxed(depSignal->_hsa_signal,1); + this->enqueueBarrier(static_cast(_av.get_hsa_queue()), NULL, depSignal); + *waitSignal = depSignal->_hsa_signal; // hsa_signal_t *hsaSignal = (static_cast (crit->_last_kernel_future.get_native_handle())); /* if (hsaSignal) { // Keep reference to the kernel future in order to keep the @@ -1272,7 +1272,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream) // HIP uses only 64 kernels. If the performance decrease, add more uint32_t kernelCount = 0; -std::vector vCF(64); +std::vector vCF(1024); void incKernelCnt(hc::completion_future *cf){ vCF[kernelCount] = cf; @@ -1280,7 +1280,7 @@ void incKernelCnt(hc::completion_future *cf){ } void decKernelCnt(){ - if(kernelCount > 63){ + if(kernelCount > 1023){ uint32_t len = kernelCount; for(uint32_t i =0;i_last_command_type == ihipCommandKernel){ - hsa_signal_destroy(depSignal); - } } @@ -1820,9 +1817,6 @@ void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsig hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, sizeBytes, depSignalCnt, depSignalCnt ? &depSignal:0x0, ihip_signal->_hsa_signal); - if (crit->_last_command_type == ihipCommandKernel) { - hsa_signal_destroy(depSignal); - } if (hsa_status == HSA_STATUS_SUCCESS) { if (HIP_LAUNCH_BLOCKING) {