From 2e754d27dc5932edaee8cb0a0f0ae2102d76a430 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 26 Jul 2016 14:03:51 -0500 Subject: [PATCH] Signal Fix: Changed global signal count to per-stream signal count 1. The number of kernels that can use signals is increased to 128 2. The kernel count is now specific to the stream Change-Id: Ie6d1aa3f437aad8f08c3333fe48bd3f46e551e60 --- hipamd/include/hcc_detail/hip_hcc.h | 2 +- hipamd/src/hip_hcc.cpp | 35 ++++++----------------------- 2 files changed, 8 insertions(+), 29 deletions(-) diff --git a/hipamd/include/hcc_detail/hip_hcc.h b/hipamd/include/hcc_detail/hip_hcc.h index cf6c705082..8637f62457 100644 --- a/hipamd/include/hcc_detail/hip_hcc.h +++ b/hipamd/include/hcc_detail/hip_hcc.h @@ -393,7 +393,7 @@ typedef LockedAccessor LockedAccessor_StreamCrit_t; class ihipStream_t { public: typedef uint64_t SeqNum_t ; - + uint32_t kernelCnt; ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags); ~ihipStream_t(); diff --git a/hipamd/src/hip_hcc.cpp b/hipamd/src/hip_hcc.cpp index b89fc43582..20deb69aef 100644 --- a/hipamd/src/hip_hcc.cpp +++ b/hipamd/src/hip_hcc.cpp @@ -134,6 +134,7 @@ ihipStream_t::ihipStream_t(unsigned device_index, hc::accelerator_view av, unsig _flags(flags), _device_index(device_index) { + kernelCnt = 0; tprintf(DB_SYNC, " streamCreate: stream=%p\n", this); }; @@ -398,7 +399,7 @@ void ihipStream_t::enqueueBarrier(hsa_queue_t* queue, hsa_signal_t *depSignal) hsa_signal_store_relaxed(queue->doorbell_signal, index); } - +#define HIP_NUM_KERNELS_INFLIGHT 128 //-- //When the commands in a stream change types (ie kernel command follows a data command, @@ -411,6 +412,11 @@ bool ihipStream_t::lockopen_preKernelCommand() LockedAccessor_StreamCrit_t crit(_criticalData, false/*no unlock at destruction*/); bool addedSync = false; + if(kernelCnt > HIP_NUM_KERNELS_INFLIGHT){ + this->wait(crit); + kernelCnt = 0; + } + kernelCnt++; // If switching command types, we need to add a barrier 
packet to synchronize things. if (crit->_last_command_type != ihipCommandKernel) { if (crit->_last_copy_signal) { @@ -1270,28 +1276,6 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream) } } -// HIP uses only 64 kernels. If the performance decrease, add more -uint32_t kernelCount = 0; -std::vector vCF(1024); - -void incKernelCnt(hc::completion_future *cf){ - vCF[kernelCount] = cf; - kernelCount++; -} - -void decKernelCnt(){ - if(kernelCount > 1023){ - uint32_t len = kernelCount; - for(uint32_t i =0;iwait(); - } - delete vCF[i]; - vCF[i] = NULL; - kernelCount--; - } - } -} // TODO - data-up to data-down: // Called just before a kernel is launched from hipLaunchKernel. @@ -1321,7 +1305,6 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_ // *av = &stream->_av; lp->av = &stream->_av; lp->cf = new hc::completion_future; - incKernelCnt(lp->cf); // lp->av = static_cast(av); // lp->cf = static_cast(malloc(sizeof(hc::completion_future))); return (stream); @@ -1351,7 +1334,6 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, gri // *av = &stream->_av; lp->av = &stream->_av; lp->cf = new hc::completion_future; - incKernelCnt(lp->cf); // lp->av = static_cast(av); // lp->cf = static_cast(malloc(sizeof(hc::completion_future))); return (stream); @@ -1382,7 +1364,6 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, gri // *av = &stream->_av; lp->av = &stream->_av; lp->cf = new hc::completion_future; - incKernelCnt(lp->cf); // lp->av = static_cast(av); // lp->cf = static_cast(malloc(sizeof(hc::completion_future))); return (stream); @@ -1413,7 +1394,6 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, g // *av = &stream->_av; lp->av = &stream->_av; lp->cf = new hc::completion_future; - incKernelCnt(lp->cf); // lp->av = static_cast(av); // lp->cf = static_cast(malloc(sizeof(hc::completion_future))); return (stream); @@ -1426,7 +1406,6 @@ void 
ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp) { // stream->lockclose_postKernelCommand(cf); stream->lockclose_postKernelCommand(*lp.cf); - decKernelCnt(); if (HIP_LAUNCH_BLOCKING) { tprintf(DB_SYNC, " stream:%p LAUNCH_BLOCKING for kernel completion\n", stream); }