From 96dc74897df03a76559728f54c560efb751ca829 Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Wed, 19 Jun 2019 20:29:05 -0400 Subject: [PATCH] [hip] implement the hipExtLaunchMultiKernelMultiDevice API (#1165) * [hip] implement the hipExtLaunchMultiKernelMultiDevice API * add a guard to check the HCC version for acquire_locked_hsa_queue() API which was introdued in HCC for ROCm 2.5 * modified code based on the requested changes * changes to lock all streams before launching kernels for each device and unlock them after the dispatches * check each stream to be valid before starting to lock all the streams --- include/hip/hcc_detail/hip_runtime.h | 10 +-- include/hip/hcc_detail/program_state.hpp | 1 + src/hip_hcc.cpp | 49 +++++++++------ src/hip_hcc_internal.h | 4 +- src/hip_module.cpp | 78 ++++++++++++++++++++++-- 5 files changed, 113 insertions(+), 29 deletions(-) diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index 68406f0e29..6ac402a6f7 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -270,14 +270,14 @@ static inline __device__ void printf(const char* format, All... all) {} #if defined __HCC_CPP__ extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, - grid_launch_parm* lp, const char* kernelNameStr); + grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0); extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, - grid_launch_parm* lp, const char* kernelNameStr); + grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0); extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, - grid_launch_parm* lp, const char* kernelNameStr); + grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0); extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, - grid_launch_parm* lp, const char* kernelNameStr); -extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp); + grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0); +extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp, bool unlockPostponed = 0); #if GENERIC_GRID_LAUNCH == 0 //#warning "Original hipLaunchKernel defined" diff --git a/include/hip/hcc_detail/program_state.hpp b/include/hip/hcc_detail/program_state.hpp index c64b64fde8..4e0fbb76a4 100644 --- a/include/hip/hcc_detail/program_state.hpp +++ b/include/hip/hcc_detail/program_state.hpp @@ -77,6 +77,7 @@ class kernargs_size_align { public: std::size_t size(std::size_t n) const; std::size_t alignment(std::size_t n) const; + const void* getHandle() const {return handle;}; private: const void* handle; friend kernargs_size_align program_state::get_kernargs_size_align(std::uintptr_t); diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 583017a30a..4edf9575be 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -404,7 +404,7 @@ LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand() { //--- // Must be called after kernel finishes, this releases the lock on the stream so other commands can // submit. -void ihipStream_t::lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av) { +void ihipStream_t::lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av, bool unlockPostponed) { bool blockThisKernel = false; if (!g_hipLaunchBlockingKernels.empty()) { @@ -426,7 +426,10 @@ void ihipStream_t::lockclose_postKernelCommand(const char* kernelName, hc::accel kernelName); } - _criticalData.unlock(); // paired with lock from lockopen_preKernelCommand. + // if unlockPostponed is true then this stream will be unlocked later (e.g., see hipExtLaunchMultiKernelMultiDevice for a sample call) + if (!unlockPostponed) { + _criticalData.unlock(); // paired with lock from lockopen_preKernelCommand. + } }; @@ -1493,7 +1496,7 @@ void ihipStreamCallbackHandler(ihipStreamCallback_t* cb) { // // If stream==NULL synchronize appropriately with other streams and return the default av for the // device. If stream is valid, return the AV to use. -hipStream_t ihipSyncAndResolveStream(hipStream_t stream) { +hipStream_t ihipSyncAndResolveStream(hipStream_t stream, bool lockAcquired) { if (stream == hipStreamNull) { // Submitting to NULL stream, call locked_syncDefaultStream to wait for all other streams: ihipCtx_t* ctx = ihipGetTlsDefaultCtx(); @@ -1535,9 +1538,14 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream) { if (needGatherMarker) { // ensure any commands sent to this stream wait on the NULL stream before // continuing - LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData()); - // TODO - could be "noret" version of create_blocking_marker - thisStreamCrit->_av.create_blocking_marker(dcf, hc::accelerator_scope); + if (!lockAcquired) { + LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData()); + // TODO - could be "noret" version of create_blocking_marker + thisStreamCrit->_av.create_blocking_marker(dcf, hc::accelerator_scope); + } else { + // this stream is already locked (e.g., call from hipExtLaunchMultiKernelMultiDevice) + stream->criticalData()._av.create_blocking_marker(dcf, hc::accelerator_scope); + } tprintf( DB_SYNC, " %s adding marker to wait for freshly recorded default-stream marker \n", @@ -1578,8 +1586,8 @@ void ihipPrintKernelLaunch(const char* kernelName, const grid_launch_parm* lp, // Called just before a kernel is launched from hipLaunchKernel. // Allows runtime to track some information about the stream. hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm* lp, - const char* kernelNameStr) { - stream = ihipSyncAndResolveStream(stream); + const char* kernelNameStr, bool lockAcquired) { + stream = ihipSyncAndResolveStream(stream, lockAcquired); lp->grid_dim.x = grid.x; lp->grid_dim.y = grid.y; lp->grid_dim.z = grid.z; @@ -1589,8 +1597,13 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_ lp->barrier_bit = barrier_bit_queue_default; lp->launch_fence = -1; - auto crit = stream->lockopen_preKernelCommand(); - lp->av = &(crit->_av); + if (!lockAcquired) { + auto crit = stream->lockopen_preKernelCommand(); + lp->av = &(crit->_av); + } else { + // this stream is already locked (e.g., call from hipExtLaunchMultiKernelMultiDevice) + lp->av = &(stream->criticalData()._av); + } lp->cf = nullptr; ihipPrintKernelLaunch(kernelNameStr, lp, stream); @@ -1599,30 +1612,30 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm* lp, - const char* kernelNameStr) { - return ihipPreLaunchKernel(stream, dim3(grid), block, lp, kernelNameStr); + const char* kernelNameStr, bool lockAcquired) { + return ihipPreLaunchKernel(stream, dim3(grid), block, lp, kernelNameStr, lockAcquired); } hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm* lp, - const char* kernelNameStr) { - return ihipPreLaunchKernel(stream, grid, dim3(block), lp, kernelNameStr); + const char* kernelNameStr, bool lockAcquired) { + return ihipPreLaunchKernel(stream, grid, dim3(block), lp, kernelNameStr, lockAcquired); } hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm* lp, - const char* kernelNameStr) { - return ihipPreLaunchKernel(stream, dim3(grid), dim3(block), lp, kernelNameStr); + const char* kernelNameStr, bool lockAcquired) { + return ihipPreLaunchKernel(stream, dim3(grid), dim3(block), lp, kernelNameStr, lockAcquired); } //--- // Called after kernel finishes execution. // This releases the lock on the stream. -void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp) { +void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp, bool unlockPostponed) { tprintf(DB_SYNC, "ihipPostLaunchKernel, unlocking stream\n"); - stream->lockclose_postKernelCommand(kernelName, lp.av); + stream->lockclose_postKernelCommand(kernelName, lp.av, unlockPostponed); if (HIP_PROFILE_API) { MARKER_END(); } diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 7695ea34c1..852e8985ad 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -550,7 +550,7 @@ class ihipStream_t { // Member functions that begin with locked_ are thread-safe accessors - these acquire / release // the critical mutex. LockedAccessor_StreamCrit_t lockopen_preKernelCommand(); - void lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av); + void lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av, bool unlockNotNeeded = 0); void locked_wait(); @@ -952,7 +952,7 @@ hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod, const char* name, hsa_agent_t *agent); -hipStream_t ihipSyncAndResolveStream(hipStream_t); +hipStream_t ihipSyncAndResolveStream(hipStream_t, bool lockAcquired = 0); hipError_t ihipStreamSynchronize(hipStream_t stream); void ihipStreamCallbackHandler(ihipStreamCallback_t* cb); diff --git a/src/hip_module.cpp b/src/hip_module.cpp index 78ecea0488..bda6970298 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -150,7 +150,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra, - hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags) { + hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0) { using namespace hip_impl; auto ctx = ihipGetTlsDefaultCtx(); @@ -206,8 +206,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, sharedMemBytes; // TODO - this should be part of preLaunchKernel. hStream = ihipPreLaunchKernel( hStream, dim3(globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ), - dim3(localWorkSizeX, localWorkSizeY, localWorkSizeZ), &lp, f->_name.c_str()); - + dim3(localWorkSizeX, localWorkSizeY, localWorkSizeZ), &lp, f->_name.c_str(), isStreamLocked); hsa_kernel_dispatch_packet_t aql; @@ -272,7 +271,9 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, stopEvent->attachToCompletionFuture(&cf, hStream, hipEventTypeStopCommand); } - ihipPostLaunchKernel(f->_name.c_str(), hStream, lp); + ihipPostLaunchKernel(f->_name.c_str(), hStream, lp, isStreamLocked); + + } return ret; @@ -315,6 +316,75 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, 0)); } +hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, + int numDevices, unsigned int flags) { + + hipError_t result; + + if ((numDevices > g_deviceCnt) || (launchParamsList == nullptr)) { + return hipErrorInvalidValue; + } + + hipFunction_t* kds = reinterpret_cast(malloc(sizeof(hipFunction_t) * numDevices)); + if (kds == nullptr) { + return hipErrorNotInitialized; + } + + // prepare all kernel descriptors for each device as all streams will be locked in the next loop + for (int i = 0; i < numDevices; ++i) { + const hipLaunchParams& lp = launchParamsList[i]; + if (lp.stream == nullptr) { + free(kds); + return hipErrorNotInitialized; + } + kds[i] = hip_impl::get_program_state().kernel_descriptor(reinterpret_cast(lp.func), + hip_impl::target_agent(lp.stream)); + if (kds[i] == nullptr) { + free(kds); + return hipErrorInvalidValue; + } + hip_impl::kernargs_size_align kargs = hip_impl::get_program_state().get_kernargs_size_align( + reinterpret_cast(lp.func)); + kds[i]->_kernarg_layout = *reinterpret_cast>*>( + kargs.getHandle()); + } + + // lock all streams before launching kernels to each device + for (int i = 0; i < numDevices; ++i) { + LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false); + #if (__hcc_workweek__ >= 19213) + streamCrit->_av.acquire_locked_hsa_queue(); + #endif + } + + // launch kernels for each device + for (int i = 0; i < numDevices; ++i) { + const hipLaunchParams& lp = launchParamsList[i]; + + result = ihipModuleLaunchKernel(kds[i], + lp.gridDim.x * lp.blockDim.x, + lp.gridDim.y * lp.blockDim.y, + lp.gridDim.z * lp.blockDim.z, + lp.blockDim.x, lp.blockDim.y, + lp.blockDim.z, lp.sharedMem, + lp.stream, lp.args, nullptr, nullptr, nullptr, 0, + true /* stream is already locked above and will be unlocked + in the below code after launching kernels on all devices*/); + } + + // unlock all streams + for (int i = 0; i < numDevices; ++i) { + launchParamsList[i].stream->criticalData().unlock(); + #if (__hcc_workweek__ >= 19213) + launchParamsList[i].stream->criticalData()._av.release_locked_hsa_queue(); + #endif + } + + free(kds); + + return result; +} + namespace hip_impl { hsa_executable_t executable_for(hipModule_t hmod) { return hmod->executable;