From 5d6fd17fbe06ded37cf68212a2c8e3877549f23a Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Wed, 6 Mar 2019 12:55:39 +0530 Subject: [PATCH] Add extension for kernel concurrency on same stream [ROCm/hip commit: 59081c69fc51fa1f112b08e2eef4f58cbcb42ee1] --- projects/hip/include/hip/hip_hcc.h | 12 ++++++++- .../0_Intro/module_api/launchKernelHcc.cpp | 2 +- projects/hip/src/hip_module.cpp | 25 +++++++++++++++---- projects/hip/src/hip_stream.cpp | 9 ++++--- 4 files changed, 38 insertions(+), 10 deletions(-) diff --git a/projects/hip/include/hip/hip_hcc.h b/projects/hip/include/hip/hip_hcc.h index c07a57fb3a..68889ced4e 100644 --- a/projects/hip/include/hip/hip_hcc.h +++ b/projects/hip/include/hip/hip_hcc.h @@ -89,13 +89,23 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view** a * HIP/ROCm actually updates the start event when the associated kernel completes. */ +hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, + uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, + uint32_t localWorkSizeX, uint32_t localWorkSizeY, + uint32_t localWorkSizeZ, size_t sharedMemBytes, + hipStream_t hStream, void** kernelParams, void** extra, + hipEvent_t startEvent = nullptr, + hipEvent_t stopEvent = nullptr, + uint32_t flags = 0); + hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra, hipEvent_t startEvent = nullptr, - hipEvent_t stopEvent = nullptr); + hipEvent_t stopEvent = nullptr) + __attribute__((deprecated("use hipExtModuleLaunchKernel instead")));; // doxygen end HCC-specific features /** diff --git a/projects/hip/samples/0_Intro/module_api/launchKernelHcc.cpp b/projects/hip/samples/0_Intro/module_api/launchKernelHcc.cpp index f6bb9e5361..217f60891d 100644 --- a/projects/hip/samples/0_Intro/module_api/launchKernelHcc.cpp +++ b/projects/hip/samples/0_Intro/module_api/launchKernelHcc.cpp @@ -88,7 +88,7 @@ int main() { HIP_LAUNCH_PARAM_END}; HIP_CHECK( - hipHccModuleLaunchKernel(Function, LEN, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config)); + hipExtModuleLaunchKernel(Function, LEN, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config, 0)); hipMemcpyDtoH(B, Bd, SIZE); diff --git a/projects/hip/src/hip_module.cpp b/projects/hip/src/hip_module.cpp index 780240c067..c2519bc0d8 100644 --- a/projects/hip/src/hip_module.cpp +++ b/projects/hip/src/hip_module.cpp @@ -131,7 +131,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra, - hipEvent_t startEvent, hipEvent_t stopEvent) { + hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags) { auto ctx = ihipGetTlsDefaultCtx(); hipError_t ret = hipSuccess; @@ -203,8 +203,11 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, aql.kernel_object = f->_object; aql.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; aql.header = - (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER); // TODO - honor queue setting for execute_in_order + (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE); + if((flags & 0x1)== 0 ) { + //in_order + aql.header |= (1 << HSA_PACKET_HEADER_BARRIER); + } if (HCC_OPT_FLUSH) { aql.header |= (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | @@ -251,9 +254,21 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, uint32_t gr hStream, kernelParams, extra); return ihipLogStatus(ihipModuleLaunchKernel( f, blockDimX * gridDimX, blockDimY * gridDimY, gridDimZ * blockDimZ, blockDimX, blockDimY, - blockDimZ, sharedMemBytes, hStream, kernelParams, extra, nullptr, nullptr)); + blockDimZ, sharedMemBytes, hStream, kernelParams, extra, nullptr, nullptr, 0)); } +hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, + uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, + uint32_t localWorkSizeX, uint32_t localWorkSizeY, + uint32_t localWorkSizeZ, size_t sharedMemBytes, + hipStream_t hStream, void** kernelParams, void** extra, + hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags) { + HIP_INIT_API(hipHccModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, + localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra); + return ihipLogStatus(ihipModuleLaunchKernel( + f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, + localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags)); +} hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, @@ -265,7 +280,7 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra); return ihipLogStatus(ihipModuleLaunchKernel( f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, - localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent)); + localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, 0)); } namespace { diff --git a/projects/hip/src/hip_stream.cpp b/projects/hip/src/hip_stream.cpp index 5331b41576..c328e34b79 100644 --- a/projects/hip/src/hip_stream.cpp +++ b/projects/hip/src/hip_stream.cpp @@ -61,8 +61,11 @@ hipError_t ihipStreamCreate(hipStream_t* stream, unsigned int flags, int priorit // TODO - se try-catch loop to detect memory exception? // - // Note this is an execute_in_order queue, so all kernels submitted will atuomatically - // wait for prev to complete: This matches CUDA stream behavior: + // Note this is an execute_any_order queue, + // CUDA stream behavior is that all kernels submitted will automatically + // wait for prev to complete, this behaviour will be mainatined by + // hipModuleLaunchKernel. execute_any_order will help + // hipExtModuleLaunchKernel , which uses a special flag { // Obtain mutex access to the device critical data, release by destructor @@ -71,7 +74,7 @@ hipError_t ihipStreamCreate(hipStream_t* stream, unsigned int flags, int priorit #if defined(__HCC__) && (__hcc_minor__ < 3) auto istream = new ihipStream_t(ctx, acc.create_view(), flags); #else - auto istream = new ihipStream_t(ctx, acc.create_view(Kalmar::execute_in_order, Kalmar::queuing_mode_automatic, (Kalmar::queue_priority)priority), flags); + auto istream = new ihipStream_t(ctx, acc.create_view(Kalmar::execute_any_order, Kalmar::queuing_mode_automatic, (Kalmar::queue_priority)priority), flags); #endif ctxCrit->addStream(istream);