Merge pull request #949 from gargrahul/single_stream_concurrent_kernels

Add extension for kernel concurrency on same stream [ROCm/hip commit: 352b17346c]
2019-03-06 17:34:54 +05:30
@@ -89,13 +89,23 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view** a

 * HIP/ROCm actually updates the start event when the associated kernel completes.
 */
+hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
+                                    uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
+                                    uint32_t localWorkSizeX, uint32_t localWorkSizeY,
+                                    uint32_t localWorkSizeZ, size_t sharedMemBytes,
+                                    hipStream_t hStream, void** kernelParams, void** extra,
+                                    hipEvent_t startEvent = nullptr,
+                                    hipEvent_t stopEvent = nullptr,
+                                    uint32_t flags = 0);
+
 hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
                                    uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
                                    uint32_t localWorkSizeX, uint32_t localWorkSizeY,
                                    uint32_t localWorkSizeZ, size_t sharedMemBytes,
                                    hipStream_t hStream, void** kernelParams, void** extra,
                                    hipEvent_t startEvent = nullptr,
-                                    hipEvent_t stopEvent = nullptr);
+                                    hipEvent_t stopEvent = nullptr)
+                                    __attribute__((deprecated("use hipExtModuleLaunchKernel instead")));;

 // doxygen end HCC-specific features
 /**
@@ -88,7 +88,7 @@ int main() {
                      HIP_LAUNCH_PARAM_END};

    HIP_CHECK(
-        hipHccModuleLaunchKernel(Function, LEN, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config));
+        hipExtModuleLaunchKernel(Function, LEN, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config, 0));

    hipMemcpyDtoH(B, Bd, SIZE);

@@ -130,7 +130,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
                                  uint32_t localWorkSizeX, uint32_t localWorkSizeY,
                                  uint32_t localWorkSizeZ, size_t sharedMemBytes,
                                  hipStream_t hStream, void** kernelParams, void** extra,
-                                  hipEvent_t startEvent, hipEvent_t stopEvent) {
+                                  hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags) {
    auto ctx = ihipGetTlsDefaultCtx();
    hipError_t ret = hipSuccess;

@@ -202,8 +202,11 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
        aql.kernel_object = f->_object;
        aql.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
        aql.header =
-            (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
-            (1 << HSA_PACKET_HEADER_BARRIER);  // TODO - honor queue setting for execute_in_order
+            (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE);
+        if((flags & 0x1)== 0 ) {
+            //in_order
+            aql.header |= (1 << HSA_PACKET_HEADER_BARRIER);
+        }

        if (HCC_OPT_FLUSH) {
            aql.header |= (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
@@ -250,9 +253,21 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, uint32_t gr
                 hStream, kernelParams, extra);
    return ihipLogStatus(ihipModuleLaunchKernel(
        f, blockDimX * gridDimX, blockDimY * gridDimY, gridDimZ * blockDimZ, blockDimX, blockDimY,
-        blockDimZ, sharedMemBytes, hStream, kernelParams, extra, nullptr, nullptr));
+        blockDimZ, sharedMemBytes, hStream, kernelParams, extra, nullptr, nullptr, 0));
 }

+hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
+                                    uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
+                                    uint32_t localWorkSizeX, uint32_t localWorkSizeY,
+                                    uint32_t localWorkSizeZ, size_t sharedMemBytes,
+                                    hipStream_t hStream, void** kernelParams, void** extra,
+                                    hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags) {
+    HIP_INIT_API(hipHccModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX,
+                 localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra);
+    return ihipLogStatus(ihipModuleLaunchKernel(
+        f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
+        localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags));
+}

 hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
                                    uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
@@ -264,7 +279,7 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
                 localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra);
    return ihipLogStatus(ihipModuleLaunchKernel(
        f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
-        localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent));
+        localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, 0));
 }

 namespace hip_impl {
@@ -61,8 +61,11 @@ hipError_t ihipStreamCreate(hipStream_t* stream, unsigned int flags, int priorit

            // TODO - se try-catch loop to detect memory exception?
            //
-            // Note this is an execute_in_order queue, so all kernels submitted will atuomatically
-            // wait for prev to complete: This matches CUDA stream behavior:
+            // Note this is an execute_any_order queue, 
+            // CUDA stream behavior is that all kernels submitted will automatically
+            // wait for prev to complete, this behaviour will be mainatined by 
+            // hipModuleLaunchKernel. execute_any_order will help 
+            // hipExtModuleLaunchKernel , which uses a special flag

            {
                // Obtain mutex access to the device critical data, release by destructor
@@ -71,7 +74,7 @@ hipError_t ihipStreamCreate(hipStream_t* stream, unsigned int flags, int priorit
 #if defined(__HCC__) && (__hcc_minor__ < 3)
                auto istream = new ihipStream_t(ctx, acc.create_view(), flags);
 #else
-                auto istream = new ihipStream_t(ctx, acc.create_view(Kalmar::execute_in_order, Kalmar::queuing_mode_automatic, (Kalmar::queue_priority)priority), flags);
+                auto istream = new ihipStream_t(ctx, acc.create_view(Kalmar::execute_any_order, Kalmar::queuing_mode_automatic, (Kalmar::queue_priority)priority), flags);
 #endif

                ctxCrit->addStream(istream);