From de5de9f2b20043e7d4fe2a5e2c8ce54d9c7e467c Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Tue, 9 Jun 2015 12:58:27 -0400
Subject: [PATCH] P4 to Git Change 1159349 by rili@rili_opencl_stg on
 2015/06/09 12:52:32

	EPR #419313 - Blit Kernel execution split
	                         Enabled flag GPU_ENABLE_LARGE_ALLOCATION
	                         Return error if global work size is larger than 32bits.

	Code Review#7664

Affected files ...

... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_execute.cpp#22 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#285 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#111 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#365 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#234 edit


[ROCm/clr commit: ca7f6ab1192fc27e3903d57bb24a540e1417b756]
---
 .../rocclr/runtime/device/gpu/gpukernel.cpp   |   1 +
 .../rocclr/runtime/device/gpu/gpukernel.hpp   |   5 +
 .../rocclr/runtime/device/gpu/gpuvirtual.cpp  | 467 ++++++++++--------
 projects/clr/rocclr/runtime/utils/flags.hpp   |   2 +-
 4 files changed, 262 insertions(+), 213 deletions(-)

diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
index 08abeaab31..623db25dbb 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -3561,6 +3561,7 @@ HSAILKernel::init(bool finalize)
     //compile kernel down to ISA
     if (finalize) {
         std::string options(compileOptions_.c_str());
+        flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") != 0) ? true: false;
         options.append(" -just-kernel=");
         options.append(openClKernelName.c_str());
         // Append an option so that we can selectively enable a SCOption on CZ
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
index b236b4343b..7147128ff9 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
@@ -889,6 +889,10 @@ public:
     bool dynamicParallelism() const
         { return (flags_.dynamicParallelism_) ? true : false; }
 
+    //! Returns TRUE if kernel is internal kernel
+    bool isInternalKernel() const
+        { return (flags_.internalKernel_) ? true : false; }
+
     //! Finds local workgroup size
     void findLocalWorkSize(
         size_t      workDim,            //!< Work dimension
@@ -961,6 +965,7 @@ private:
             uint    imageEna_: 1;           //!< Kernel uses images
             uint    imageWriteEna_: 1;      //!< Kernel uses image writes
             uint    dynamicParallelism_: 1; //!< Dynamic parallelism enabled
+            uint    internalKernel_: 1;     //!< True: internal kernel
         };
         uint    value_;
         Flags(): value_(0) {}
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
index f519f0ffe4..193c917de3 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -1771,237 +1771,280 @@ VirtualGPU::submitKernelInternalHSA(
         dbgManager->allocParamMemList(numParams);
     }
 
-    // Program the kernel arguments for the GPU execution
-    hsa_kernel_dispatch_packet_t*  aqlPkt =
-        hsaKernel.loadArguments(*this, kernel, sizes, parameters, nativeMem,
-        vmDefQueue, &vmParentWrap, memList);
-    if (NULL == aqlPkt) {
-        LogError("Couldn't load kernel arguments");
-        return false;
+    size_t newOffset[3] = {0, 0, 0};
+    size_t newGlobalSize[3] = {0, 0, 0};
+
+    int dim = -1;
+    int iteration = 1;
+    size_t globalStep = 0;
+    for (uint i = 0; i < sizes.dimensions(); i++) {
+        newGlobalSize[i] = sizes.global()[i];
+        newOffset[i] = sizes.offset()[i];
+    }
+    // Check if it is blit kernel. If it is, then check if split is needed.
+    if (hsaKernel.isInternalKernel()) {
+        // Calculate new group size for each submission
+        for (uint i = 0; i < sizes.dimensions(); i++) {
+            if (sizes.global()[i] > static_cast<size_t>(0xffffffff)) {
+                dim = i;
+                iteration = sizes.global()[i] / 0xC0000000
+                            + ((sizes.global()[i] % 0xC0000000) ? 1: 0);
+                globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration
+                             * sizes.local()[dim];
+                break;
+            }
+        }
     }
 
-    gslMemObject    scratch = NULL;
-    uint            scratchOffset = 0;
-    // Check if the device allocated more registers than the old setup
-    if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) {
-        const Device::ScratchBuffer* scratchObj = dev().scratch(hwRing());
-        const std::vector<Memory*>& mems = scratchObj->memObjs_;
-        scratch = mems[0]->gslResource();
-        memList.push_back(mems[0]);
-        scratchOffset = scratchObj->offset_;
-    }
-
-    // Add GSL handle to the memory list for VidMM
-    for (uint i = 0; i < memList.size(); ++i) {
-        addVmMemory(memList[i]);
-    }
-
-    // HW Debug for the kernel?
-    HwDbgKernelInfo kernelInfo;
-    HwDbgKernelInfo *pKernelInfo = NULL;
-
-    if (dbgManager) {
-        buildKernelInfo(hsaKernel, aqlPkt, kernelInfo, enqueueEvent);
-        pKernelInfo = &kernelInfo;
-    }
-
-    GpuEvent    gpuEvent;
-    // Run AQL dispatch in HW
-    runAqlDispatch(gpuEvent, aqlPkt, vmMems(), cal_.memCount_,
-        scratch, scratchOffset, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo);
-
-    if (dbgManager && (NULL != dbgManager->postDispatchCallBackFunc())) {
-        dbgManager->executePostDispatchCallBack();
-    }
-
-    if (hsaKernel.dynamicParallelism()) {
-        // Make sure exculsive access to the device queue
-        amd::ScopedLock(defQueue->lock());
-
-        if (GPU_PRINT_CHILD_KERNEL != 0) {
-            waitForEvent(&gpuEvent);
-
-            AmdAqlWrap* wraps =  (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]);
-            uint p = 0;
-            for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) {
-                if (wraps[i].state != 0) {
-                    uint j;
-                    if (p == GPU_PRINT_CHILD_KERNEL) {
-                        break;
-                    }
-                    p++;
-                    std::stringstream print;
-                    print.flags(std::ios::right | std::ios_base::hex | std::ios_base::uppercase);
-                    print << "Slot#: "  << i << "\n";
-                    print << "\tenqueue_flags: "  << wraps[i].enqueue_flags   << "\n";
-                    print << "\tcommand_id: "     << wraps[i].command_id      << "\n";
-                    print << "\tchild_counter: "  << wraps[i].child_counter   << "\n";
-                    print << "\tcompletion: "     << wraps[i].completion      << "\n";
-                    print << "\tparent_wrap: "    << wraps[i].parent_wrap     << "\n";
-                    print << "\twait_list: "      << wraps[i].wait_list       << "\n";
-                    print << "\twait_num: "       << wraps[i].wait_num        << "\n";
-                    uint offsEvents = wraps[i].wait_list -
-                        gpuDefQueue->virtualQueue_->vmAddress();
-                    size_t* events = reinterpret_cast<size_t*>(
-                        gpuDefQueue->virtualQueue_->data() + offsEvents);
-                    for (j = 0; j < wraps[i].wait_num; ++j) {
-                        uint offs = static_cast<uint64_t>(events[j]) -
-                            gpuDefQueue->virtualQueue_->vmAddress();
-                        AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs);
-                        print << "Wait Event#: " << j << "\n";
-                        print << "\tState: " << eventD->state <<
-                                 "; Counter: " << eventD->counter << "\n";
-                    }
-                    print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", ";
-                    print << wraps[i].aql.workgroup_size_y << ", ";
-                    print << wraps[i].aql.workgroup_size_z << "]\n";
-                    print << "GridSize[ " << wraps[i].aql.grid_size_x << ", ";
-                    print << wraps[i].aql.grid_size_y << ", ";
-                    print << wraps[i].aql.grid_size_z << "]\n";
-
-                    uint64_t* kernels = (uint64_t*)(
-                        const_cast<Memory*>(hsaKernel.prog().kernelTable())->map(this));
-                    for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) {
-                        if (kernels[j] == wraps[i].aql.kernel_object) {
-                            break;
-                        }
-                    }
-                    const_cast<Memory*>(hsaKernel.prog().kernelTable())->unmap(this);
-                    HSAILKernel* child = NULL;
-                    for (auto it = hsaKernel.prog().kernels().begin();
-                         it != hsaKernel.prog().kernels().end(); ++it) {
-                        if (j == static_cast<HSAILKernel*>(it->second)->index()) {
-                            child = static_cast<HSAILKernel*>(it->second);
-                        }
-                    }
-                    if (child == NULL) {
-                        printf("Error: couldn't find child kernel!\n");
-                        continue;
-                    }
-                    const uint64_t kernarg_address =
-                      static_cast<uint64_t>(reinterpret_cast<uintptr_t>(wraps[i].aql.kernarg_address));
-                    uint offsArg = kernarg_address -
-                        gpuDefQueue->virtualQueue_->vmAddress();
-                    address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
-                    print << "Kernel: " << child->name() << "\n";
-                    static const char* Names[HSAILKernel::ExtraArguments] = {
-                    "Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "};
-                    for (j = 0; j < HSAILKernel::ExtraArguments; ++j) {
-                        print << "\t" << Names[j] << *(size_t*)argum;
-                        print << "\n";
-                        argum += sizeof(size_t);
-                    }
-                    for (j = 0; j < child->numArguments(); ++j) {
-                        print << "\t" << child->argument(j)->name_ << ": ";
-                        for (int s = child->argument(j)->size_ - 1; s >= 0; --s) {
-                            print.width(2);
-                            print.fill('0');
-                            print << (uint32_t)(argum[s]);
-                        }
-                        argum += child->argument(j)->size_;
-                        print << "\n";
-                    }
-                    printf("%s", print.str().c_str());
-                }
+    for (int j = 0; j < iteration; j++) {
+        // Reset global size for dimension dim if split is needed
+        if (dim != -1) {
+            newOffset[dim] = sizes.offset()[dim] + globalStep * j;
+            if (((newOffset[dim] + globalStep) < sizes.global()[dim]) &&
+                (j != (iteration - 1))) {
+                newGlobalSize[dim] = globalStep;
+            }
+            else {
+                newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim];
             }
         }
 
-        if (!dev().settings().useDeviceQueue_) {
-            // Add the termination handshake to the host queue
-            virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
-                vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
-                vmParentWrap + offsetof(AmdAqlWrap, child_counter),
-                0, dev().settings().useDeviceQueue_);
+        amd::NDRangeContainer  tmpSizes(sizes.dimensions(),
+            &newOffset[0], &newGlobalSize[0],
+            &(const_cast<amd::NDRangeContainer&>(sizes).local()[0]));
+
+        // Program the kernel arguments for the GPU execution
+        hsa_kernel_dispatch_packet_t*  aqlPkt =
+            hsaKernel.loadArguments(*this, kernel, tmpSizes, parameters, nativeMem,
+            vmDefQueue, &vmParentWrap, memList);
+        if (NULL == aqlPkt) {
+            LogError("Couldn't load kernel arguments");
+            return false;
         }
 
-        // Get the global loop start before the scheduler
-        mcaddr loopStart = gpuDefQueue->virtualQueueDispatcherStart();
-        static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr()).runScheduler(
-            *gpuDefQueue->virtualQueue_,
-            *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_,
-            gpuDefQueue->vqHeader_->aql_slot_num);
-        const static bool FlushL2 = true;
-        gpuDefQueue->flushCUCaches(FlushL2);
-
-        // Get the address of PM4 template and add write it to params
-        //! @note DMA flush must not occur between patch and the scheduler
-        mcaddr patchStart = gpuDefQueue->virtualQueueDispatcherStart();
-
-        // Program parameters for the scheduler
-        SchedulerParam* param = &reinterpret_cast<SchedulerParam*>
-            (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_];
-        param->signal = 1;
-        // Scale clock to 1024 to avoid 64 bit div in the scheduler
-        param->eng_clk = (1000 * 1024) / dev().info().maxClockFrequency_;
-        param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/;
-        param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
-        param->releaseHostCP = 0;
-        param->parentAQL = vmParentWrap;
-        param->dedicatedQueue = dev().settings().useDeviceQueue_;
-
-        // Fill the scratch buffer information
-        if (hsaKernel.prog().maxScratchRegs() > 0) {
-            gpu::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObjs_[0];
-            param->scratchSize = scratchBuf->size();
-            param->scratch = scratchBuf->vmAddress();
-            param->numMaxWaves = 32 * dev().info().maxComputeUnits_;
-            param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_;
-            memList.push_back(scratchBuf);
+        gslMemObject    scratch = NULL;
+        uint            scratchOffset = 0;
+        // Check if the device allocated more registers than the old setup
+        if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) {
+            const Device::ScratchBuffer* scratchObj = dev().scratch(hwRing());
+            const std::vector<Memory*>& mems = scratchObj->memObjs_;
+            scratch = mems[0]->gslResource();
+            memList.push_back(mems[0]);
+            scratchOffset = scratchObj->offset_;
         }
-        else {
-            param->numMaxWaves = 0;
-            param->scratchSize = 0;
-            param->scratch = 0;
-            param->scratchOffset = 0;
-        }
-
-        // Add all kernels in the program to the mem list.
-        //! \note Runtime doesn't know which one will be called
-        hsaKernel.prog().fillResListWithKernels(memList);
 
         // Add GSL handle to the memory list for VidMM
         for (uint i = 0; i < memList.size(); ++i) {
-            gpuDefQueue->addVmMemory(memList[i]);
+            addVmMemory(memList[i]);
         }
 
-        mcaddr  signalAddr = gpuDefQueue->schedParams_->vmAddress() +
-            gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam);
-        gpuDefQueue->virtualQueueDispatcherEnd(gpuEvent,
-            gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_,
-            signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / 32);
+        // HW Debug for the kernel?
+        HwDbgKernelInfo kernelInfo;
+        HwDbgKernelInfo *pKernelInfo = NULL;
+
+        if (dbgManager) {
+            buildKernelInfo(hsaKernel, aqlPkt, kernelInfo, enqueueEvent);
+            pKernelInfo = &kernelInfo;
+        }
+
+        GpuEvent    gpuEvent;
+        // Run AQL dispatch in HW
+        runAqlDispatch(gpuEvent, aqlPkt, vmMems(), cal_.memCount_,
+            scratch, scratchOffset, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo);
+
+        if (dbgManager && (NULL != dbgManager->postDispatchCallBackFunc())) {
+            dbgManager->executePostDispatchCallBack();
+        }
+
+        if (hsaKernel.dynamicParallelism()) {
+            // Make sure exculsive access to the device queue
+            amd::ScopedLock(defQueue->lock());
+
+            if (GPU_PRINT_CHILD_KERNEL != 0) {
+                waitForEvent(&gpuEvent);
+
+                AmdAqlWrap* wraps =  (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]);
+                uint p = 0;
+                for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) {
+                    if (wraps[i].state != 0) {
+                        uint j;
+                        if (p == GPU_PRINT_CHILD_KERNEL) {
+                            break;
+                        }
+                        p++;
+                        std::stringstream print;
+                        print.flags(std::ios::right | std::ios_base::hex | std::ios_base::uppercase);
+                        print << "Slot#: "  << i << "\n";
+                        print << "\tenqueue_flags: "  << wraps[i].enqueue_flags   << "\n";
+                        print << "\tcommand_id: "     << wraps[i].command_id      << "\n";
+                        print << "\tchild_counter: "  << wraps[i].child_counter   << "\n";
+                        print << "\tcompletion: "     << wraps[i].completion      << "\n";
+                        print << "\tparent_wrap: "    << wraps[i].parent_wrap     << "\n";
+                        print << "\twait_list: "      << wraps[i].wait_list       << "\n";
+                        print << "\twait_num: "       << wraps[i].wait_num        << "\n";
+                        uint offsEvents = wraps[i].wait_list -
+                            gpuDefQueue->virtualQueue_->vmAddress();
+                        size_t* events = reinterpret_cast<size_t*>(
+                            gpuDefQueue->virtualQueue_->data() + offsEvents);
+                        for (j = 0; j < wraps[i].wait_num; ++j) {
+                            uint offs = static_cast<uint64_t>(events[j]) -
+                                gpuDefQueue->virtualQueue_->vmAddress();
+                            AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs);
+                            print << "Wait Event#: " << j << "\n";
+                            print << "\tState: " << eventD->state <<
+                                     "; Counter: " << eventD->counter << "\n";
+                        }
+                        print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", ";
+                        print << wraps[i].aql.workgroup_size_y << ", ";
+                        print << wraps[i].aql.workgroup_size_z << "]\n";
+                        print << "GridSize[ " << wraps[i].aql.grid_size_x << ", ";
+                        print << wraps[i].aql.grid_size_y << ", ";
+                        print << wraps[i].aql.grid_size_z << "]\n";
+
+                        uint64_t* kernels = (uint64_t*)(
+                            const_cast<Memory*>(hsaKernel.prog().kernelTable())->map(this));
+                        for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) {
+                            if (kernels[j] == wraps[i].aql.kernel_object) {
+                                break;
+                            }
+                        }
+                        const_cast<Memory*>(hsaKernel.prog().kernelTable())->unmap(this);
+                        HSAILKernel* child = NULL;
+                        for (auto it = hsaKernel.prog().kernels().begin();
+                             it != hsaKernel.prog().kernels().end(); ++it) {
+                            if (j == static_cast<HSAILKernel*>(it->second)->index()) {
+                                child = static_cast<HSAILKernel*>(it->second);
+                            }
+                        }
+                        if (child == NULL) {
+                            printf("Error: couldn't find child kernel!\n");
+                            continue;
+                        }
+                        const uint64_t kernarg_address =
+                          static_cast<uint64_t>(reinterpret_cast<uintptr_t>(wraps[i].aql.kernarg_address));
+                        uint offsArg = kernarg_address -
+                            gpuDefQueue->virtualQueue_->vmAddress();
+                        address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
+                        print << "Kernel: " << child->name() << "\n";
+                        static const char* Names[HSAILKernel::ExtraArguments] = {
+                        "Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "};
+                        for (j = 0; j < HSAILKernel::ExtraArguments; ++j) {
+                            print << "\t" << Names[j] << *(size_t*)argum;
+                            print << "\n";
+                            argum += sizeof(size_t);
+                        }
+                        for (j = 0; j < child->numArguments(); ++j) {
+                            print << "\t" << child->argument(j)->name_ << ": ";
+                            for (int s = child->argument(j)->size_ - 1; s >= 0; --s) {
+                                print.width(2);
+                                print.fill('0');
+                                print << (uint32_t)(argum[s]);
+                            }
+                            argum += child->argument(j)->size_;
+                            print << "\n";
+                        }
+                        printf("%s", print.str().c_str());
+                    }
+                }
+            }
+
+            if (!dev().settings().useDeviceQueue_) {
+                // Add the termination handshake to the host queue
+                virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
+                    vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
+                    vmParentWrap + offsetof(AmdAqlWrap, child_counter),
+                    0, dev().settings().useDeviceQueue_);
+            }
+
+            // Get the global loop start before the scheduler
+            mcaddr loopStart = gpuDefQueue->virtualQueueDispatcherStart();
+            static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr()).runScheduler(
+                *gpuDefQueue->virtualQueue_,
+                *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_,
+                gpuDefQueue->vqHeader_->aql_slot_num);
+            const static bool FlushL2 = true;
+            gpuDefQueue->flushCUCaches(FlushL2);
+
+            // Get the address of PM4 template and add write it to params
+            //! @note DMA flush must not occur between patch and the scheduler
+            mcaddr patchStart = gpuDefQueue->virtualQueueDispatcherStart();
+
+            // Program parameters for the scheduler
+            SchedulerParam* param = &reinterpret_cast<SchedulerParam*>
+                (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_];
+            param->signal = 1;
+            // Scale clock to 1024 to avoid 64 bit div in the scheduler
+            param->eng_clk = (1000 * 1024) / dev().info().maxClockFrequency_;
+            param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/;
+            param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
+            param->releaseHostCP = 0;
+            param->parentAQL = vmParentWrap;
+            param->dedicatedQueue = dev().settings().useDeviceQueue_;
+
+            // Fill the scratch buffer information
+            if (hsaKernel.prog().maxScratchRegs() > 0) {
+                gpu::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObjs_[0];
+                param->scratchSize = scratchBuf->size();
+                param->scratch = scratchBuf->vmAddress();
+                param->numMaxWaves = 32 * dev().info().maxComputeUnits_;
+                param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_;
+                memList.push_back(scratchBuf);
+            }
+            else {
+                param->numMaxWaves = 0;
+                param->scratchSize = 0;
+                param->scratch = 0;
+                param->scratchOffset = 0;
+            }
+
+            // Add all kernels in the program to the mem list.
+            //! \note Runtime doesn't know which one will be called
+            hsaKernel.prog().fillResListWithKernels(memList);
+
+            // Add GSL handle to the memory list for VidMM
+            for (uint i = 0; i < memList.size(); ++i) {
+                gpuDefQueue->addVmMemory(memList[i]);
+            }
+
+            mcaddr  signalAddr = gpuDefQueue->schedParams_->vmAddress() +
+                gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam);
+            gpuDefQueue->virtualQueueDispatcherEnd(gpuEvent,
+                gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_,
+                signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / 32);
+
+            // Set GPU event for the used resources
+            for (uint i = 0; i < memList.size(); ++i) {
+                memList[i]->setBusy(*gpuDefQueue, gpuEvent);
+            }
+
+            if (dev().settings().useDeviceQueue_) {
+                // Add the termination handshake to the host queue
+                virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
+                    vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
+                    vmParentWrap + offsetof(AmdAqlWrap, child_counter),
+                    signalAddr, dev().settings().useDeviceQueue_);
+            }
+
+            ++gpuDefQueue->schedParamIdx_ %=
+                gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam);
+            //! \todo optimize the wrap around
+            if (gpuDefQueue->schedParamIdx_ == 0) {
+                gpuDefQueue->schedParams_->wait(*gpuDefQueue);
+            }
+        }
 
         // Set GPU event for the used resources
         for (uint i = 0; i < memList.size(); ++i) {
-            memList[i]->setBusy(*gpuDefQueue, gpuEvent);
+            memList[i]->setBusy(*this, gpuEvent);
         }
 
-        if (dev().settings().useDeviceQueue_) {
-            // Add the termination handshake to the host queue
-            virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
-                vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
-                vmParentWrap + offsetof(AmdAqlWrap, child_counter),
-                signalAddr, dev().settings().useDeviceQueue_);
+        // Update the global GPU event
+        setGpuEvent(gpuEvent);
+
+        if (!printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) {
+            LogError("Couldn't read printf data from the buffer!\n");
+            return false;
         }
-
-        ++gpuDefQueue->schedParamIdx_ %=
-            gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam);
-        //! \todo optimize the wrap around
-        if (gpuDefQueue->schedParamIdx_ == 0) {
-            gpuDefQueue->schedParams_->wait(*gpuDefQueue);
-        }
-    }
-
-    // Set GPU event for the used resources
-    for (uint i = 0; i < memList.size(); ++i) {
-        memList[i]->setBusy(*this, gpuEvent);
-    }
-
-    // Update the global GPU event
-    setGpuEvent(gpuEvent);
-
-    if (!printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) {
-        LogError("Couldn't read printf data from the buffer!\n");
-        return false;
     }
 
     // Runtime submitted a HSAIL kernel
diff --git a/projects/clr/rocclr/runtime/utils/flags.hpp b/projects/clr/rocclr/runtime/utils/flags.hpp
index beb76aa56f..cf941095bd 100644
--- a/projects/clr/rocclr/runtime/utils/flags.hpp
+++ b/projects/clr/rocclr/runtime/utils/flags.hpp
@@ -154,7 +154,7 @@ release(bool, GPU_DIRECT_SRD, false,                                          \
         "Use indirect SRD access in HSAIL")                                   \
 release(bool, GPU_USE_DEVICE_QUEUE, false,                                    \
         "Use a dedicated device queue for the actual submissions")            \
-release(bool, GPU_ENABLE_LARGE_ALLOCATION, false,                             \
+release(bool, GPU_ENABLE_LARGE_ALLOCATION, true,                             \
         "Enable >4GB single allocations")                                     \
 release(bool, AMD_THREAD_TRACE_ENABLE, !IS_MAINLINE,                          \
         "Enable thread trace extension")                                      \