From de5de9f2b20043e7d4fe2a5e2c8ce54d9c7e467c Mon Sep 17 00:00:00 2001
From: foreman
Date: Tue, 9 Jun 2015 12:58:27 -0400
Subject: [PATCH] P4 to Git Change 1159349 by rili@rili_opencl_stg on
2015/06/09 12:52:32
EPR #419313 - Blit Kernel execution split
Enabled flag GPU_ENABLE_LARGE_ALLOCATION
Return error if global work size is larger than 32bits.
Code Review#7664
Affected files ...
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_execute.cpp#22 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#285 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#111 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#365 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#234 edit
[ROCm/clr commit: ca7f6ab1192fc27e3903d57bb24a540e1417b756]
---
.../rocclr/runtime/device/gpu/gpukernel.cpp | 1 +
.../rocclr/runtime/device/gpu/gpukernel.hpp | 5 +
.../rocclr/runtime/device/gpu/gpuvirtual.cpp | 467 ++++++++++--------
projects/clr/rocclr/runtime/utils/flags.hpp | 2 +-
4 files changed, 262 insertions(+), 213 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
index 08abeaab31..623db25dbb 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -3561,6 +3561,7 @@ HSAILKernel::init(bool finalize)
//compile kernel down to ISA
if (finalize) {
std::string options(compileOptions_.c_str());
+ flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") != 0) ? true: false;
options.append(" -just-kernel=");
options.append(openClKernelName.c_str());
// Append an option so that we can selectively enable a SCOption on CZ
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
index b236b4343b..7147128ff9 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
@@ -889,6 +889,10 @@ public:
bool dynamicParallelism() const
{ return (flags_.dynamicParallelism_) ? true : false; }
+ //! Returns TRUE if kernel is internal kernel
+ bool isInternalKernel() const
+ { return (flags_.internalKernel_) ? true : false; }
+
//! Finds local workgroup size
void findLocalWorkSize(
size_t workDim, //!< Work dimension
@@ -961,6 +965,7 @@ private:
uint imageEna_: 1; //!< Kernel uses images
uint imageWriteEna_: 1; //!< Kernel uses image writes
uint dynamicParallelism_: 1; //!< Dynamic parallelism enabled
+ uint internalKernel_: 1; //!< True: internal kernel
};
uint value_;
Flags(): value_(0) {}
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
index f519f0ffe4..193c917de3 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -1771,237 +1771,280 @@ VirtualGPU::submitKernelInternalHSA(
dbgManager->allocParamMemList(numParams);
}
- // Program the kernel arguments for the GPU execution
- hsa_kernel_dispatch_packet_t* aqlPkt =
- hsaKernel.loadArguments(*this, kernel, sizes, parameters, nativeMem,
- vmDefQueue, &vmParentWrap, memList);
- if (NULL == aqlPkt) {
- LogError("Couldn't load kernel arguments");
- return false;
+ size_t newOffset[3] = {0, 0, 0};
+ size_t newGlobalSize[3] = {0, 0, 0};
+
+ int dim = -1;
+ int iteration = 1;
+ size_t globalStep = 0;
+ for (uint i = 0; i < sizes.dimensions(); i++) {
+ newGlobalSize[i] = sizes.global()[i];
+ newOffset[i] = sizes.offset()[i];
+ }
+ // Check if it is blit kernel. If it is, then check if split is needed.
+ if (hsaKernel.isInternalKernel()) {
+ // Calculate new group size for each submission
+ for (uint i = 0; i < sizes.dimensions(); i++) {
+ if (sizes.global()[i] > static_cast(0xffffffff)) {
+ dim = i;
+ iteration = sizes.global()[i] / 0xC0000000
+ + ((sizes.global()[i] % 0xC0000000) ? 1: 0);
+ globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration
+ * sizes.local()[dim];
+ break;
+ }
+ }
}
- gslMemObject scratch = NULL;
- uint scratchOffset = 0;
- // Check if the device allocated more registers than the old setup
- if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) {
- const Device::ScratchBuffer* scratchObj = dev().scratch(hwRing());
- const std::vector& mems = scratchObj->memObjs_;
- scratch = mems[0]->gslResource();
- memList.push_back(mems[0]);
- scratchOffset = scratchObj->offset_;
- }
-
- // Add GSL handle to the memory list for VidMM
- for (uint i = 0; i < memList.size(); ++i) {
- addVmMemory(memList[i]);
- }
-
- // HW Debug for the kernel?
- HwDbgKernelInfo kernelInfo;
- HwDbgKernelInfo *pKernelInfo = NULL;
-
- if (dbgManager) {
- buildKernelInfo(hsaKernel, aqlPkt, kernelInfo, enqueueEvent);
- pKernelInfo = &kernelInfo;
- }
-
- GpuEvent gpuEvent;
- // Run AQL dispatch in HW
- runAqlDispatch(gpuEvent, aqlPkt, vmMems(), cal_.memCount_,
- scratch, scratchOffset, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo);
-
- if (dbgManager && (NULL != dbgManager->postDispatchCallBackFunc())) {
- dbgManager->executePostDispatchCallBack();
- }
-
- if (hsaKernel.dynamicParallelism()) {
- // Make sure exculsive access to the device queue
- amd::ScopedLock(defQueue->lock());
-
- if (GPU_PRINT_CHILD_KERNEL != 0) {
- waitForEvent(&gpuEvent);
-
- AmdAqlWrap* wraps = (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]);
- uint p = 0;
- for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) {
- if (wraps[i].state != 0) {
- uint j;
- if (p == GPU_PRINT_CHILD_KERNEL) {
- break;
- }
- p++;
- std::stringstream print;
- print.flags(std::ios::right | std::ios_base::hex | std::ios_base::uppercase);
- print << "Slot#: " << i << "\n";
- print << "\tenqueue_flags: " << wraps[i].enqueue_flags << "\n";
- print << "\tcommand_id: " << wraps[i].command_id << "\n";
- print << "\tchild_counter: " << wraps[i].child_counter << "\n";
- print << "\tcompletion: " << wraps[i].completion << "\n";
- print << "\tparent_wrap: " << wraps[i].parent_wrap << "\n";
- print << "\twait_list: " << wraps[i].wait_list << "\n";
- print << "\twait_num: " << wraps[i].wait_num << "\n";
- uint offsEvents = wraps[i].wait_list -
- gpuDefQueue->virtualQueue_->vmAddress();
- size_t* events = reinterpret_cast(
- gpuDefQueue->virtualQueue_->data() + offsEvents);
- for (j = 0; j < wraps[i].wait_num; ++j) {
- uint offs = static_cast(events[j]) -
- gpuDefQueue->virtualQueue_->vmAddress();
- AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs);
- print << "Wait Event#: " << j << "\n";
- print << "\tState: " << eventD->state <<
- "; Counter: " << eventD->counter << "\n";
- }
- print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", ";
- print << wraps[i].aql.workgroup_size_y << ", ";
- print << wraps[i].aql.workgroup_size_z << "]\n";
- print << "GridSize[ " << wraps[i].aql.grid_size_x << ", ";
- print << wraps[i].aql.grid_size_y << ", ";
- print << wraps[i].aql.grid_size_z << "]\n";
-
- uint64_t* kernels = (uint64_t*)(
- const_cast(hsaKernel.prog().kernelTable())->map(this));
- for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) {
- if (kernels[j] == wraps[i].aql.kernel_object) {
- break;
- }
- }
- const_cast(hsaKernel.prog().kernelTable())->unmap(this);
- HSAILKernel* child = NULL;
- for (auto it = hsaKernel.prog().kernels().begin();
- it != hsaKernel.prog().kernels().end(); ++it) {
- if (j == static_cast(it->second)->index()) {
- child = static_cast(it->second);
- }
- }
- if (child == NULL) {
- printf("Error: couldn't find child kernel!\n");
- continue;
- }
- const uint64_t kernarg_address =
- static_cast(reinterpret_cast(wraps[i].aql.kernarg_address));
- uint offsArg = kernarg_address -
- gpuDefQueue->virtualQueue_->vmAddress();
- address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
- print << "Kernel: " << child->name() << "\n";
- static const char* Names[HSAILKernel::ExtraArguments] = {
- "Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "};
- for (j = 0; j < HSAILKernel::ExtraArguments; ++j) {
- print << "\t" << Names[j] << *(size_t*)argum;
- print << "\n";
- argum += sizeof(size_t);
- }
- for (j = 0; j < child->numArguments(); ++j) {
- print << "\t" << child->argument(j)->name_ << ": ";
- for (int s = child->argument(j)->size_ - 1; s >= 0; --s) {
- print.width(2);
- print.fill('0');
- print << (uint32_t)(argum[s]);
- }
- argum += child->argument(j)->size_;
- print << "\n";
- }
- printf("%s", print.str().c_str());
- }
+ for (int j = 0; j < iteration; j++) {
+ // Reset global size for dimension dim if split is needed
+ if (dim != -1) {
+ newOffset[dim] = sizes.offset()[dim] + globalStep * j;
+ if (((newOffset[dim] + globalStep) < sizes.global()[dim]) &&
+ (j != (iteration - 1))) {
+ newGlobalSize[dim] = globalStep;
+ }
+ else {
+ newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim];
}
}
- if (!dev().settings().useDeviceQueue_) {
- // Add the termination handshake to the host queue
- virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
- vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
- vmParentWrap + offsetof(AmdAqlWrap, child_counter),
- 0, dev().settings().useDeviceQueue_);
+ amd::NDRangeContainer tmpSizes(sizes.dimensions(),
+ &newOffset[0], &newGlobalSize[0],
+ &(const_cast(sizes).local()[0]));
+
+ // Program the kernel arguments for the GPU execution
+ hsa_kernel_dispatch_packet_t* aqlPkt =
+ hsaKernel.loadArguments(*this, kernel, tmpSizes, parameters, nativeMem,
+ vmDefQueue, &vmParentWrap, memList);
+ if (NULL == aqlPkt) {
+ LogError("Couldn't load kernel arguments");
+ return false;
}
- // Get the global loop start before the scheduler
- mcaddr loopStart = gpuDefQueue->virtualQueueDispatcherStart();
- static_cast(gpuDefQueue->blitMgr()).runScheduler(
- *gpuDefQueue->virtualQueue_,
- *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_,
- gpuDefQueue->vqHeader_->aql_slot_num);
- const static bool FlushL2 = true;
- gpuDefQueue->flushCUCaches(FlushL2);
-
- // Get the address of PM4 template and add write it to params
- //! @note DMA flush must not occur between patch and the scheduler
- mcaddr patchStart = gpuDefQueue->virtualQueueDispatcherStart();
-
- // Program parameters for the scheduler
- SchedulerParam* param = &reinterpret_cast
- (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_];
- param->signal = 1;
- // Scale clock to 1024 to avoid 64 bit div in the scheduler
- param->eng_clk = (1000 * 1024) / dev().info().maxClockFrequency_;
- param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/;
- param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
- param->releaseHostCP = 0;
- param->parentAQL = vmParentWrap;
- param->dedicatedQueue = dev().settings().useDeviceQueue_;
-
- // Fill the scratch buffer information
- if (hsaKernel.prog().maxScratchRegs() > 0) {
- gpu::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObjs_[0];
- param->scratchSize = scratchBuf->size();
- param->scratch = scratchBuf->vmAddress();
- param->numMaxWaves = 32 * dev().info().maxComputeUnits_;
- param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_;
- memList.push_back(scratchBuf);
+ gslMemObject scratch = NULL;
+ uint scratchOffset = 0;
+ // Check if the device allocated more registers than the old setup
+ if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) {
+ const Device::ScratchBuffer* scratchObj = dev().scratch(hwRing());
+ const std::vector& mems = scratchObj->memObjs_;
+ scratch = mems[0]->gslResource();
+ memList.push_back(mems[0]);
+ scratchOffset = scratchObj->offset_;
}
- else {
- param->numMaxWaves = 0;
- param->scratchSize = 0;
- param->scratch = 0;
- param->scratchOffset = 0;
- }
-
- // Add all kernels in the program to the mem list.
- //! \note Runtime doesn't know which one will be called
- hsaKernel.prog().fillResListWithKernels(memList);
// Add GSL handle to the memory list for VidMM
for (uint i = 0; i < memList.size(); ++i) {
- gpuDefQueue->addVmMemory(memList[i]);
+ addVmMemory(memList[i]);
}
- mcaddr signalAddr = gpuDefQueue->schedParams_->vmAddress() +
- gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam);
- gpuDefQueue->virtualQueueDispatcherEnd(gpuEvent,
- gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_,
- signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / 32);
+ // HW Debug for the kernel?
+ HwDbgKernelInfo kernelInfo;
+ HwDbgKernelInfo *pKernelInfo = NULL;
+
+ if (dbgManager) {
+ buildKernelInfo(hsaKernel, aqlPkt, kernelInfo, enqueueEvent);
+ pKernelInfo = &kernelInfo;
+ }
+
+ GpuEvent gpuEvent;
+ // Run AQL dispatch in HW
+ runAqlDispatch(gpuEvent, aqlPkt, vmMems(), cal_.memCount_,
+ scratch, scratchOffset, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo);
+
+ if (dbgManager && (NULL != dbgManager->postDispatchCallBackFunc())) {
+ dbgManager->executePostDispatchCallBack();
+ }
+
+ if (hsaKernel.dynamicParallelism()) {
+ // Make sure exculsive access to the device queue
+ amd::ScopedLock(defQueue->lock());
+
+ if (GPU_PRINT_CHILD_KERNEL != 0) {
+ waitForEvent(&gpuEvent);
+
+ AmdAqlWrap* wraps = (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]);
+ uint p = 0;
+ for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) {
+ if (wraps[i].state != 0) {
+ uint j;
+ if (p == GPU_PRINT_CHILD_KERNEL) {
+ break;
+ }
+ p++;
+ std::stringstream print;
+ print.flags(std::ios::right | std::ios_base::hex | std::ios_base::uppercase);
+ print << "Slot#: " << i << "\n";
+ print << "\tenqueue_flags: " << wraps[i].enqueue_flags << "\n";
+ print << "\tcommand_id: " << wraps[i].command_id << "\n";
+ print << "\tchild_counter: " << wraps[i].child_counter << "\n";
+ print << "\tcompletion: " << wraps[i].completion << "\n";
+ print << "\tparent_wrap: " << wraps[i].parent_wrap << "\n";
+ print << "\twait_list: " << wraps[i].wait_list << "\n";
+ print << "\twait_num: " << wraps[i].wait_num << "\n";
+ uint offsEvents = wraps[i].wait_list -
+ gpuDefQueue->virtualQueue_->vmAddress();
+ size_t* events = reinterpret_cast(
+ gpuDefQueue->virtualQueue_->data() + offsEvents);
+ for (j = 0; j < wraps[i].wait_num; ++j) {
+ uint offs = static_cast(events[j]) -
+ gpuDefQueue->virtualQueue_->vmAddress();
+ AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs);
+ print << "Wait Event#: " << j << "\n";
+ print << "\tState: " << eventD->state <<
+ "; Counter: " << eventD->counter << "\n";
+ }
+ print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", ";
+ print << wraps[i].aql.workgroup_size_y << ", ";
+ print << wraps[i].aql.workgroup_size_z << "]\n";
+ print << "GridSize[ " << wraps[i].aql.grid_size_x << ", ";
+ print << wraps[i].aql.grid_size_y << ", ";
+ print << wraps[i].aql.grid_size_z << "]\n";
+
+ uint64_t* kernels = (uint64_t*)(
+ const_cast(hsaKernel.prog().kernelTable())->map(this));
+ for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) {
+ if (kernels[j] == wraps[i].aql.kernel_object) {
+ break;
+ }
+ }
+ const_cast(hsaKernel.prog().kernelTable())->unmap(this);
+ HSAILKernel* child = NULL;
+ for (auto it = hsaKernel.prog().kernels().begin();
+ it != hsaKernel.prog().kernels().end(); ++it) {
+ if (j == static_cast(it->second)->index()) {
+ child = static_cast(it->second);
+ }
+ }
+ if (child == NULL) {
+ printf("Error: couldn't find child kernel!\n");
+ continue;
+ }
+ const uint64_t kernarg_address =
+ static_cast(reinterpret_cast(wraps[i].aql.kernarg_address));
+ uint offsArg = kernarg_address -
+ gpuDefQueue->virtualQueue_->vmAddress();
+ address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
+ print << "Kernel: " << child->name() << "\n";
+ static const char* Names[HSAILKernel::ExtraArguments] = {
+ "Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "};
+ for (j = 0; j < HSAILKernel::ExtraArguments; ++j) {
+ print << "\t" << Names[j] << *(size_t*)argum;
+ print << "\n";
+ argum += sizeof(size_t);
+ }
+ for (j = 0; j < child->numArguments(); ++j) {
+ print << "\t" << child->argument(j)->name_ << ": ";
+ for (int s = child->argument(j)->size_ - 1; s >= 0; --s) {
+ print.width(2);
+ print.fill('0');
+ print << (uint32_t)(argum[s]);
+ }
+ argum += child->argument(j)->size_;
+ print << "\n";
+ }
+ printf("%s", print.str().c_str());
+ }
+ }
+ }
+
+ if (!dev().settings().useDeviceQueue_) {
+ // Add the termination handshake to the host queue
+ virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
+ vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
+ vmParentWrap + offsetof(AmdAqlWrap, child_counter),
+ 0, dev().settings().useDeviceQueue_);
+ }
+
+ // Get the global loop start before the scheduler
+ mcaddr loopStart = gpuDefQueue->virtualQueueDispatcherStart();
+ static_cast(gpuDefQueue->blitMgr()).runScheduler(
+ *gpuDefQueue->virtualQueue_,
+ *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_,
+ gpuDefQueue->vqHeader_->aql_slot_num);
+ const static bool FlushL2 = true;
+ gpuDefQueue->flushCUCaches(FlushL2);
+
+ // Get the address of PM4 template and add write it to params
+ //! @note DMA flush must not occur between patch and the scheduler
+ mcaddr patchStart = gpuDefQueue->virtualQueueDispatcherStart();
+
+ // Program parameters for the scheduler
+ SchedulerParam* param = &reinterpret_cast
+ (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_];
+ param->signal = 1;
+ // Scale clock to 1024 to avoid 64 bit div in the scheduler
+ param->eng_clk = (1000 * 1024) / dev().info().maxClockFrequency_;
+ param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/;
+ param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
+ param->releaseHostCP = 0;
+ param->parentAQL = vmParentWrap;
+ param->dedicatedQueue = dev().settings().useDeviceQueue_;
+
+ // Fill the scratch buffer information
+ if (hsaKernel.prog().maxScratchRegs() > 0) {
+ gpu::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObjs_[0];
+ param->scratchSize = scratchBuf->size();
+ param->scratch = scratchBuf->vmAddress();
+ param->numMaxWaves = 32 * dev().info().maxComputeUnits_;
+ param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_;
+ memList.push_back(scratchBuf);
+ }
+ else {
+ param->numMaxWaves = 0;
+ param->scratchSize = 0;
+ param->scratch = 0;
+ param->scratchOffset = 0;
+ }
+
+ // Add all kernels in the program to the mem list.
+ //! \note Runtime doesn't know which one will be called
+ hsaKernel.prog().fillResListWithKernels(memList);
+
+ // Add GSL handle to the memory list for VidMM
+ for (uint i = 0; i < memList.size(); ++i) {
+ gpuDefQueue->addVmMemory(memList[i]);
+ }
+
+ mcaddr signalAddr = gpuDefQueue->schedParams_->vmAddress() +
+ gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam);
+ gpuDefQueue->virtualQueueDispatcherEnd(gpuEvent,
+ gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_,
+ signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / 32);
+
+ // Set GPU event for the used resources
+ for (uint i = 0; i < memList.size(); ++i) {
+ memList[i]->setBusy(*gpuDefQueue, gpuEvent);
+ }
+
+ if (dev().settings().useDeviceQueue_) {
+ // Add the termination handshake to the host queue
+ virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
+ vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
+ vmParentWrap + offsetof(AmdAqlWrap, child_counter),
+ signalAddr, dev().settings().useDeviceQueue_);
+ }
+
+ ++gpuDefQueue->schedParamIdx_ %=
+ gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam);
+ //! \todo optimize the wrap around
+ if (gpuDefQueue->schedParamIdx_ == 0) {
+ gpuDefQueue->schedParams_->wait(*gpuDefQueue);
+ }
+ }
// Set GPU event for the used resources
for (uint i = 0; i < memList.size(); ++i) {
- memList[i]->setBusy(*gpuDefQueue, gpuEvent);
+ memList[i]->setBusy(*this, gpuEvent);
}
- if (dev().settings().useDeviceQueue_) {
- // Add the termination handshake to the host queue
- virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
- vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
- vmParentWrap + offsetof(AmdAqlWrap, child_counter),
- signalAddr, dev().settings().useDeviceQueue_);
+ // Update the global GPU event
+ setGpuEvent(gpuEvent);
+
+ if (!printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) {
+ LogError("Couldn't read printf data from the buffer!\n");
+ return false;
}
-
- ++gpuDefQueue->schedParamIdx_ %=
- gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam);
- //! \todo optimize the wrap around
- if (gpuDefQueue->schedParamIdx_ == 0) {
- gpuDefQueue->schedParams_->wait(*gpuDefQueue);
- }
- }
-
- // Set GPU event for the used resources
- for (uint i = 0; i < memList.size(); ++i) {
- memList[i]->setBusy(*this, gpuEvent);
- }
-
- // Update the global GPU event
- setGpuEvent(gpuEvent);
-
- if (!printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) {
- LogError("Couldn't read printf data from the buffer!\n");
- return false;
}
// Runtime submitted a HSAIL kernel
diff --git a/projects/clr/rocclr/runtime/utils/flags.hpp b/projects/clr/rocclr/runtime/utils/flags.hpp
index beb76aa56f..cf941095bd 100644
--- a/projects/clr/rocclr/runtime/utils/flags.hpp
+++ b/projects/clr/rocclr/runtime/utils/flags.hpp
@@ -154,7 +154,7 @@ release(bool, GPU_DIRECT_SRD, false, \
"Use indirect SRD access in HSAIL") \
release(bool, GPU_USE_DEVICE_QUEUE, false, \
"Use a dedicated device queue for the actual submissions") \
-release(bool, GPU_ENABLE_LARGE_ALLOCATION, false, \
+release(bool, GPU_ENABLE_LARGE_ALLOCATION, true, \
"Enable >4GB single allocations") \
release(bool, AMD_THREAD_TRACE_ENABLE, !IS_MAINLINE, \
"Enable thread trace extension") \