diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
index fd0008264b..13aee5af1f 100644
--- a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
@@ -61,9 +61,10 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t
}
if (zero && !prog.isInternal()) {
- char pattern = 0;
- prog.dev().xferMgr().fillBuffer(*gpuAccess_, &pattern, sizeof(pattern), amd::Coord3D(0),
- amd::Coord3D(size));
+ uint64_t pattern = 0;
+ size_t patternSize = ((size % sizeof(pattern)) == 0) ? sizeof(pattern) : 1;
+ prog.dev().xferMgr().fillBuffer(*gpuAccess_, &pattern, patternSize,
+ amd::Coord3D(0), amd::Coord3D(size));
}
switch (segment) {
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index 01c81917fa..206d4f5f15 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -590,17 +590,10 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) {
if (deviceQueueSize_ == deviceQueueSize) {
return true;
} else {
- //! @todo Temporarily keep the buffer mapped for debug purpose
- if (nullptr != schedParams_) {
- schedParams_->unmap(this);
- }
delete vqHeader_;
delete virtualQueue_;
- delete schedParams_;
vqHeader_ = nullptr;
virtualQueue_ = nullptr;
- schedParams_ = nullptr;
- schedParamIdx_ = 0;
deviceQueueSize_ = 0;
}
uint numSlots = deviceQueueSize / sizeof(AmdAqlWrap);
@@ -681,13 +674,6 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) {
offset += sizeof(AmdAqlWrap);
}
- schedParams_ = new Memory(dev(), 64 * Ki);
- if ((schedParams_ == nullptr) || !schedParams_->create(Resource::RemoteUSWC)) {
- return false;
- }
-
- address ptr = reinterpret_cast
(schedParams_->map(this));
-
deviceQueueSize_ = deviceQueueSize;
return true;
@@ -710,7 +696,6 @@ VirtualGPU::VirtualGPU(Device& device)
vqHeader_(nullptr),
virtualQueue_(nullptr),
schedParams_(nullptr),
- schedParamIdx_(0),
deviceQueueSize_(0),
maskGroups_(1),
hsaQueueMem_(nullptr),
@@ -894,26 +879,21 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
bool VirtualGPU::allocHsaQueueMem() {
// Allocate a dummy HSA queue
hsaQueueMem_ = new Memory(dev(), sizeof(amd_queue_t));
- if ((hsaQueueMem_ == nullptr) || (!hsaQueueMem_->create(Resource::RemoteUSWC))) {
+ if ((hsaQueueMem_ == nullptr) || (!hsaQueueMem_->create(Resource::Local))) {  // NOTE(review): on failure hsaQueueMem_ is deleted but not reset to nullptr, and ~VirtualGPU() also deletes it - confirm no double delete
delete hsaQueueMem_;
return false;
}
- amd_queue_t* queue =
- reinterpret_cast(hsaQueueMem_->map(nullptr, Resource::WriteOnly));
- if (nullptr == queue) {
- delete hsaQueueMem_;
- return false;
- }
- memset(queue, 0, sizeof(amd_queue_t));
+ amd_queue_t hsa_queue = {};  // Host-side copy of the queue descriptor, value-initialized; handed to writeRawData() below
// Provide private and local heap addresses
- const static uint addressShift = LP64_SWITCH(0, 32);
- queue->private_segment_aperture_base_hi = static_cast(
+ constexpr uint addressShift = LP64_SWITCH(0, 32);  // Shift applied to both aperture bases before they are stored in the *_hi fields; presumably selected by pointer width - TODO confirm LP64_SWITCH semantics
+ hsa_queue.private_segment_aperture_base_hi = static_cast(
dev().properties().gpuMemoryProperties.privateApertureBase >> addressShift);
- queue->group_segment_aperture_base_hi = static_cast(
+ hsa_queue.group_segment_aperture_base_hi = static_cast(
dev().properties().gpuMemoryProperties.sharedApertureBase >> addressShift);
- hsaQueueMem_->unmap(nullptr);
+ hsaQueueMem_->writeRawData(*this, 0, sizeof(amd_queue_t), &hsa_queue, true);  // Passes sizeof(amd_queue_t) bytes of hsa_queue to hsaQueueMem_; NOTE(review): meaning of the 0 and true arguments not visible here, and any result of the call is not checked - confirm
+
return true;
}
@@ -946,14 +926,8 @@ VirtualGPU::~VirtualGPU() {
managedBuffer_.release();
- //! @todo Temporarily keep the buffer mapped for debug purpose
- if (nullptr != schedParams_) {
- schedParams_->unmap(this);
- }
-
delete vqHeader_;
delete virtualQueue_;
- delete schedParams_;
delete hsaQueueMem_;
// Release scratch buffer memory to reduce memory pressure
@@ -1969,6 +1943,9 @@ bool VirtualGPU::PreDeviceEnqueue(
*vmDefQueue = (*gpuDefQueue)->virtualQueue_->vmAddress();
(*gpuDefQueue)->writeVQueueHeader(*this, hsaKernel.prog().kernelTable()->vmAddress());
+ // Acquire USWC memory for the scheduler parameters
+ (*gpuDefQueue)->schedParams_ = &xferWrite().Acquire(sizeof(SchedulerParam));
+
// Add memory handles before the actual dispatch
addVmMemory((*gpuDefQueue)->virtualQueue_);
addVmMemory((*gpuDefQueue)->schedParams_);
@@ -1990,6 +1967,7 @@ void VirtualGPU::PostDeviceEnqueue(
// Make sure exculsive access to the device queue
amd::ScopedLock(defQueue->lock());
+ Memory& schedParams = xferWrite().Acquire(sizeof(SchedulerParam));
if (GPU_PRINT_CHILD_KERNEL != 0) {
waitForEvent(gpuEvent);
@@ -2008,8 +1986,7 @@ void VirtualGPU::PostDeviceEnqueue(
// Get the global loop start before the scheduler
Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
static_cast(gpuDefQueue->blitMgr())
- .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_,
- gpuDefQueue->schedParamIdx_,
+ .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0,
gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
const static bool FlushL2 = true;
gpuDefQueue->addBarrier(FlushL2);
@@ -2018,8 +1995,8 @@ void VirtualGPU::PostDeviceEnqueue(
//! @note DMA flush must not occur between patch and the scheduler
Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
// Program parameters for the scheduler
- SchedulerParam* param = &reinterpret_cast(
- gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_];
+ SchedulerParam* param = reinterpret_cast(
+ gpuDefQueue->schedParams_->data());
param->signal = 1;
// Scale clock to 1024 to avoid 64 bit div in the scheduler
param->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_;
@@ -2050,8 +2027,7 @@ void VirtualGPU::PostDeviceEnqueue(
//! \note Runtime doesn't know which one will be called
hsaKernel.prog().fillResListWithKernels(*this);
- Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress() +
- gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam);
+ Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress();
gpuDefQueue->eventBegin(MainEngine);
gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd(
signalAddr, loopStart,
@@ -2073,11 +2049,8 @@ void VirtualGPU::PostDeviceEnqueue(
eventEnd(MainEngine, *gpuEvent);
}
- ++gpuDefQueue->schedParamIdx_ %= gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam);
- //! \todo optimize the wrap around
- if (gpuDefQueue->schedParamIdx_ == 0) {
- gpuDefQueue->schedParams_->wait(*gpuDefQueue);
- }
+ xferWrite().Release(*gpuDefQueue->schedParams_);
+ gpuDefQueue->schedParams_ = nullptr;
}
// ================================================================================================
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
index 71ca26746c..2d73accdf0 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
@@ -613,7 +613,6 @@ class VirtualGPU : public device::VirtualDevice {
AmdVQueueHeader* vqHeader_; //!< Sysmem copy for virtual queue header
Memory* virtualQueue_; //!< Virtual device queue
Memory* schedParams_; //!< The scheduler parameters
- uint schedParamIdx_; //!< Index in the scheduler parameters buffer
uint deviceQueueSize_; //!< Device queue size
uint maskGroups_; //!< The number of mask groups processed in the scheduler by one thread