diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.cpp b/projects/clr/rocclr/runtime/device/pal/palblit.cpp index 5d5ef73471..66d02aba9d 100644 --- a/projects/clr/rocclr/runtime/device/pal/palblit.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palblit.cpp @@ -2678,7 +2678,7 @@ KernelBlitManager::writeRawData( const void* data ) const { - static_cast(memory).writeRawData(gpu(), size, data, false); + static_cast(memory).writeRawData(gpu(), 0, size, data, false); synchronize(); } diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp index ab38c8059c..608f676088 100644 --- a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp @@ -706,6 +706,7 @@ Device::create(Pal::IDevice* device) // palSettings ... palSettings->textureOptLevel = Pal::TextureFilterOptimizationsDisabled; palSettings->forceHighClocks = appProfile_.enableHighPerformanceState(); + palSettings->cmdBufBatchedSubmitChainLimit = 0; // Commit the new settings for the device result = iDev()->CommitSettingsAndInit(); diff --git a/projects/clr/rocclr/runtime/device/pal/palprintf.cpp b/projects/clr/rocclr/runtime/device/pal/palprintf.cpp index 40d902b377..1d81036668 100644 --- a/projects/clr/rocclr/runtime/device/pal/palprintf.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palprintf.cpp @@ -619,7 +619,7 @@ PrintfDbgHSA::init( // Copy offset and number of bytes available for printf data // into the corresponding location in the debug buffer - dbgBuffer_->writeRawData(gpu, initSize, sysMem, true); + dbgBuffer_->writeRawData(gpu, 0, initSize, sysMem, true); } return true; } diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.cpp b/projects/clr/rocclr/runtime/device/pal/palresource.cpp index 54e56454b2..b6f47d0f0e 100644 --- a/projects/clr/rocclr/runtime/device/pal/palresource.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palresource.cpp @@ -1072,9 +1072,10 @@ Resource::free() void Resource::writeRawData( VirtualGPU& gpu, - size_t size, + size_t offset, + size_t size, const void* data, - bool waitForEvent) const + bool waitForEvent) const { GpuEvent event; @@ -1082,11 +1083,8 @@ Resource::writeRawData( // size needs to be DWORD aligned assert((size & 3) == 0); gpu.eventBegin(MainEngine); - //! @todo Remove cache flush - //! It's a workaround for a PAL crash with embedded data, allocated before any command - gpu.flushCUCaches(); gpu.queue(MainEngine).addCmdMemRef(iMem()); - gpu.iCmd()->CmdUpdateMemory(*iMem(), 0, size, reinterpret_cast(data)); + gpu.iCmd()->CmdUpdateMemory(*iMem(), offset, size, reinterpret_cast(data)); gpu.eventEnd(MainEngine, event); setBusy(gpu, event); @@ -1938,7 +1936,7 @@ Resource::warmUpRenames(VirtualGPU& gpu) uint dummy = 0; const bool NoWait = false; // Write 0 for the buffer paging by VidMM - writeRawData(gpu, sizeof(dummy), &dummy, NoWait); + writeRawData(gpu, 0, sizeof(dummy), &dummy, NoWait); const bool Force = true; rename(gpu, Force); } diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.hpp b/projects/clr/rocclr/runtime/device/pal/palresource.hpp index b1fac5b20c..9687674c6b 100644 --- a/projects/clr/rocclr/runtime/device/pal/palresource.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palresource.hpp @@ -240,10 +240,11 @@ public: * */ void writeRawData( - VirtualGPU& gpu, //!< Virtual GPU device object - size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS) - const void* data, //!< Data to be copied - bool waitForEvent //!< Wait for event complete + VirtualGPU& gpu, //!< Virtual GPU device object + size_t offset, //!< Offset for in the buffer for data + size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS) + const void* data, //!< Data to be copied + bool waitForEvent //!< Wait for event complete ) const; //! Returns the offset in GPU memory for aliases diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index 697ed8bb3c..e5beeac673 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -155,11 +155,11 @@ VirtualGPU::Queue::removeCmdMemRef(Pal::IGpuMemory* iMem) } uint -VirtualGPU::Queue::submit() +VirtualGPU::Queue::submit(bool forceFlush) { cmdCnt_++; uint id = cmdBufIdCurrent_; - if ((cmdCnt_ > MaxCommands) || GPU_FLUSH_ON_EXECUTION) { + if ((cmdCnt_ > MaxCommands) || forceFlush) { if (!flush()) { return GpuEvent::InvalidID; } @@ -238,6 +238,11 @@ VirtualGPU::Queue::flush() return false; } + // Reset command buffer, so CB chunks could be reused + if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Reset(nullptr, false)) { + LogError("PAL failed CB reset!"); + return false; + } // Start command buffer building Pal::CmdBufferBuildInfo cmdBuildInfo = {}; if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Begin(cmdBuildInfo)) { @@ -591,41 +596,44 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize) if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) { return false; } - address ptr = reinterpret_cast
( - virtualQueue_->map(this, Resource::WriteOnly)); - if (nullptr == ptr) { - return false; - } - // Clear memory - memset(ptr, 0, allocSize); - uint64_t vaBase = virtualQueue_->vmAddress(); - AmdVQueueHeader* header = reinterpret_cast(ptr); + if (GPU_PRINT_CHILD_KERNEL != 0) { + address ptr = reinterpret_cast
( + virtualQueue_->map(this, Resource::WriteOnly)); + if (nullptr == ptr) { + return false; + } + } + + uint64_t vaBase = virtualQueue_->vmAddress(); + AmdVQueueHeader header = {}; // Initialize the virtual queue header - header->aql_slot_num = numSlots; - header->event_slot_num = dev().settings().numDeviceEvents_; - header->event_slot_mask = vaBase + eventMaskOffs; - header->event_slots = vaBase + eventsOffs; - header->aql_slot_mask = vaBase + slotMaskOffs; - header->wait_size = dev().settings().numWaitEvents_; - header->arg_size = dev().info().maxParameterSize_ + 64; - header->mask_groups = maskGroups_; + header.aql_slot_num = numSlots; + header.event_slot_num = dev().settings().numDeviceEvents_; + header.event_slot_mask = vaBase + eventMaskOffs; + header.event_slots = vaBase + eventsOffs; + header.aql_slot_mask = vaBase + slotMaskOffs; + header.wait_size = dev().settings().numWaitEvents_; + header.arg_size = dev().info().maxParameterSize_ + 64; + header.mask_groups = maskGroups_; + vqHeader_ = new AmdVQueueHeader; if (nullptr == vqHeader_) { return false; } - *vqHeader_ = *header; + *vqHeader_ = header; + + virtualQueue_->writeRawData(*this, 0, sizeof(AmdVQueueHeader), &header, false); // Go over all slots and perform initialization - AmdAqlWrap* slots = reinterpret_cast(&header[1]); + AmdAqlWrap slot = {}; + size_t offset = sizeof(AmdVQueueHeader); for (uint i = 0; i < numSlots; ++i) { uint64_t argStart = vaBase + argOffs + i * singleArgSize; - slots[i].aql.kernarg_address = reinterpret_cast(argStart); - slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64; - } - // Upload data back to local memory - if (GPU_PRINT_CHILD_KERNEL == 0) { - virtualQueue_->unmap(this); + slot.aql.kernarg_address = reinterpret_cast(argStart); + slot.wait_list = argStart + dev().info().maxParameterSize_ + 64; + virtualQueue_->writeRawData(*this, offset, sizeof(AmdAqlWrap), &slot, false); + offset += sizeof(AmdAqlWrap); } schedParams_ = new Memory(dev(), 64 * Ki); @@ -633,7 +641,7 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize) return false; } - ptr = reinterpret_cast
(schedParams_->map(this)); + address ptr = reinterpret_cast
(schedParams_->map(this)); deviceQueueSize_ = deviceQueueSize; @@ -689,9 +697,9 @@ VirtualGPU::create(bool profiling, uint deviceQueueSize) state_.profiling_ = profiling; Pal::CmdAllocatorCreateInfo createInfo = {}; + createInfo.flags.threadSafe = true; // \todo forces PAL to reuse CBs, but requires postamble - createInfo.flags.autoMemoryReuse = true; - createInfo.flags.threadSafe = false; + createInfo.flags.autoMemoryReuse = false; createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap = Pal::GpuHeapGartCacheable; createInfo.allocInfo[Pal::CommandDataAlloc].allocSize = 128 * Ki; @@ -2100,34 +2108,28 @@ VirtualGPU::submitKernelInternal( } if (!dev().settings().useDeviceQueue_) { - Unimplemented(); -/* // Add the termination handshake to the host queue eventBegin(MainEngine); - cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(), - vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, - vmParentWrap + offsetof(AmdAqlWrap, child_counter), - 0, dev().settings().useDeviceQueue_); + //iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(), + // vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, + // vmParentWrap + offsetof(AmdAqlWrap, child_counter), + // 0, dev().settings().useDeviceQueue_); eventEnd(MainEngine, gpuEvent); -*/ } // Get the global loop start before the scheduler - Unimplemented(); -/* - mcaddr loopStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart(); - static_cast(gpuDefQueue->blitMgr()).runScheduler( - *gpuDefQueue->virtualQueue_, - *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_, - gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); + //Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart(); + //static_cast(gpuDefQueue->blitMgr()).runScheduler( + // *gpuDefQueue->virtualQueue_, + // *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_, + // gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); const static bool FlushL2 = true; gpuDefQueue->flushCUCaches(FlushL2); // Get the address of PM4 template and add write it to params //! @note DMA flush must not occur between patch and the scheduler - mcaddr patchStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart(); -*/ Pal::gpusize patchStart = 0; + //Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart(); // Program parameters for the scheduler SchedulerParam* param = &reinterpret_cast (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_]; @@ -2168,31 +2170,28 @@ VirtualGPU::submitKernelInternal( Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress() + gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam); - Unimplemented(); -/* gpuDefQueue->eventBegin(MainEngine); - gpuDefQueue->cs()->VirtualQueueDispatcherEnd( - gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_, - signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / - (DeviceQueueMaskSize * maskGroups_)); - gpuDefQueue->eventEnd(MainEngine, gpuEvent); -*/ + //gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd( + // signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / + // (DeviceQueueMaskSize * maskGroups_)); + // Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call. + // Thus TS command for profiling has to follow in the next CB. + constexpr bool ForceSubmitFirst = true; + gpuDefQueue->eventEnd(MainEngine, gpuEvent, ForceSubmitFirst); + // Set GPU event for the used resources for (uint i = 0; i < memList.size(); ++i) { memList[i]->setBusy(*gpuDefQueue, gpuEvent); } if (dev().settings().useDeviceQueue_) { - Unimplemented(); -/* // Add the termination handshake to the host queue eventBegin(MainEngine); - cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(), - vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, - vmParentWrap + offsetof(AmdAqlWrap, child_counter), - signalAddr, dev().settings().useDeviceQueue_); + //iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(), + // vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, + // vmParentWrap + offsetof(AmdAqlWrap, child_counter), + // signalAddr, dev().settings().useDeviceQueue_); eventEnd(MainEngine, gpuEvent); -*/ } ++gpuDefQueue->schedParamIdx_ %= @@ -3250,7 +3249,7 @@ VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable) { const static bool Wait = true; vqHeader_->kernel_table = kernelTable; - virtualQueue_->writeRawData(hostQ, sizeof(AmdVQueueHeader), vqHeader_, !Wait); + virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, Wait); } void diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp index c566a3f646..5f9a73d303 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp @@ -79,7 +79,7 @@ public: //! Flushes the current command buffer to HW //! Returns ID associated with the submission - uint submit(); + uint submit(bool forceFlush); bool flush(); @@ -401,15 +401,17 @@ public: //! Returns queue, associated with VirtualGPU Queue& queue(EngineType id) const { return *queues_[id]; } - void flushCUCaches() const + void flushCUCaches(bool flushL2 = false) const { Pal::BarrierInfo barrier = {}; barrier.pipePointWaitCount = 1; Pal::HwPipePoint point = Pal::HwPipePostCs; barrier.pPipePoints = &point; barrier.transitionCount = 1; - Pal::BarrierTransition trans = {Pal::CoherShader, Pal::CoherShader, - {nullptr, { {Pal::ImageAspect::Color, 0, 0}, 0, 0 }, Pal::LayoutShaderRead, Pal::LayoutShaderRead}}; + uint32_t cacheMask = (flushL2) ? Pal::CoherCopy : Pal::CoherShader; + Pal::BarrierTransition trans = { cacheMask, cacheMask, + { nullptr, { { Pal::ImageAspect::Color, 0, 0 }, 0, 0 }, + Pal::LayoutShaderRead, Pal::LayoutShaderRead}}; barrier.pTransitions = &trans; barrier.waitPoint = Pal::HwPipePreCs; iCmd()->CmdBarrier(barrier); @@ -420,10 +422,17 @@ public: profileEvent(engId, Begin); } - void eventEnd(EngineType engId, GpuEvent& event) const { - const static bool End = false; - profileEvent(engId, End); - event.id = queues_[engId]->submit(); + void eventEnd(EngineType engId, GpuEvent& event, bool forceExec = false) const { + constexpr bool End = false; + if (forceExec) { + constexpr bool ForceFlush = true; + event.id = queues_[engId]->submit(ForceFlush); + profileEvent(engId, End); + } + else { + profileEvent(engId, End); + event.id = queues_[engId]->submit(GPU_FLUSH_ON_EXECUTION); + } event.engineId_ = engId; }