From cf6dc7f1d439e9fed229e4344fb1f89fb8e408c2 Mon Sep 17 00:00:00 2001
From: foreman
Date: Thu, 7 Jul 2016 02:11:39 -0400
Subject: [PATCH] P4 to Git Change 1288218 by jsjodin@jsjodin-git2p4-clang on
2016/07/06 21:28:39
SWDEV-3 - [X86][SSE2] Updated tests to match llvm\test\CodeGen\X86\sse2-intrinsics-fast-isel-x86_64.ll
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@274126 91177308-0d34-0410-b5e6-96231b3b80d8
GitHash: 5f57c65083ce901e984d6456f8a9f1d78b0f1e7f
Affected files ...
... //depot/stg/opencl/drivers/opencl/compiler/clang.git/test/CodeGen/sse2-builtins.c#2 edit
[ROCm/clr commit: f9a2bb53b688831844ae0a332b1a1a6801b4eba9]
---
.../clr/rocclr/runtime/device/pal/palblit.cpp | 2 +-
.../rocclr/runtime/device/pal/paldevice.cpp | 1 +
.../rocclr/runtime/device/pal/palprintf.cpp | 2 +-
.../rocclr/runtime/device/pal/palresource.cpp | 12 +-
.../rocclr/runtime/device/pal/palresource.hpp | 9 +-
.../rocclr/runtime/device/pal/palvirtual.cpp | 123 +++++++++---------
.../rocclr/runtime/device/pal/palvirtual.hpp | 25 ++--
7 files changed, 91 insertions(+), 83 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.cpp b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
index 5d5ef73471..66d02aba9d 100644
--- a/projects/clr/rocclr/runtime/device/pal/palblit.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
@@ -2678,7 +2678,7 @@ KernelBlitManager::writeRawData(
const void* data
) const
{
- static_cast(memory).writeRawData(gpu(), size, data, false);
+ static_cast(memory).writeRawData(gpu(), 0, size, data, false);
synchronize();
}
diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
index ab38c8059c..608f676088 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
@@ -706,6 +706,7 @@ Device::create(Pal::IDevice* device)
// palSettings ...
palSettings->textureOptLevel = Pal::TextureFilterOptimizationsDisabled;
palSettings->forceHighClocks = appProfile_.enableHighPerformanceState();
+ palSettings->cmdBufBatchedSubmitChainLimit = 0;
// Commit the new settings for the device
result = iDev()->CommitSettingsAndInit();
diff --git a/projects/clr/rocclr/runtime/device/pal/palprintf.cpp b/projects/clr/rocclr/runtime/device/pal/palprintf.cpp
index 40d902b377..1d81036668 100644
--- a/projects/clr/rocclr/runtime/device/pal/palprintf.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palprintf.cpp
@@ -619,7 +619,7 @@ PrintfDbgHSA::init(
// Copy offset and number of bytes available for printf data
// into the corresponding location in the debug buffer
- dbgBuffer_->writeRawData(gpu, initSize, sysMem, true);
+ dbgBuffer_->writeRawData(gpu, 0, initSize, sysMem, true);
}
return true;
}
diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.cpp b/projects/clr/rocclr/runtime/device/pal/palresource.cpp
index 54e56454b2..b6f47d0f0e 100644
--- a/projects/clr/rocclr/runtime/device/pal/palresource.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palresource.cpp
@@ -1072,9 +1072,10 @@ Resource::free()
void
Resource::writeRawData(
VirtualGPU& gpu,
- size_t size,
+ size_t offset,
+ size_t size,
const void* data,
- bool waitForEvent) const
+ bool waitForEvent) const
{
GpuEvent event;
@@ -1082,11 +1083,8 @@ Resource::writeRawData(
// size needs to be DWORD aligned
assert((size & 3) == 0);
gpu.eventBegin(MainEngine);
- //! @todo Remove cache flush
- //! It's a workaround for a PAL crash with embedded data, allocated before any command
- gpu.flushCUCaches();
gpu.queue(MainEngine).addCmdMemRef(iMem());
- gpu.iCmd()->CmdUpdateMemory(*iMem(), 0, size, reinterpret_cast(data));
+ gpu.iCmd()->CmdUpdateMemory(*iMem(), offset, size, reinterpret_cast(data));
gpu.eventEnd(MainEngine, event);
setBusy(gpu, event);
@@ -1938,7 +1936,7 @@ Resource::warmUpRenames(VirtualGPU& gpu)
uint dummy = 0;
const bool NoWait = false;
// Write 0 for the buffer paging by VidMM
- writeRawData(gpu, sizeof(dummy), &dummy, NoWait);
+ writeRawData(gpu, 0, sizeof(dummy), &dummy, NoWait);
const bool Force = true;
rename(gpu, Force);
}
diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.hpp b/projects/clr/rocclr/runtime/device/pal/palresource.hpp
index b1fac5b20c..9687674c6b 100644
--- a/projects/clr/rocclr/runtime/device/pal/palresource.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palresource.hpp
@@ -240,10 +240,11 @@ public:
*
*/
void writeRawData(
- VirtualGPU& gpu, //!< Virtual GPU device object
- size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS)
- const void* data, //!< Data to be copied
- bool waitForEvent //!< Wait for event complete
+ VirtualGPU& gpu, //!< Virtual GPU device object
+ size_t offset, //!< Offset in the buffer for data
+ size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS)
+ const void* data, //!< Data to be copied
+ bool waitForEvent //!< Wait for event complete
) const;
//! Returns the offset in GPU memory for aliases
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index 697ed8bb3c..e5beeac673 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -155,11 +155,11 @@ VirtualGPU::Queue::removeCmdMemRef(Pal::IGpuMemory* iMem)
}
uint
-VirtualGPU::Queue::submit()
+VirtualGPU::Queue::submit(bool forceFlush)
{
cmdCnt_++;
uint id = cmdBufIdCurrent_;
- if ((cmdCnt_ > MaxCommands) || GPU_FLUSH_ON_EXECUTION) {
+ if ((cmdCnt_ > MaxCommands) || forceFlush) {
if (!flush()) {
return GpuEvent::InvalidID;
}
@@ -238,6 +238,11 @@ VirtualGPU::Queue::flush()
return false;
}
+ // Reset command buffer, so CB chunks could be reused
+ if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Reset(nullptr, false)) {
+ LogError("PAL failed CB reset!");
+ return false;
+ }
// Start command buffer building
Pal::CmdBufferBuildInfo cmdBuildInfo = {};
if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Begin(cmdBuildInfo)) {
@@ -591,41 +596,44 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) {
return false;
}
- address ptr = reinterpret_cast(
- virtualQueue_->map(this, Resource::WriteOnly));
- if (nullptr == ptr) {
- return false;
- }
- // Clear memory
- memset(ptr, 0, allocSize);
- uint64_t vaBase = virtualQueue_->vmAddress();
- AmdVQueueHeader* header = reinterpret_cast(ptr);
+ if (GPU_PRINT_CHILD_KERNEL != 0) {
+ address ptr = reinterpret_cast(
+ virtualQueue_->map(this, Resource::WriteOnly));
+ if (nullptr == ptr) {
+ return false;
+ }
+ }
+
+ uint64_t vaBase = virtualQueue_->vmAddress();
+ AmdVQueueHeader header = {};
// Initialize the virtual queue header
- header->aql_slot_num = numSlots;
- header->event_slot_num = dev().settings().numDeviceEvents_;
- header->event_slot_mask = vaBase + eventMaskOffs;
- header->event_slots = vaBase + eventsOffs;
- header->aql_slot_mask = vaBase + slotMaskOffs;
- header->wait_size = dev().settings().numWaitEvents_;
- header->arg_size = dev().info().maxParameterSize_ + 64;
- header->mask_groups = maskGroups_;
+ header.aql_slot_num = numSlots;
+ header.event_slot_num = dev().settings().numDeviceEvents_;
+ header.event_slot_mask = vaBase + eventMaskOffs;
+ header.event_slots = vaBase + eventsOffs;
+ header.aql_slot_mask = vaBase + slotMaskOffs;
+ header.wait_size = dev().settings().numWaitEvents_;
+ header.arg_size = dev().info().maxParameterSize_ + 64;
+ header.mask_groups = maskGroups_;
+
vqHeader_ = new AmdVQueueHeader;
if (nullptr == vqHeader_) {
return false;
}
- *vqHeader_ = *header;
+ *vqHeader_ = header;
+
+ virtualQueue_->writeRawData(*this, 0, sizeof(AmdVQueueHeader), &header, false);
// Go over all slots and perform initialization
- AmdAqlWrap* slots = reinterpret_cast(&header[1]);
+ AmdAqlWrap slot = {};
+ size_t offset = sizeof(AmdVQueueHeader);
for (uint i = 0; i < numSlots; ++i) {
uint64_t argStart = vaBase + argOffs + i * singleArgSize;
- slots[i].aql.kernarg_address = reinterpret_cast(argStart);
- slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64;
- }
- // Upload data back to local memory
- if (GPU_PRINT_CHILD_KERNEL == 0) {
- virtualQueue_->unmap(this);
+ slot.aql.kernarg_address = reinterpret_cast(argStart);
+ slot.wait_list = argStart + dev().info().maxParameterSize_ + 64;
+ virtualQueue_->writeRawData(*this, offset, sizeof(AmdAqlWrap), &slot, false);
+ offset += sizeof(AmdAqlWrap);
}
schedParams_ = new Memory(dev(), 64 * Ki);
@@ -633,7 +641,7 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
return false;
}
- ptr = reinterpret_cast(schedParams_->map(this));
+ address ptr = reinterpret_cast(schedParams_->map(this));
deviceQueueSize_ = deviceQueueSize;
@@ -689,9 +697,9 @@ VirtualGPU::create(bool profiling, uint deviceQueueSize)
state_.profiling_ = profiling;
Pal::CmdAllocatorCreateInfo createInfo = {};
+ createInfo.flags.threadSafe = true;
// \todo forces PAL to reuse CBs, but requires postamble
- createInfo.flags.autoMemoryReuse = true;
- createInfo.flags.threadSafe = false;
+ createInfo.flags.autoMemoryReuse = false;
createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap =
Pal::GpuHeapGartCacheable;
createInfo.allocInfo[Pal::CommandDataAlloc].allocSize = 128 * Ki;
@@ -2100,34 +2108,28 @@ VirtualGPU::submitKernelInternal(
}
if (!dev().settings().useDeviceQueue_) {
- Unimplemented();
-/*
// Add the termination handshake to the host queue
eventBegin(MainEngine);
- cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(),
- vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
- vmParentWrap + offsetof(AmdAqlWrap, child_counter),
- 0, dev().settings().useDeviceQueue_);
+ //iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(),
+ // vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
+ // vmParentWrap + offsetof(AmdAqlWrap, child_counter),
+ // 0, dev().settings().useDeviceQueue_);
eventEnd(MainEngine, gpuEvent);
-*/
}
// Get the global loop start before the scheduler
- Unimplemented();
-/*
- mcaddr loopStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart();
- static_cast(gpuDefQueue->blitMgr()).runScheduler(
- *gpuDefQueue->virtualQueue_,
- *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_,
- gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
+ //Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
+ //static_cast(gpuDefQueue->blitMgr()).runScheduler(
+ // *gpuDefQueue->virtualQueue_,
+ // *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_,
+ // gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
const static bool FlushL2 = true;
gpuDefQueue->flushCUCaches(FlushL2);
// Get the address of PM4 template and add write it to params
//! @note DMA flush must not occur between patch and the scheduler
- mcaddr patchStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart();
-*/
Pal::gpusize patchStart = 0;
+ //Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
// Program parameters for the scheduler
SchedulerParam* param = &reinterpret_cast
(gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_];
@@ -2168,31 +2170,28 @@ VirtualGPU::submitKernelInternal(
Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress() +
gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam);
- Unimplemented();
-/*
gpuDefQueue->eventBegin(MainEngine);
- gpuDefQueue->cs()->VirtualQueueDispatcherEnd(
- gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_,
- signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num /
- (DeviceQueueMaskSize * maskGroups_));
- gpuDefQueue->eventEnd(MainEngine, gpuEvent);
-*/
+ //gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd(
+ // signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num /
+ // (DeviceQueueMaskSize * maskGroups_));
+ // Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call.
+ // Thus TS command for profiling has to follow in the next CB.
+ constexpr bool ForceSubmitFirst = true;
+ gpuDefQueue->eventEnd(MainEngine, gpuEvent, ForceSubmitFirst);
+
// Set GPU event for the used resources
for (uint i = 0; i < memList.size(); ++i) {
memList[i]->setBusy(*gpuDefQueue, gpuEvent);
}
if (dev().settings().useDeviceQueue_) {
- Unimplemented();
-/*
// Add the termination handshake to the host queue
eventBegin(MainEngine);
- cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(),
- vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
- vmParentWrap + offsetof(AmdAqlWrap, child_counter),
- signalAddr, dev().settings().useDeviceQueue_);
+ //iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(),
+ // vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
+ // vmParentWrap + offsetof(AmdAqlWrap, child_counter),
+ // signalAddr, dev().settings().useDeviceQueue_);
eventEnd(MainEngine, gpuEvent);
-*/
}
++gpuDefQueue->schedParamIdx_ %=
@@ -3250,7 +3249,7 @@ VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable)
{
const static bool Wait = true;
vqHeader_->kernel_table = kernelTable;
- virtualQueue_->writeRawData(hostQ, sizeof(AmdVQueueHeader), vqHeader_, !Wait);
+ virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, Wait);
}
void
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
index c566a3f646..5f9a73d303 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
@@ -79,7 +79,7 @@ public:
//! Flushes the current command buffer to HW
//! Returns ID associated with the submission
- uint submit();
+ uint submit(bool forceFlush);
bool flush();
@@ -401,15 +401,17 @@ public:
//! Returns queue, associated with VirtualGPU
Queue& queue(EngineType id) const { return *queues_[id]; }
- void flushCUCaches() const
+ void flushCUCaches(bool flushL2 = false) const
{
Pal::BarrierInfo barrier = {};
barrier.pipePointWaitCount = 1;
Pal::HwPipePoint point = Pal::HwPipePostCs;
barrier.pPipePoints = &point;
barrier.transitionCount = 1;
- Pal::BarrierTransition trans = {Pal::CoherShader, Pal::CoherShader,
- {nullptr, { {Pal::ImageAspect::Color, 0, 0}, 0, 0 }, Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
+ uint32_t cacheMask = (flushL2) ? Pal::CoherCopy : Pal::CoherShader;
+ Pal::BarrierTransition trans = { cacheMask, cacheMask,
+ { nullptr, { { Pal::ImageAspect::Color, 0, 0 }, 0, 0 },
+ Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
barrier.pTransitions = &trans;
barrier.waitPoint = Pal::HwPipePreCs;
iCmd()->CmdBarrier(barrier);
@@ -420,10 +422,17 @@ public:
profileEvent(engId, Begin);
}
- void eventEnd(EngineType engId, GpuEvent& event) const {
- const static bool End = false;
- profileEvent(engId, End);
- event.id = queues_[engId]->submit();
+ void eventEnd(EngineType engId, GpuEvent& event, bool forceExec = false) const {
+ constexpr bool End = false;
+ if (forceExec) {
+ constexpr bool ForceFlush = true;
+ event.id = queues_[engId]->submit(ForceFlush);
+ profileEvent(engId, End);
+ }
+ else {
+ profileEvent(engId, End);
+ event.id = queues_[engId]->submit(GPU_FLUSH_ON_EXECUTION);
+ }
event.engineId_ = engId;
}