P4 to Git Change 1288113 by jsjodin@jsjodin-git2p4-llvm on 2016/07/06 18:23:12
SWDEV-3 - AMDGPU: Expand unaligned accesses early
Due to visit order problems, in the case of an unaligned copy
the legalized DAG fails to eliminate extra instructions introduced
by the expansion of both unaligned parts.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@274397 91177308-0d34-0410-b5e6-96231b3b80d8
GitHash: d4452f8fcf496a2e19c1a1c9792f5f063f4e9703
Affected files ...
... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/lib/Target/AMDGPU/AMDGPUISelLowering.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/lib/Target/AMDGPU/AMDGPUISelLowering.h#3 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll#3 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll#1 add
... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/test/CodeGen/AMDGPU/sext-in-reg.ll#3 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/test/CodeGen/AMDGPU/unaligned-load-store.ll#2 edit
[ROCm/clr commit: 739bdacc65]
이 커밋은 다음에 포함됨:
@@ -2678,7 +2678,7 @@ KernelBlitManager::writeRawData(
|
||||
const void* data
|
||||
) const
|
||||
{
|
||||
static_cast<pal::Memory&>(memory).writeRawData(gpu(), 0, size, data, false);
|
||||
static_cast<pal::Memory&>(memory).writeRawData(gpu(), size, data, false);
|
||||
|
||||
synchronize();
|
||||
}
|
||||
|
||||
@@ -706,7 +706,6 @@ Device::create(Pal::IDevice* device)
|
||||
// palSettings ...
|
||||
palSettings->textureOptLevel = Pal::TextureFilterOptimizationsDisabled;
|
||||
palSettings->forceHighClocks = appProfile_.enableHighPerformanceState();
|
||||
palSettings->cmdBufBatchedSubmitChainLimit = 0;
|
||||
|
||||
// Commit the new settings for the device
|
||||
result = iDev()->CommitSettingsAndInit();
|
||||
|
||||
@@ -619,7 +619,7 @@ PrintfDbgHSA::init(
|
||||
|
||||
// Copy offset and number of bytes available for printf data
|
||||
// into the corresponding location in the debug buffer
|
||||
dbgBuffer_->writeRawData(gpu, 0, initSize, sysMem, true);
|
||||
dbgBuffer_->writeRawData(gpu, initSize, sysMem, true);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1072,10 +1072,9 @@ Resource::free()
|
||||
void
|
||||
Resource::writeRawData(
|
||||
VirtualGPU& gpu,
|
||||
size_t offset,
|
||||
size_t size,
|
||||
size_t size,
|
||||
const void* data,
|
||||
bool waitForEvent) const
|
||||
bool waitForEvent) const
|
||||
{
|
||||
GpuEvent event;
|
||||
|
||||
@@ -1083,8 +1082,11 @@ Resource::writeRawData(
|
||||
// size needs to be DWORD aligned
|
||||
assert((size & 3) == 0);
|
||||
gpu.eventBegin(MainEngine);
|
||||
//! @todo Remove cache flush
|
||||
//! It's a workaround for a PAL crash with embedded data, allocated before any command
|
||||
gpu.flushCUCaches();
|
||||
gpu.queue(MainEngine).addCmdMemRef(iMem());
|
||||
gpu.iCmd()->CmdUpdateMemory(*iMem(), offset, size, reinterpret_cast<const uint32_t*>(data));
|
||||
gpu.iCmd()->CmdUpdateMemory(*iMem(), 0, size, reinterpret_cast<const uint32_t*>(data));
|
||||
gpu.eventEnd(MainEngine, event);
|
||||
|
||||
setBusy(gpu, event);
|
||||
@@ -1936,7 +1938,7 @@ Resource::warmUpRenames(VirtualGPU& gpu)
|
||||
uint dummy = 0;
|
||||
const bool NoWait = false;
|
||||
// Write 0 for the buffer paging by VidMM
|
||||
writeRawData(gpu, 0, sizeof(dummy), &dummy, NoWait);
|
||||
writeRawData(gpu, sizeof(dummy), &dummy, NoWait);
|
||||
const bool Force = true;
|
||||
rename(gpu, Force);
|
||||
}
|
||||
|
||||
@@ -240,11 +240,10 @@ public:
|
||||
*
|
||||
*/
|
||||
void writeRawData(
|
||||
VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
size_t offset, //!< Offset in the buffer for the data
|
||||
size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS)
|
||||
const void* data, //!< Data to be copied
|
||||
bool waitForEvent //!< Wait for event complete
|
||||
VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS)
|
||||
const void* data, //!< Data to be copied
|
||||
bool waitForEvent //!< Wait for event complete
|
||||
) const;
|
||||
|
||||
//! Returns the offset in GPU memory for aliases
|
||||
|
||||
@@ -155,11 +155,11 @@ VirtualGPU::Queue::removeCmdMemRef(Pal::IGpuMemory* iMem)
|
||||
}
|
||||
|
||||
uint
|
||||
VirtualGPU::Queue::submit(bool forceFlush)
|
||||
VirtualGPU::Queue::submit()
|
||||
{
|
||||
cmdCnt_++;
|
||||
uint id = cmdBufIdCurrent_;
|
||||
if ((cmdCnt_ > MaxCommands) || forceFlush) {
|
||||
if ((cmdCnt_ > MaxCommands) || GPU_FLUSH_ON_EXECUTION) {
|
||||
if (!flush()) {
|
||||
return GpuEvent::InvalidID;
|
||||
}
|
||||
@@ -238,11 +238,6 @@ VirtualGPU::Queue::flush()
|
||||
return false;
|
||||
}
|
||||
|
||||
// Reset command buffer, so CB chunks could be reused
|
||||
if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Reset(nullptr, false)) {
|
||||
LogError("PAL failed CB reset!");
|
||||
return false;
|
||||
}
|
||||
// Start command buffer building
|
||||
Pal::CmdBufferBuildInfo cmdBuildInfo = {};
|
||||
if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Begin(cmdBuildInfo)) {
|
||||
@@ -596,44 +591,41 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
|
||||
if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (GPU_PRINT_CHILD_KERNEL != 0) {
|
||||
address ptr = reinterpret_cast<address>(
|
||||
virtualQueue_->map(this, Resource::WriteOnly));
|
||||
if (nullptr == ptr) {
|
||||
return false;
|
||||
}
|
||||
address ptr = reinterpret_cast<address>(
|
||||
virtualQueue_->map(this, Resource::WriteOnly));
|
||||
if (nullptr == ptr) {
|
||||
return false;
|
||||
}
|
||||
// Clear memory
|
||||
memset(ptr, 0, allocSize);
|
||||
uint64_t vaBase = virtualQueue_->vmAddress();
|
||||
AmdVQueueHeader* header = reinterpret_cast<AmdVQueueHeader*>(ptr);
|
||||
|
||||
uint64_t vaBase = virtualQueue_->vmAddress();
|
||||
AmdVQueueHeader header = {};
|
||||
// Initialize the virtual queue header
|
||||
header.aql_slot_num = numSlots;
|
||||
header.event_slot_num = dev().settings().numDeviceEvents_;
|
||||
header.event_slot_mask = vaBase + eventMaskOffs;
|
||||
header.event_slots = vaBase + eventsOffs;
|
||||
header.aql_slot_mask = vaBase + slotMaskOffs;
|
||||
header.wait_size = dev().settings().numWaitEvents_;
|
||||
header.arg_size = dev().info().maxParameterSize_ + 64;
|
||||
header.mask_groups = maskGroups_;
|
||||
|
||||
header->aql_slot_num = numSlots;
|
||||
header->event_slot_num = dev().settings().numDeviceEvents_;
|
||||
header->event_slot_mask = vaBase + eventMaskOffs;
|
||||
header->event_slots = vaBase + eventsOffs;
|
||||
header->aql_slot_mask = vaBase + slotMaskOffs;
|
||||
header->wait_size = dev().settings().numWaitEvents_;
|
||||
header->arg_size = dev().info().maxParameterSize_ + 64;
|
||||
header->mask_groups = maskGroups_;
|
||||
vqHeader_ = new AmdVQueueHeader;
|
||||
if (nullptr == vqHeader_) {
|
||||
return false;
|
||||
}
|
||||
*vqHeader_ = header;
|
||||
|
||||
virtualQueue_->writeRawData(*this, 0, sizeof(AmdVQueueHeader), &header, false);
|
||||
*vqHeader_ = *header;
|
||||
|
||||
// Go over all slots and perform initialization
|
||||
AmdAqlWrap slot = {};
|
||||
size_t offset = sizeof(AmdVQueueHeader);
|
||||
AmdAqlWrap* slots = reinterpret_cast<AmdAqlWrap*>(&header[1]);
|
||||
for (uint i = 0; i < numSlots; ++i) {
|
||||
uint64_t argStart = vaBase + argOffs + i * singleArgSize;
|
||||
slot.aql.kernarg_address = reinterpret_cast<void*>(argStart);
|
||||
slot.wait_list = argStart + dev().info().maxParameterSize_ + 64;
|
||||
virtualQueue_->writeRawData(*this, offset, sizeof(AmdAqlWrap), &slot, false);
|
||||
offset += sizeof(AmdAqlWrap);
|
||||
slots[i].aql.kernarg_address = reinterpret_cast<void*>(argStart);
|
||||
slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64;
|
||||
}
|
||||
// Upload data back to local memory
|
||||
if (GPU_PRINT_CHILD_KERNEL == 0) {
|
||||
virtualQueue_->unmap(this);
|
||||
}
|
||||
|
||||
schedParams_ = new Memory(dev(), 64 * Ki);
|
||||
@@ -641,7 +633,7 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
|
||||
return false;
|
||||
}
|
||||
|
||||
address ptr = reinterpret_cast<address>(schedParams_->map(this));
|
||||
ptr = reinterpret_cast<address>(schedParams_->map(this));
|
||||
|
||||
deviceQueueSize_ = deviceQueueSize;
|
||||
|
||||
@@ -697,9 +689,9 @@ VirtualGPU::create(bool profiling, uint deviceQueueSize)
|
||||
state_.profiling_ = profiling;
|
||||
|
||||
Pal::CmdAllocatorCreateInfo createInfo = {};
|
||||
createInfo.flags.threadSafe = true;
|
||||
// \todo forces PAL to reuse CBs, but requires postamble
|
||||
createInfo.flags.autoMemoryReuse = false;
|
||||
createInfo.flags.autoMemoryReuse = true;
|
||||
createInfo.flags.threadSafe = false;
|
||||
createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap =
|
||||
Pal::GpuHeapGartCacheable;
|
||||
createInfo.allocInfo[Pal::CommandDataAlloc].allocSize = 128 * Ki;
|
||||
@@ -2108,28 +2100,34 @@ VirtualGPU::submitKernelInternal(
|
||||
}
|
||||
|
||||
if (!dev().settings().useDeviceQueue_) {
|
||||
Unimplemented();
|
||||
/*
|
||||
// Add the termination handshake to the host queue
|
||||
eventBegin(MainEngine);
|
||||
//iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(),
|
||||
// vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
|
||||
// vmParentWrap + offsetof(AmdAqlWrap, child_counter),
|
||||
// 0, dev().settings().useDeviceQueue_);
|
||||
cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(),
|
||||
vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
|
||||
vmParentWrap + offsetof(AmdAqlWrap, child_counter),
|
||||
0, dev().settings().useDeviceQueue_);
|
||||
eventEnd(MainEngine, gpuEvent);
|
||||
*/
|
||||
}
|
||||
|
||||
// Get the global loop start before the scheduler
|
||||
//Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
|
||||
//static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr()).runScheduler(
|
||||
// *gpuDefQueue->virtualQueue_,
|
||||
// *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_,
|
||||
// gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
|
||||
Unimplemented();
|
||||
/*
|
||||
mcaddr loopStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart();
|
||||
static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr()).runScheduler(
|
||||
*gpuDefQueue->virtualQueue_,
|
||||
*gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_,
|
||||
gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
|
||||
const static bool FlushL2 = true;
|
||||
gpuDefQueue->flushCUCaches(FlushL2);
|
||||
|
||||
// Get the address of the PM4 template and write it to params
|
||||
//! @note DMA flush must not occur between patch and the scheduler
|
||||
mcaddr patchStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart();
|
||||
*/
|
||||
Pal::gpusize patchStart = 0;
|
||||
//Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
|
||||
// Program parameters for the scheduler
|
||||
SchedulerParam* param = &reinterpret_cast<SchedulerParam*>
|
||||
(gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_];
|
||||
@@ -2170,28 +2168,31 @@ VirtualGPU::submitKernelInternal(
|
||||
|
||||
Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress() +
|
||||
gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam);
|
||||
Unimplemented();
|
||||
/*
|
||||
gpuDefQueue->eventBegin(MainEngine);
|
||||
//gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd(
|
||||
// signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num /
|
||||
// (DeviceQueueMaskSize * maskGroups_));
|
||||
// Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call.
|
||||
// Thus TS command for profiling has to follow in the next CB.
|
||||
constexpr bool ForceSubmitFirst = true;
|
||||
gpuDefQueue->eventEnd(MainEngine, gpuEvent, ForceSubmitFirst);
|
||||
|
||||
gpuDefQueue->cs()->VirtualQueueDispatcherEnd(
|
||||
gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_,
|
||||
signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num /
|
||||
(DeviceQueueMaskSize * maskGroups_));
|
||||
gpuDefQueue->eventEnd(MainEngine, gpuEvent);
|
||||
*/
|
||||
// Set GPU event for the used resources
|
||||
for (uint i = 0; i < memList.size(); ++i) {
|
||||
memList[i]->setBusy(*gpuDefQueue, gpuEvent);
|
||||
}
|
||||
|
||||
if (dev().settings().useDeviceQueue_) {
|
||||
Unimplemented();
|
||||
/*
|
||||
// Add the termination handshake to the host queue
|
||||
eventBegin(MainEngine);
|
||||
//iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(),
|
||||
// vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
|
||||
// vmParentWrap + offsetof(AmdAqlWrap, child_counter),
|
||||
// signalAddr, dev().settings().useDeviceQueue_);
|
||||
cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(),
|
||||
vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
|
||||
vmParentWrap + offsetof(AmdAqlWrap, child_counter),
|
||||
signalAddr, dev().settings().useDeviceQueue_);
|
||||
eventEnd(MainEngine, gpuEvent);
|
||||
*/
|
||||
}
|
||||
|
||||
++gpuDefQueue->schedParamIdx_ %=
|
||||
@@ -3249,7 +3250,7 @@ VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable)
|
||||
{
|
||||
const static bool Wait = true;
|
||||
vqHeader_->kernel_table = kernelTable;
|
||||
virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, Wait);
|
||||
virtualQueue_->writeRawData(hostQ, sizeof(AmdVQueueHeader), vqHeader_, !Wait);
|
||||
}
|
||||
|
||||
void
|
||||
|
||||
@@ -79,7 +79,7 @@ public:
|
||||
|
||||
//! Flushes the current command buffer to HW
|
||||
//! Returns ID associated with the submission
|
||||
uint submit(bool forceFlush);
|
||||
uint submit();
|
||||
|
||||
bool flush();
|
||||
|
||||
@@ -401,17 +401,15 @@ public:
|
||||
//! Returns queue, associated with VirtualGPU
|
||||
Queue& queue(EngineType id) const { return *queues_[id]; }
|
||||
|
||||
void flushCUCaches(bool flushL2 = false) const
|
||||
void flushCUCaches() const
|
||||
{
|
||||
Pal::BarrierInfo barrier = {};
|
||||
barrier.pipePointWaitCount = 1;
|
||||
Pal::HwPipePoint point = Pal::HwPipePostCs;
|
||||
barrier.pPipePoints = &point;
|
||||
barrier.transitionCount = 1;
|
||||
uint32_t cacheMask = (flushL2) ? Pal::CoherCopy : Pal::CoherShader;
|
||||
Pal::BarrierTransition trans = { cacheMask, cacheMask,
|
||||
{ nullptr, { { Pal::ImageAspect::Color, 0, 0 }, 0, 0 },
|
||||
Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
|
||||
Pal::BarrierTransition trans = {Pal::CoherShader, Pal::CoherShader,
|
||||
{nullptr, { {Pal::ImageAspect::Color, 0, 0}, 0, 0 }, Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
|
||||
barrier.pTransitions = &trans;
|
||||
barrier.waitPoint = Pal::HwPipePreCs;
|
||||
iCmd()->CmdBarrier(barrier);
|
||||
@@ -422,17 +420,10 @@ public:
|
||||
profileEvent(engId, Begin);
|
||||
}
|
||||
|
||||
void eventEnd(EngineType engId, GpuEvent& event, bool forceExec = false) const {
|
||||
constexpr bool End = false;
|
||||
if (forceExec) {
|
||||
constexpr bool ForceFlush = true;
|
||||
event.id = queues_[engId]->submit(ForceFlush);
|
||||
profileEvent(engId, End);
|
||||
}
|
||||
else {
|
||||
profileEvent(engId, End);
|
||||
event.id = queues_[engId]->submit(GPU_FLUSH_ON_EXECUTION);
|
||||
}
|
||||
void eventEnd(EngineType engId, GpuEvent& event) const {
|
||||
const static bool End = false;
|
||||
profileEvent(engId, End);
|
||||
event.id = queues_[engId]->submit();
|
||||
event.engineId_ = engId;
|
||||
}
|
||||
|
||||
|
||||
새 이슈에서 참조
사용자 차단