P4 to Git Change 1288113 by jsjodin@jsjodin-git2p4-llvm on 2016/07/06 18:23:12

SWDEV-3 - AMDGPU: Expand unaligned accesses early

	Due to visit order problems, in the case of an unaligned copy
	the legalized DAG fails to eliminate extra instructions introduced
	by the expansion of both unaligned parts.

	git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@274397 91177308-0d34-0410-b5e6-96231b3b80d8

	GitHash: d4452f8fcf496a2e19c1a1c9792f5f063f4e9703

Affected files ...

... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/lib/Target/AMDGPU/AMDGPUISelLowering.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/lib/Target/AMDGPU/AMDGPUISelLowering.h#3 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll#3 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll#1 add
... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/test/CodeGen/AMDGPU/sext-in-reg.ll#3 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/test/CodeGen/AMDGPU/unaligned-load-store.ll#2 edit


[ROCm/clr commit: 739bdacc65]
이 커밋은 다음에 포함됨:
foreman
2016-07-07 03:52:33 -04:00
부모 10baecf582
커밋 13b474485b
7개의 변경된 파일, 82개의 추가 그리고 90개의 삭제
+1 -1
파일 보기
@@ -2678,7 +2678,7 @@ KernelBlitManager::writeRawData(
const void* data
) const
{
static_cast<pal::Memory&>(memory).writeRawData(gpu(), 0, size, data, false);
static_cast<pal::Memory&>(memory).writeRawData(gpu(), size, data, false);
synchronize();
}
-1
파일 보기
@@ -706,7 +706,6 @@ Device::create(Pal::IDevice* device)
// palSettings ...
palSettings->textureOptLevel = Pal::TextureFilterOptimizationsDisabled;
palSettings->forceHighClocks = appProfile_.enableHighPerformanceState();
palSettings->cmdBufBatchedSubmitChainLimit = 0;
// Commit the new settings for the device
result = iDev()->CommitSettingsAndInit();
+1 -1
파일 보기
@@ -619,7 +619,7 @@ PrintfDbgHSA::init(
// Copy offset and number of bytes available for printf data
// into the corresponding location in the debug buffer
dbgBuffer_->writeRawData(gpu, 0, initSize, sysMem, true);
dbgBuffer_->writeRawData(gpu, initSize, sysMem, true);
}
return true;
}
+7 -5
파일 보기
@@ -1072,10 +1072,9 @@ Resource::free()
void
Resource::writeRawData(
VirtualGPU& gpu,
size_t offset,
size_t size,
size_t size,
const void* data,
bool waitForEvent) const
bool waitForEvent) const
{
GpuEvent event;
@@ -1083,8 +1082,11 @@ Resource::writeRawData(
// size needs to be DWORD aligned
assert((size & 3) == 0);
gpu.eventBegin(MainEngine);
//! @todo Remove cache flush
//! It's a workaround for a PAL crash with embedded data, allocated before any command
gpu.flushCUCaches();
gpu.queue(MainEngine).addCmdMemRef(iMem());
gpu.iCmd()->CmdUpdateMemory(*iMem(), offset, size, reinterpret_cast<const uint32_t*>(data));
gpu.iCmd()->CmdUpdateMemory(*iMem(), 0, size, reinterpret_cast<const uint32_t*>(data));
gpu.eventEnd(MainEngine, event);
setBusy(gpu, event);
@@ -1936,7 +1938,7 @@ Resource::warmUpRenames(VirtualGPU& gpu)
uint dummy = 0;
const bool NoWait = false;
// Write 0 for the buffer paging by VidMM
writeRawData(gpu, 0, sizeof(dummy), &dummy, NoWait);
writeRawData(gpu, sizeof(dummy), &dummy, NoWait);
const bool Force = true;
rename(gpu, Force);
}
+4 -5
파일 보기
@@ -240,11 +240,10 @@ public:
*
*/
void writeRawData(
VirtualGPU& gpu, //!< Virtual GPU device object
size_t offset, //!< Offset for in the buffer for data
size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS)
const void* data, //!< Data to be copied
bool waitForEvent //!< Wait for event complete
VirtualGPU& gpu, //!< Virtual GPU device object
size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS)
const void* data, //!< Data to be copied
bool waitForEvent //!< Wait for event complete
) const;
//! Returns the offset in GPU memory for aliases
+61 -60
파일 보기
@@ -155,11 +155,11 @@ VirtualGPU::Queue::removeCmdMemRef(Pal::IGpuMemory* iMem)
}
uint
VirtualGPU::Queue::submit(bool forceFlush)
VirtualGPU::Queue::submit()
{
cmdCnt_++;
uint id = cmdBufIdCurrent_;
if ((cmdCnt_ > MaxCommands) || forceFlush) {
if ((cmdCnt_ > MaxCommands) || GPU_FLUSH_ON_EXECUTION) {
if (!flush()) {
return GpuEvent::InvalidID;
}
@@ -238,11 +238,6 @@ VirtualGPU::Queue::flush()
return false;
}
// Reset command buffer, so CB chunks could be reused
if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Reset(nullptr, false)) {
LogError("PAL failed CB reset!");
return false;
}
// Start command buffer building
Pal::CmdBufferBuildInfo cmdBuildInfo = {};
if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Begin(cmdBuildInfo)) {
@@ -596,44 +591,41 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) {
return false;
}
if (GPU_PRINT_CHILD_KERNEL != 0) {
address ptr = reinterpret_cast<address>(
virtualQueue_->map(this, Resource::WriteOnly));
if (nullptr == ptr) {
return false;
}
address ptr = reinterpret_cast<address>(
virtualQueue_->map(this, Resource::WriteOnly));
if (nullptr == ptr) {
return false;
}
// Clear memory
memset(ptr, 0, allocSize);
uint64_t vaBase = virtualQueue_->vmAddress();
AmdVQueueHeader* header = reinterpret_cast<AmdVQueueHeader*>(ptr);
uint64_t vaBase = virtualQueue_->vmAddress();
AmdVQueueHeader header = {};
// Initialize the virtual queue header
header.aql_slot_num = numSlots;
header.event_slot_num = dev().settings().numDeviceEvents_;
header.event_slot_mask = vaBase + eventMaskOffs;
header.event_slots = vaBase + eventsOffs;
header.aql_slot_mask = vaBase + slotMaskOffs;
header.wait_size = dev().settings().numWaitEvents_;
header.arg_size = dev().info().maxParameterSize_ + 64;
header.mask_groups = maskGroups_;
header->aql_slot_num = numSlots;
header->event_slot_num = dev().settings().numDeviceEvents_;
header->event_slot_mask = vaBase + eventMaskOffs;
header->event_slots = vaBase + eventsOffs;
header->aql_slot_mask = vaBase + slotMaskOffs;
header->wait_size = dev().settings().numWaitEvents_;
header->arg_size = dev().info().maxParameterSize_ + 64;
header->mask_groups = maskGroups_;
vqHeader_ = new AmdVQueueHeader;
if (nullptr == vqHeader_) {
return false;
}
*vqHeader_ = header;
virtualQueue_->writeRawData(*this, 0, sizeof(AmdVQueueHeader), &header, false);
*vqHeader_ = *header;
// Go over all slots and perform initialization
AmdAqlWrap slot = {};
size_t offset = sizeof(AmdVQueueHeader);
AmdAqlWrap* slots = reinterpret_cast<AmdAqlWrap*>(&header[1]);
for (uint i = 0; i < numSlots; ++i) {
uint64_t argStart = vaBase + argOffs + i * singleArgSize;
slot.aql.kernarg_address = reinterpret_cast<void*>(argStart);
slot.wait_list = argStart + dev().info().maxParameterSize_ + 64;
virtualQueue_->writeRawData(*this, offset, sizeof(AmdAqlWrap), &slot, false);
offset += sizeof(AmdAqlWrap);
slots[i].aql.kernarg_address = reinterpret_cast<void*>(argStart);
slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64;
}
// Upload data back to local memory
if (GPU_PRINT_CHILD_KERNEL == 0) {
virtualQueue_->unmap(this);
}
schedParams_ = new Memory(dev(), 64 * Ki);
@@ -641,7 +633,7 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
return false;
}
address ptr = reinterpret_cast<address>(schedParams_->map(this));
ptr = reinterpret_cast<address>(schedParams_->map(this));
deviceQueueSize_ = deviceQueueSize;
@@ -697,9 +689,9 @@ VirtualGPU::create(bool profiling, uint deviceQueueSize)
state_.profiling_ = profiling;
Pal::CmdAllocatorCreateInfo createInfo = {};
createInfo.flags.threadSafe = true;
// \todo forces PAL to reuse CBs, but requires postamble
createInfo.flags.autoMemoryReuse = false;
createInfo.flags.autoMemoryReuse = true;
createInfo.flags.threadSafe = false;
createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap =
Pal::GpuHeapGartCacheable;
createInfo.allocInfo[Pal::CommandDataAlloc].allocSize = 128 * Ki;
@@ -2108,28 +2100,34 @@ VirtualGPU::submitKernelInternal(
}
if (!dev().settings().useDeviceQueue_) {
Unimplemented();
/*
// Add the termination handshake to the host queue
eventBegin(MainEngine);
//iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(),
// vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
// vmParentWrap + offsetof(AmdAqlWrap, child_counter),
// 0, dev().settings().useDeviceQueue_);
cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(),
vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
vmParentWrap + offsetof(AmdAqlWrap, child_counter),
0, dev().settings().useDeviceQueue_);
eventEnd(MainEngine, gpuEvent);
*/
}
// Get the global loop start before the scheduler
//Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
//static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr()).runScheduler(
// *gpuDefQueue->virtualQueue_,
// *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_,
// gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
Unimplemented();
/*
mcaddr loopStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart();
static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr()).runScheduler(
*gpuDefQueue->virtualQueue_,
*gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_,
gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
const static bool FlushL2 = true;
gpuDefQueue->flushCUCaches(FlushL2);
// Get the address of PM4 template and add write it to params
//! @note DMA flush must not occur between patch and the scheduler
mcaddr patchStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart();
*/
Pal::gpusize patchStart = 0;
//Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
// Program parameters for the scheduler
SchedulerParam* param = &reinterpret_cast<SchedulerParam*>
(gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_];
@@ -2170,28 +2168,31 @@ VirtualGPU::submitKernelInternal(
Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress() +
gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam);
Unimplemented();
/*
gpuDefQueue->eventBegin(MainEngine);
//gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd(
// signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num /
// (DeviceQueueMaskSize * maskGroups_));
// Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call.
// Thus TS command for profiling has to follow in the next CB.
constexpr bool ForceSubmitFirst = true;
gpuDefQueue->eventEnd(MainEngine, gpuEvent, ForceSubmitFirst);
gpuDefQueue->cs()->VirtualQueueDispatcherEnd(
gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_,
signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num /
(DeviceQueueMaskSize * maskGroups_));
gpuDefQueue->eventEnd(MainEngine, gpuEvent);
*/
// Set GPU event for the used resources
for (uint i = 0; i < memList.size(); ++i) {
memList[i]->setBusy(*gpuDefQueue, gpuEvent);
}
if (dev().settings().useDeviceQueue_) {
Unimplemented();
/*
// Add the termination handshake to the host queue
eventBegin(MainEngine);
//iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(),
// vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
// vmParentWrap + offsetof(AmdAqlWrap, child_counter),
// signalAddr, dev().settings().useDeviceQueue_);
cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(),
vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
vmParentWrap + offsetof(AmdAqlWrap, child_counter),
signalAddr, dev().settings().useDeviceQueue_);
eventEnd(MainEngine, gpuEvent);
*/
}
++gpuDefQueue->schedParamIdx_ %=
@@ -3249,7 +3250,7 @@ VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable)
{
const static bool Wait = true;
vqHeader_->kernel_table = kernelTable;
virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, Wait);
virtualQueue_->writeRawData(hostQ, sizeof(AmdVQueueHeader), vqHeader_, !Wait);
}
void
+8 -17
파일 보기
@@ -79,7 +79,7 @@ public:
//! Flushes the current command buffer to HW
//! Returns ID associated with the submission
uint submit(bool forceFlush);
uint submit();
bool flush();
@@ -401,17 +401,15 @@ public:
//! Returns queue, associated with VirtualGPU
Queue& queue(EngineType id) const { return *queues_[id]; }
void flushCUCaches(bool flushL2 = false) const
void flushCUCaches() const
{
Pal::BarrierInfo barrier = {};
barrier.pipePointWaitCount = 1;
Pal::HwPipePoint point = Pal::HwPipePostCs;
barrier.pPipePoints = &point;
barrier.transitionCount = 1;
uint32_t cacheMask = (flushL2) ? Pal::CoherCopy : Pal::CoherShader;
Pal::BarrierTransition trans = { cacheMask, cacheMask,
{ nullptr, { { Pal::ImageAspect::Color, 0, 0 }, 0, 0 },
Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
Pal::BarrierTransition trans = {Pal::CoherShader, Pal::CoherShader,
{nullptr, { {Pal::ImageAspect::Color, 0, 0}, 0, 0 }, Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
barrier.pTransitions = &trans;
barrier.waitPoint = Pal::HwPipePreCs;
iCmd()->CmdBarrier(barrier);
@@ -422,17 +420,10 @@ public:
profileEvent(engId, Begin);
}
void eventEnd(EngineType engId, GpuEvent& event, bool forceExec = false) const {
constexpr bool End = false;
if (forceExec) {
constexpr bool ForceFlush = true;
event.id = queues_[engId]->submit(ForceFlush);
profileEvent(engId, End);
}
else {
profileEvent(engId, End);
event.id = queues_[engId]->submit(GPU_FLUSH_ON_EXECUTION);
}
void eventEnd(EngineType engId, GpuEvent& event) const {
const static bool End = false;
profileEvent(engId, End);
event.id = queues_[engId]->submit();
event.engineId_ = engId;
}