P4 to Git Change 1288095 by jsjodin@jsjodin-git2p4-llvm on 2016/07/06 18:08:24
SWDEV-3 - IR: Set TargetPrefix for some X86 and AArch64 intrinsics where it was missing

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@274390 91177308-0d34-0410-b5e6-96231b3b80d8
GitHash: f0a4c116041f7c2aef7796c8b067f0947b69602d

Affected files ...

... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/include/llvm/IR/IntrinsicsAArch64.td#2 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/include/llvm/IR/IntrinsicsX86.td#2 edit
Этот коммит содержится в:
@@ -2678,7 +2678,7 @@ KernelBlitManager::writeRawData(
|
||||
const void* data
|
||||
) const
|
||||
{
|
||||
static_cast<pal::Memory&>(memory).writeRawData(gpu(), 0, size, data, false);
|
||||
static_cast<pal::Memory&>(memory).writeRawData(gpu(), size, data, false);
|
||||
|
||||
synchronize();
|
||||
}
|
||||
|
||||
@@ -706,7 +706,6 @@ Device::create(Pal::IDevice* device)
|
||||
// palSettings ...
|
||||
palSettings->textureOptLevel = Pal::TextureFilterOptimizationsDisabled;
|
||||
palSettings->forceHighClocks = appProfile_.enableHighPerformanceState();
|
||||
palSettings->cmdBufBatchedSubmitChainLimit = 0;
|
||||
|
||||
// Commit the new settings for the device
|
||||
result = iDev()->CommitSettingsAndInit();
|
||||
|
||||
@@ -619,7 +619,7 @@ PrintfDbgHSA::init(
|
||||
|
||||
// Copy offset and number of bytes available for printf data
|
||||
// into the corresponding location in the debug buffer
|
||||
dbgBuffer_->writeRawData(gpu, 0, initSize, sysMem, true);
|
||||
dbgBuffer_->writeRawData(gpu, initSize, sysMem, true);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1072,10 +1072,9 @@ Resource::free()
|
||||
void
|
||||
Resource::writeRawData(
|
||||
VirtualGPU& gpu,
|
||||
size_t offset,
|
||||
size_t size,
|
||||
size_t size,
|
||||
const void* data,
|
||||
bool waitForEvent) const
|
||||
bool waitForEvent) const
|
||||
{
|
||||
GpuEvent event;
|
||||
|
||||
@@ -1083,8 +1082,11 @@ Resource::writeRawData(
|
||||
// size needs to be DWORD aligned
|
||||
assert((size & 3) == 0);
|
||||
gpu.eventBegin(MainEngine);
|
||||
//! @todo Remove cache flush
|
||||
//! It's a workaround for a PAL crash with embedded data, allocated before any command
|
||||
gpu.flushCUCaches();
|
||||
gpu.queue(MainEngine).addCmdMemRef(iMem());
|
||||
gpu.iCmd()->CmdUpdateMemory(*iMem(), offset, size, reinterpret_cast<const uint32_t*>(data));
|
||||
gpu.iCmd()->CmdUpdateMemory(*iMem(), 0, size, reinterpret_cast<const uint32_t*>(data));
|
||||
gpu.eventEnd(MainEngine, event);
|
||||
|
||||
setBusy(gpu, event);
|
||||
@@ -1936,7 +1938,7 @@ Resource::warmUpRenames(VirtualGPU& gpu)
|
||||
uint dummy = 0;
|
||||
const bool NoWait = false;
|
||||
// Write 0 for the buffer paging by VidMM
|
||||
writeRawData(gpu, 0, sizeof(dummy), &dummy, NoWait);
|
||||
writeRawData(gpu, sizeof(dummy), &dummy, NoWait);
|
||||
const bool Force = true;
|
||||
rename(gpu, Force);
|
||||
}
|
||||
|
||||
@@ -240,11 +240,10 @@ public:
|
||||
*
|
||||
*/
|
||||
void writeRawData(
|
||||
VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
size_t offset, //!< Offset in the buffer for data
|
||||
size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS)
|
||||
const void* data, //!< Data to be copied
|
||||
bool waitForEvent //!< Wait for event complete
|
||||
VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS)
|
||||
const void* data, //!< Data to be copied
|
||||
bool waitForEvent //!< Wait for event complete
|
||||
) const;
|
||||
|
||||
//! Returns the offset in GPU memory for aliases
|
||||
|
||||
@@ -155,11 +155,11 @@ VirtualGPU::Queue::removeCmdMemRef(Pal::IGpuMemory* iMem)
|
||||
}
|
||||
|
||||
uint
|
||||
VirtualGPU::Queue::submit(bool forceFlush)
|
||||
VirtualGPU::Queue::submit()
|
||||
{
|
||||
cmdCnt_++;
|
||||
uint id = cmdBufIdCurrent_;
|
||||
if ((cmdCnt_ > MaxCommands) || forceFlush) {
|
||||
if ((cmdCnt_ > MaxCommands) || GPU_FLUSH_ON_EXECUTION) {
|
||||
if (!flush()) {
|
||||
return GpuEvent::InvalidID;
|
||||
}
|
||||
@@ -238,11 +238,6 @@ VirtualGPU::Queue::flush()
|
||||
return false;
|
||||
}
|
||||
|
||||
// Reset command buffer, so CB chunks could be reused
|
||||
if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Reset(nullptr, false)) {
|
||||
LogError("PAL failed CB reset!");
|
||||
return false;
|
||||
}
|
||||
// Start command buffer building
|
||||
Pal::CmdBufferBuildInfo cmdBuildInfo = {};
|
||||
if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Begin(cmdBuildInfo)) {
|
||||
@@ -596,44 +591,41 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
|
||||
if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (GPU_PRINT_CHILD_KERNEL != 0) {
|
||||
address ptr = reinterpret_cast<address>(
|
||||
virtualQueue_->map(this, Resource::WriteOnly));
|
||||
if (nullptr == ptr) {
|
||||
return false;
|
||||
}
|
||||
address ptr = reinterpret_cast<address>(
|
||||
virtualQueue_->map(this, Resource::WriteOnly));
|
||||
if (nullptr == ptr) {
|
||||
return false;
|
||||
}
|
||||
// Clear memory
|
||||
memset(ptr, 0, allocSize);
|
||||
uint64_t vaBase = virtualQueue_->vmAddress();
|
||||
AmdVQueueHeader* header = reinterpret_cast<AmdVQueueHeader*>(ptr);
|
||||
|
||||
uint64_t vaBase = virtualQueue_->vmAddress();
|
||||
AmdVQueueHeader header = {};
|
||||
// Initialize the virtual queue header
|
||||
header.aql_slot_num = numSlots;
|
||||
header.event_slot_num = dev().settings().numDeviceEvents_;
|
||||
header.event_slot_mask = vaBase + eventMaskOffs;
|
||||
header.event_slots = vaBase + eventsOffs;
|
||||
header.aql_slot_mask = vaBase + slotMaskOffs;
|
||||
header.wait_size = dev().settings().numWaitEvents_;
|
||||
header.arg_size = dev().info().maxParameterSize_ + 64;
|
||||
header.mask_groups = maskGroups_;
|
||||
|
||||
header->aql_slot_num = numSlots;
|
||||
header->event_slot_num = dev().settings().numDeviceEvents_;
|
||||
header->event_slot_mask = vaBase + eventMaskOffs;
|
||||
header->event_slots = vaBase + eventsOffs;
|
||||
header->aql_slot_mask = vaBase + slotMaskOffs;
|
||||
header->wait_size = dev().settings().numWaitEvents_;
|
||||
header->arg_size = dev().info().maxParameterSize_ + 64;
|
||||
header->mask_groups = maskGroups_;
|
||||
vqHeader_ = new AmdVQueueHeader;
|
||||
if (nullptr == vqHeader_) {
|
||||
return false;
|
||||
}
|
||||
*vqHeader_ = header;
|
||||
|
||||
virtualQueue_->writeRawData(*this, 0, sizeof(AmdVQueueHeader), &header, false);
|
||||
*vqHeader_ = *header;
|
||||
|
||||
// Go over all slots and perform initialization
|
||||
AmdAqlWrap slot = {};
|
||||
size_t offset = sizeof(AmdVQueueHeader);
|
||||
AmdAqlWrap* slots = reinterpret_cast<AmdAqlWrap*>(&header[1]);
|
||||
for (uint i = 0; i < numSlots; ++i) {
|
||||
uint64_t argStart = vaBase + argOffs + i * singleArgSize;
|
||||
slot.aql.kernarg_address = reinterpret_cast<void*>(argStart);
|
||||
slot.wait_list = argStart + dev().info().maxParameterSize_ + 64;
|
||||
virtualQueue_->writeRawData(*this, offset, sizeof(AmdAqlWrap), &slot, false);
|
||||
offset += sizeof(AmdAqlWrap);
|
||||
slots[i].aql.kernarg_address = reinterpret_cast<void*>(argStart);
|
||||
slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64;
|
||||
}
|
||||
// Upload data back to local memory
|
||||
if (GPU_PRINT_CHILD_KERNEL == 0) {
|
||||
virtualQueue_->unmap(this);
|
||||
}
|
||||
|
||||
schedParams_ = new Memory(dev(), 64 * Ki);
|
||||
@@ -641,7 +633,7 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
|
||||
return false;
|
||||
}
|
||||
|
||||
address ptr = reinterpret_cast<address>(schedParams_->map(this));
|
||||
ptr = reinterpret_cast<address>(schedParams_->map(this));
|
||||
|
||||
deviceQueueSize_ = deviceQueueSize;
|
||||
|
||||
@@ -697,9 +689,9 @@ VirtualGPU::create(bool profiling, uint deviceQueueSize)
|
||||
state_.profiling_ = profiling;
|
||||
|
||||
Pal::CmdAllocatorCreateInfo createInfo = {};
|
||||
createInfo.flags.threadSafe = true;
|
||||
// \todo forces PAL to reuse CBs, but requires postamble
|
||||
createInfo.flags.autoMemoryReuse = false;
|
||||
createInfo.flags.autoMemoryReuse = true;
|
||||
createInfo.flags.threadSafe = false;
|
||||
createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap =
|
||||
Pal::GpuHeapGartCacheable;
|
||||
createInfo.allocInfo[Pal::CommandDataAlloc].allocSize = 128 * Ki;
|
||||
@@ -2108,28 +2100,34 @@ VirtualGPU::submitKernelInternal(
|
||||
}
|
||||
|
||||
if (!dev().settings().useDeviceQueue_) {
|
||||
Unimplemented();
|
||||
/*
|
||||
// Add the termination handshake to the host queue
|
||||
eventBegin(MainEngine);
|
||||
//iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(),
|
||||
// vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
|
||||
// vmParentWrap + offsetof(AmdAqlWrap, child_counter),
|
||||
// 0, dev().settings().useDeviceQueue_);
|
||||
cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(),
|
||||
vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
|
||||
vmParentWrap + offsetof(AmdAqlWrap, child_counter),
|
||||
0, dev().settings().useDeviceQueue_);
|
||||
eventEnd(MainEngine, gpuEvent);
|
||||
*/
|
||||
}
|
||||
|
||||
// Get the global loop start before the scheduler
|
||||
//Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
|
||||
//static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr()).runScheduler(
|
||||
// *gpuDefQueue->virtualQueue_,
|
||||
// *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_,
|
||||
// gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
|
||||
Unimplemented();
|
||||
/*
|
||||
mcaddr loopStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart();
|
||||
static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr()).runScheduler(
|
||||
*gpuDefQueue->virtualQueue_,
|
||||
*gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_,
|
||||
gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
|
||||
const static bool FlushL2 = true;
|
||||
gpuDefQueue->flushCUCaches(FlushL2);
|
||||
|
||||
// Get the address of the PM4 template and write it to params
|
||||
//! @note DMA flush must not occur between patch and the scheduler
|
||||
mcaddr patchStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart();
|
||||
*/
|
||||
Pal::gpusize patchStart = 0;
|
||||
//Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
|
||||
// Program parameters for the scheduler
|
||||
SchedulerParam* param = &reinterpret_cast<SchedulerParam*>
|
||||
(gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_];
|
||||
@@ -2170,28 +2168,31 @@ VirtualGPU::submitKernelInternal(
|
||||
|
||||
Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress() +
|
||||
gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam);
|
||||
Unimplemented();
|
||||
/*
|
||||
gpuDefQueue->eventBegin(MainEngine);
|
||||
//gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd(
|
||||
// signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num /
|
||||
// (DeviceQueueMaskSize * maskGroups_));
|
||||
// Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call.
|
||||
// Thus TS command for profiling has to follow in the next CB.
|
||||
constexpr bool ForceSubmitFirst = true;
|
||||
gpuDefQueue->eventEnd(MainEngine, gpuEvent, ForceSubmitFirst);
|
||||
|
||||
gpuDefQueue->cs()->VirtualQueueDispatcherEnd(
|
||||
gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_,
|
||||
signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num /
|
||||
(DeviceQueueMaskSize * maskGroups_));
|
||||
gpuDefQueue->eventEnd(MainEngine, gpuEvent);
|
||||
*/
|
||||
// Set GPU event for the used resources
|
||||
for (uint i = 0; i < memList.size(); ++i) {
|
||||
memList[i]->setBusy(*gpuDefQueue, gpuEvent);
|
||||
}
|
||||
|
||||
if (dev().settings().useDeviceQueue_) {
|
||||
Unimplemented();
|
||||
/*
|
||||
// Add the termination handshake to the host queue
|
||||
eventBegin(MainEngine);
|
||||
//iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(),
|
||||
// vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
|
||||
// vmParentWrap + offsetof(AmdAqlWrap, child_counter),
|
||||
// signalAddr, dev().settings().useDeviceQueue_);
|
||||
cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(),
|
||||
vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
|
||||
vmParentWrap + offsetof(AmdAqlWrap, child_counter),
|
||||
signalAddr, dev().settings().useDeviceQueue_);
|
||||
eventEnd(MainEngine, gpuEvent);
|
||||
*/
|
||||
}
|
||||
|
||||
++gpuDefQueue->schedParamIdx_ %=
|
||||
@@ -3249,7 +3250,7 @@ VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable)
|
||||
{
|
||||
const static bool Wait = true;
|
||||
vqHeader_->kernel_table = kernelTable;
|
||||
virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, Wait);
|
||||
virtualQueue_->writeRawData(hostQ, sizeof(AmdVQueueHeader), vqHeader_, !Wait);
|
||||
}
|
||||
|
||||
void
|
||||
|
||||
@@ -79,7 +79,7 @@ public:
|
||||
|
||||
//! Flushes the current command buffer to HW
|
||||
//! Returns ID associated with the submission
|
||||
uint submit(bool forceFlush);
|
||||
uint submit();
|
||||
|
||||
bool flush();
|
||||
|
||||
@@ -401,17 +401,15 @@ public:
|
||||
//! Returns queue, associated with VirtualGPU
|
||||
Queue& queue(EngineType id) const { return *queues_[id]; }
|
||||
|
||||
void flushCUCaches(bool flushL2 = false) const
|
||||
void flushCUCaches() const
|
||||
{
|
||||
Pal::BarrierInfo barrier = {};
|
||||
barrier.pipePointWaitCount = 1;
|
||||
Pal::HwPipePoint point = Pal::HwPipePostCs;
|
||||
barrier.pPipePoints = &point;
|
||||
barrier.transitionCount = 1;
|
||||
uint32_t cacheMask = (flushL2) ? Pal::CoherCopy : Pal::CoherShader;
|
||||
Pal::BarrierTransition trans = { cacheMask, cacheMask,
|
||||
{ nullptr, { { Pal::ImageAspect::Color, 0, 0 }, 0, 0 },
|
||||
Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
|
||||
Pal::BarrierTransition trans = {Pal::CoherShader, Pal::CoherShader,
|
||||
{nullptr, { {Pal::ImageAspect::Color, 0, 0}, 0, 0 }, Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
|
||||
barrier.pTransitions = &trans;
|
||||
barrier.waitPoint = Pal::HwPipePreCs;
|
||||
iCmd()->CmdBarrier(barrier);
|
||||
@@ -422,17 +420,10 @@ public:
|
||||
profileEvent(engId, Begin);
|
||||
}
|
||||
|
||||
void eventEnd(EngineType engId, GpuEvent& event, bool forceExec = false) const {
|
||||
constexpr bool End = false;
|
||||
if (forceExec) {
|
||||
constexpr bool ForceFlush = true;
|
||||
event.id = queues_[engId]->submit(ForceFlush);
|
||||
profileEvent(engId, End);
|
||||
}
|
||||
else {
|
||||
profileEvent(engId, End);
|
||||
event.id = queues_[engId]->submit(GPU_FLUSH_ON_EXECUTION);
|
||||
}
|
||||
void eventEnd(EngineType engId, GpuEvent& event) const {
|
||||
const static bool End = false;
|
||||
profileEvent(engId, End);
|
||||
event.id = queues_[engId]->submit();
|
||||
event.engineId_ = engId;
|
||||
}
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user