From a340e36035115ced77bc4bc2acb2fb18c261b9c7 Mon Sep 17 00:00:00 2001
From: foreman
Date: Wed, 23 Mar 2016 18:09:21 -0400
Subject: [PATCH] P4 to Git Change 1250684 by gandryey@gera-w8 on 2016/03/23
17:59:05
SWDEV-86035 - Add PAL backend to OpenCL
- Update PAL backend to match the latest PAL interfaces
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/Makefile#2 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/build/Makefile.pal#2 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/build/Makefile#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/build/Makefile.palbe#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#2 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#2 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#2 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#2 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#2 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#2 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#2 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#2 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#2 edit
---
rocclr/runtime/device/pal/palblit.cpp | 12 ++++-
rocclr/runtime/device/pal/paldevice.cpp | 67 +++++++++---------------
rocclr/runtime/device/pal/paldevice.hpp | 14 ++---
rocclr/runtime/device/pal/palkernel.cpp | 58 +++++++++++---------
rocclr/runtime/device/pal/palkernel.hpp | 2 +-
rocclr/runtime/device/pal/palprogram.cpp | 2 +-
rocclr/runtime/device/pal/palprogram.hpp | 5 ++
rocclr/runtime/device/pal/palvirtual.cpp | 9 ++--
rocclr/runtime/device/pal/palvirtual.hpp | 3 +-
9 files changed, 87 insertions(+), 85 deletions(-)
diff --git a/rocclr/runtime/device/pal/palblit.cpp b/rocclr/runtime/device/pal/palblit.cpp
index 386926d714..ab5f713c8b 100644
--- a/rocclr/runtime/device/pal/palblit.cpp
+++ b/rocclr/runtime/device/pal/palblit.cpp
@@ -937,6 +937,8 @@ KernelBlitManager::copyBufferToImage(
static const bool CopyRect = false;
// Flush DMA for ASYNC copy
static const bool FlushDMA = true;
+ size_t imgRowPitch = size[0] * gpuMem(dstMemory).elementSize();
+ size_t imgSlicePitch = imgRowPitch * size[1];
if (setup_.disableCopyBufferToImage_) {
result = DmaBlitManager::copyBufferToImage(
@@ -947,7 +949,9 @@ KernelBlitManager::copyBufferToImage(
}
// Check if buffer is in system memory with direct access
else if (gpuMem(srcMemory).isHostMemDirectAccess() &&
- (rowPitch == 0) && (slicePitch == 0)) {
+ (((rowPitch == 0) && (slicePitch == 0)) ||
+ ((rowPitch == imgRowPitch) &&
+ ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) {
// First attempt to do this all with DMA,
// but there are restriciton with older hardware
if (dev().settings().imageDMA_) {
@@ -1327,6 +1331,8 @@ KernelBlitManager::copyImageToBuffer(
static const bool CopyRect = false;
// Flush DMA for ASYNC copy
static const bool FlushDMA = true;
+ size_t imgRowPitch = size[0] * gpuMem(srcMemory).elementSize();
+ size_t imgSlicePitch = imgRowPitch * size[1];
if (setup_.disableCopyImageToBuffer_) {
result = HostBlitManager::copyImageToBuffer(
@@ -1337,7 +1343,9 @@ KernelBlitManager::copyImageToBuffer(
}
// Check if buffer is in system memory with direct access
else if (gpuMem(dstMemory).isHostMemDirectAccess() &&
- (rowPitch == 0) && (slicePitch == 0)) {
+ (((rowPitch == 0) && (slicePitch == 0)) ||
+ ((rowPitch == imgRowPitch) &&
+ ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) {
// First attempt to do this all with DMA,
// but there are restriciton with older hardware
if (dev().settings().imageDMA_) {
diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp
index 0937ed086a..f8b754fda3 100644
--- a/rocclr/runtime/device/pal/paldevice.cpp
+++ b/rocclr/runtime/device/pal/paldevice.cpp
@@ -175,10 +175,10 @@ void NullDevice::fillDeviceInfo(
info_.maxWorkItemDimensions_ = 3;
info_.maxComputeUnits_ =
- palProp.gfxipProperties.engineCore.numOfShaderEngines *
- palProp.gfxipProperties.engineCore.numOfShaderArrays *
- palProp.gfxipProperties.engineCore.numOfCUsPerShaderArray;
- info_.numberOfShaderEngines = palProp.gfxipProperties.engineCore.numOfShaderEngines;
+ palProp.gfxipProperties.shaderCore.numShaderEngines *
+ palProp.gfxipProperties.shaderCore.numShaderArrays *
+ palProp.gfxipProperties.shaderCore.numCusPerShaderArray;
+ info_.numberOfShaderEngines = palProp.gfxipProperties.shaderCore.numShaderEngines;
// SI parts are scalar. Also, reads don't need to be 128-bits to get peak rates.
// For example, float4 is not faster than float as long as all threads fetch the same
@@ -417,7 +417,7 @@ void NullDevice::fillDeviceInfo(
info_.simdPerCU_ = hwInfo()->simdPerCU_;
info_.simdWidth_ = hwInfo()->simdWidth_;
info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_;
- info_.wavefrontWidth_ = palProp.gfxipProperties.engineCore.wavefrontSize;
+ info_.wavefrontWidth_ = palProp.gfxipProperties.shaderCore.wavefrontSize;
//info_.globalMemChannels_ = calAttr.memBusWidth / 32;
//info_.globalMemChannelBanks_ = calAttr.numMemBanks;
info_.globalMemChannelBankWidth_ = hwInfo()->memChannelBankWidth_;
@@ -1541,35 +1541,34 @@ Device::createView(amd::Memory& owner, const device::Memory& parent) const
//! Attempt to bind with external graphics API's device/context
bool
-Device::bindExternalDevice(intptr_t type, void* pDevice, void* pContext, bool validateOnly)
+Device::bindExternalDevice(uint flags, void* pDevice, void* pContext, bool validateOnly)
{
assert(pDevice);
- switch (type) {
#ifdef _WIN32
- case CL_CONTEXT_D3D10_DEVICE_KHR:
+ if (flags & amd::Context::Flags::D3D10DeviceKhr) {
if (!associateD3D10Device(pDevice)) {
LogError("Failed gslD3D10Associate()");
return false;
}
- break;
- case CL_CONTEXT_D3D11_DEVICE_KHR:
+ }
+ else if (flags & amd::Context::Flags::D3D11DeviceKhr) {
if (!associateD3D11Device(pDevice)) {
LogError("Failed gslD3D11Associate()");
return false;
}
- break;
- case CL_CONTEXT_ADAPTER_D3D9_KHR:
- case CL_CONTEXT_ADAPTER_D3D9EX_KHR:
+ }
+ else if (flags & (amd::Context::Flags::D3D9DeviceKhr |
+ amd::Context::Flags::D3D9DeviceEXKhr)) {
if (!associateD3D9Device(pDevice)) {
LogWarning("D3D9<->OpenCL adapter mismatch or D3D9Associate() failure");
return false;
}
- break;
- case CL_CONTEXT_ADAPTER_DXVA_KHR:
- break;
+ }
+ else if (flags & amd::Context::Flags::D3D9DeviceVAKhr) {
+ }
#endif //_WIN32
- case CL_GL_CONTEXT_KHR:
+ if (flags & amd::Context::Flags::GLDeviceKhr) {
// Attempt to associate GSL-OGL
if (!glAssociate(pContext, pDevice)) {
if (!validateOnly) {
@@ -1577,20 +1576,15 @@ Device::bindExternalDevice(intptr_t type, void* pDevice, void* pContext, bool va
}
return false;
}
- break;
- default:
- LogError("Unknown external device!");
- return false;
- break;
}
return true;
}
bool
-Device::unbindExternalDevice(intptr_t type, void* pDevice, void* pContext, bool validateOnly)
+Device::unbindExternalDevice(uint flags, void* pDevice, void* pContext, bool validateOnly)
{
- if (type != CL_GL_CONTEXT_KHR) {
+ if ((flags & amd::Context::Flags::GLDeviceKhr) == 0) {
return true;
}
@@ -1820,8 +1814,8 @@ Device::allocScratch(uint regNum, const VirtualGPU* vgpu)
// Calculate the size of the scratch buffer for a queue
uint32_t numTotalCUs = info().maxComputeUnits_;
uint32_t numMaxWaves =
- properties().gfxipProperties.engineCore.maxScratchWavesPerCU * numTotalCUs;
- scratchBuf->size_ = properties().gfxipProperties.engineCore.wavefrontSize *
+ properties().gfxipProperties.shaderCore.maxScratchWavesPerCu * numTotalCUs;
+ scratchBuf->size_ = properties().gfxipProperties.shaderCore.wavefrontSize *
scratchBuf->regNum_ * numMaxWaves * sizeof(uint32_t);
scratchBuf->size_ = amd::alignUp(scratchBuf->size_, 0xFFFF);
scratchBuf->offset_ = offset;
@@ -1920,8 +1914,7 @@ Device::fillHwSampler(
samplerInfo.borderColorType = Pal::BorderColorType::TransparentBlack;
- // Assign defaults
- samplerInfo.filter = Pal::TexFilter::MagPointMinPointMipBase;
+ samplerInfo.filter.zFilter = Pal::XyFilterPoint;
samplerInfo.flags.unnormalizedCoords = !(state & amd::Sampler::StateNormalizedCoordsMask);
@@ -1956,24 +1949,16 @@ Device::fillHwSampler(
// Program texture filter mode
if (state == amd::Sampler::StateFilterLinear) {
- samplerInfo.filter = Pal::TexFilter::MagLinearMinLinearMipBase;
+ samplerInfo.filter.magnification = Pal::XyFilterLinear;
+ samplerInfo.filter.minification = Pal::XyFilterLinear;
+ samplerInfo.filter.zFilter = Pal::ZFilterLinear;
}
if (mipFilter == CL_FILTER_NEAREST) {
- if (state == amd::Sampler::StateFilterLinear) {
- samplerInfo.filter = Pal::TexFilter::MagLinearMinLinearMipPoint;
- }
- else {
- samplerInfo.filter = Pal::TexFilter::MagPointMinPointMipPoint;
- }
+ samplerInfo.filter.mipFilter = Pal::MipFilterPoint;
}
else if (mipFilter == CL_FILTER_LINEAR) {
- if (state == amd::Sampler::StateFilterLinear) {
- samplerInfo.filter = Pal::TexFilter::MagLinearMinLinearMipLinear;
- }
- else {
- samplerInfo.filter = Pal::TexFilter::MagPointMinPointMipLinear;
- }
+ samplerInfo.filter.mipFilter = Pal::MipFilterLinear;
}
iDev()->CreateSamplerSrds(1, &samplerInfo, hwState);
diff --git a/rocclr/runtime/device/pal/paldevice.hpp b/rocclr/runtime/device/pal/paldevice.hpp
index 8fe3347d46..7439682df1 100644
--- a/rocclr/runtime/device/pal/paldevice.hpp
+++ b/rocclr/runtime/device/pal/paldevice.hpp
@@ -91,10 +91,10 @@ public:
//! Needed for OpenGL objects on CPU device
virtual bool bindExternalDevice(
- intptr_t type, void* pDevice, void* pContext, bool validateOnly) { return true; }
+ uint flags, void* pDevice, void* pContext, bool validateOnly) { return true; }
virtual bool unbindExternalDevice(
- intptr_t type, void* pDevice, void* pContext, bool validateOnly) { return true; }
+ uint flags, void* pDevice, void* pContext, bool validateOnly) { return true; }
//! Releases non-blocking map target memory
virtual void freeMapTarget(amd::Memory& mem, void* target) {}
@@ -369,17 +369,11 @@ public:
//! Attempt to bind with external graphics API's device/context
virtual bool bindExternalDevice(
- intptr_t type,
- void* pDevice,
- void* pContext,
- bool validateOnly);
+ uint flags, void* pDevice, void* pContext, bool validateOnly);
//! Attempt to unbind with external graphics API's device/context
virtual bool unbindExternalDevice(
- intptr_t type,
- void* pDevice,
- void* pContext,
- bool validateOnly);
+ uint flags, void* pDevice, void* pContext, bool validateOnly);
//! Validates kernel before execution
virtual bool validateKernel(
diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp
index 268bb9eebc..3b2e7f5617 100644
--- a/rocclr/runtime/device/pal/palkernel.cpp
+++ b/rocclr/runtime/device/pal/palkernel.cpp
@@ -387,40 +387,49 @@ HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym)
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, reinterpret_cast(&akc_align))) {
return false;
}
- code_ = new Memory(dev(), amd::alignUp(codeSize_, akc_align));
- Resource::MemoryType type = Resource::RemoteUSWC;
- if (flags_.internalKernel_) {
- type = Resource::RemoteUSWC;
- }
- // Initialize kernel ISA code
- if (code_ && code_->create(type)) {
- address cpuCodePtr = static_cast(code_->map(nullptr, Resource::WriteOnly));
- // Copy only amd_kernel_code_t
- memcpy(cpuCodePtr, reinterpret_cast(akc), codeSize_);
- code_->unmap(nullptr);
- }
- else {
- LogError("Failed to allocate ISA code!");
- return false;
+ // Allocate HW resources for the real program only
+ if (!prog().isNull()) {
+ code_ = new Memory(dev(), amd::alignUp(codeSize_, akc_align));
+ Resource::MemoryType type = Resource::RemoteUSWC;
+ if (flags_.internalKernel_) {
+ type = Resource::RemoteUSWC;
+ }
+ // Initialize kernel ISA code
+ if (code_ && code_->create(type)) {
+ address cpuCodePtr = static_cast(code_->map(nullptr, Resource::WriteOnly));
+ // Copy only amd_kernel_code_t
+ memcpy(cpuCodePtr, reinterpret_cast(akc), codeSize_);
+ code_->unmap(nullptr);
+ }
+ else {
+ LogError("Failed to allocate ISA code!");
+ return false;
+ }
}
assert((akc->workitem_private_segment_byte_size & 3) == 0 &&
"Scratch must be DWORD aligned");
workGroupInfo_.scratchRegs_ =
amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint);
-/*
- workGroupInfo_.availableSGPRs_ = dev().gslCtx()->getNumSGPRsAvailable();
- workGroupInfo_.availableVGPRs_ = dev().gslCtx()->getNumVGPRsAvailable();
- workGroupInfo_.preferredSizeMultiple_ = dev().getAttribs().wavefrontSize;
- workGroupInfo_.wavefrontPerSIMD_ = dev().getAttribs().wavefrontSize;
-*/
workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size;
workGroupInfo_.localMemSize_ =
workGroupInfo_.usedLDSSize_ = akc->workgroup_group_segment_byte_size;
workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count;
workGroupInfo_.usedStackSize_ = 0;
workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count;
-
+
+ if (!prog().isNull()) {
+ workGroupInfo_.availableSGPRs_ = dev().properties().gfxipProperties.shaderCore.numAvailableSgprs;
+ workGroupInfo_.availableVGPRs_ = dev().properties().gfxipProperties.shaderCore.numAvailableVgprs;
+ workGroupInfo_.preferredSizeMultiple_ =
+ workGroupInfo_.wavefrontPerSIMD_ = dev().properties().gfxipProperties.shaderCore.wavefrontSize;
+ }
+ else {
+ workGroupInfo_.availableSGPRs_ = 104;
+ workGroupInfo_.availableVGPRs_ = 256;
+ workGroupInfo_.preferredSizeMultiple_ =
+ workGroupInfo_.wavefrontPerSIMD_ = 64;
+ }
return true;
}
@@ -633,10 +642,7 @@ HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize)
}
}
- // Allocate HW resources for the real program only
- if (!prog().isNull()) {
- aqlCreateHWInfo(sym);
- }
+ aqlCreateHWInfo(sym);
// Pull out metadata from the ELF
size_t sizeOfArgList;
diff --git a/rocclr/runtime/device/pal/palkernel.hpp b/rocclr/runtime/device/pal/palkernel.hpp
index f2b6c870b3..73d9dbddff 100644
--- a/rocclr/runtime/device/pal/palkernel.hpp
+++ b/rocclr/runtime/device/pal/palkernel.hpp
@@ -153,7 +153,7 @@ public:
{ return cpuAqlCode_->workgroup_group_segment_byte_size; }
//! Returns pointer on CPU to AQL code info
- const void* cpuAqlCode() const { return cpuAqlCode_; }
+ const amd_kernel_code_t* cpuAqlCode() const { return cpuAqlCode_; }
//! Returns memory object with AQL code
pal::Memory* gpuAqlCode() const { return code_; }
diff --git a/rocclr/runtime/device/pal/palprogram.cpp b/rocclr/runtime/device/pal/palprogram.cpp
index 2384396b0e..383b170552 100644
--- a/rocclr/runtime/device/pal/palprogram.cpp
+++ b/rocclr/runtime/device/pal/palprogram.cpp
@@ -505,7 +505,7 @@ HSAILProgram::linkImpl(amd::option::Options* options)
hsa_agent_t agent;
agent.handle = 1;
if (!isNull() && hsaLoad) {
- executable_ = loader_->CreateExecutable(HSA_PROFILE_BASE, nullptr);
+ executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, NULL);
if (executable_ == nullptr) {
buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n";
return false;
diff --git a/rocclr/runtime/device/pal/palprogram.hpp b/rocclr/runtime/device/pal/palprogram.hpp
index e4f72d7bf3..e78782d01e 100644
--- a/rocclr/runtime/device/pal/palprogram.hpp
+++ b/rocclr/runtime/device/pal/palprogram.hpp
@@ -55,6 +55,11 @@ public:
void* SegmentAddress(amdgpu_hsa_elf_segment_t segment,
hsa_agent_t agent, void* seg, size_t offset) override;
+ void* SegmentHostAddress(amdgpu_hsa_elf_segment_t segment,
+ hsa_agent_t agent, void* seg, size_t offset) override {
+ return nullptr;
+ }
+
bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment,
hsa_agent_t agent, void* seg, size_t size) override { return false; }
diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp
index ae642e1dc7..1cc00ef49f 100644
--- a/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/rocclr/runtime/device/pal/palvirtual.cpp
@@ -43,6 +43,7 @@ VirtualGPU::Queue::Create(
Pal::QueueCreateInfo qCreateInfo = {};
qCreateInfo.engineType = queueType;
qCreateInfo.engineIndex = engineIdx;
+ qCreateInfo.aqlQueue = true;
// Find queue object size
size_t qSize = palDev->GetQueueSize(qCreateInfo, &result);
@@ -181,8 +182,10 @@ VirtualGPU::Queue::flush()
memRef.push_back(it->first);
}
}
+
if (memRef.size() != 0) {
- iDev_->AddGpuMemoryReferences(memRef.size(), &memRef[0], iQueue_);
+ iDev_->AddGpuMemoryReferences(memRef.size(), &memRef[0], iQueue_,
+ Pal::GpuMemoryRefCantTrim);
}
// Submit command buffer to OS
@@ -1982,12 +1985,12 @@ VirtualGPU::submitKernelInternal(
eventBegin(MainEngine);
if (nullptr == scratch) {
iCmd()->CmdDispatchAql(aqlPkt, 0, 0, 0,
- hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo, 0x3ff);
+ hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), 0x3ff);
}
else {
iCmd()->CmdDispatchAql(aqlPkt, scratch->memObj_->vmAddress(),
scratch->size_, scratch->offset_,
- hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo, 0x3ff);
+ hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), 0x3ff);
}
eventEnd(MainEngine, gpuEvent);
diff --git a/rocclr/runtime/device/pal/palvirtual.hpp b/rocclr/runtime/device/pal/palvirtual.hpp
index 1f7ca1307b..8183687345 100644
--- a/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/rocclr/runtime/device/pal/palvirtual.hpp
@@ -69,7 +69,8 @@ public:
void addMemRef(Pal::IGpuMemory* iMem) const
{
- iDev_->AddGpuMemoryReferences(1, &iMem, NULL);
+ iDev_->AddGpuMemoryReferences(1, &iMem, NULL,
+ Pal::GpuMemoryRefCantTrim);
}
void removeMemRef(Pal::IGpuMemory* iMem) const
{