P4 to Git Change 1250684 by gandryey@gera-w8 on 2016/03/23 17:59:05
SWDEV-86035 - Add PAL backend to OpenCL - Update PAL backend to match the latest PAL interfaces Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/Makefile#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/build/Makefile.pal#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/build/Makefile#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/build/Makefile.palbe#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#2 edit
This commit is contained in:
@@ -937,6 +937,8 @@ KernelBlitManager::copyBufferToImage(
|
||||
static const bool CopyRect = false;
|
||||
// Flush DMA for ASYNC copy
|
||||
static const bool FlushDMA = true;
|
||||
size_t imgRowPitch = size[0] * gpuMem(dstMemory).elementSize();
|
||||
size_t imgSlicePitch = imgRowPitch * size[1];
|
||||
|
||||
if (setup_.disableCopyBufferToImage_) {
|
||||
result = DmaBlitManager::copyBufferToImage(
|
||||
@@ -947,7 +949,9 @@ KernelBlitManager::copyBufferToImage(
|
||||
}
|
||||
// Check if buffer is in system memory with direct access
|
||||
else if (gpuMem(srcMemory).isHostMemDirectAccess() &&
|
||||
(rowPitch == 0) && (slicePitch == 0)) {
|
||||
(((rowPitch == 0) && (slicePitch == 0)) ||
|
||||
((rowPitch == imgRowPitch) &&
|
||||
((slicePitch == 0) || (slicePitch == imgSlicePitch))))) {
|
||||
// First attempt to do this all with DMA,
|
||||
// but there are restriciton with older hardware
|
||||
if (dev().settings().imageDMA_) {
|
||||
@@ -1327,6 +1331,8 @@ KernelBlitManager::copyImageToBuffer(
|
||||
static const bool CopyRect = false;
|
||||
// Flush DMA for ASYNC copy
|
||||
static const bool FlushDMA = true;
|
||||
size_t imgRowPitch = size[0] * gpuMem(srcMemory).elementSize();
|
||||
size_t imgSlicePitch = imgRowPitch * size[1];
|
||||
|
||||
if (setup_.disableCopyImageToBuffer_) {
|
||||
result = HostBlitManager::copyImageToBuffer(
|
||||
@@ -1337,7 +1343,9 @@ KernelBlitManager::copyImageToBuffer(
|
||||
}
|
||||
// Check if buffer is in system memory with direct access
|
||||
else if (gpuMem(dstMemory).isHostMemDirectAccess() &&
|
||||
(rowPitch == 0) && (slicePitch == 0)) {
|
||||
(((rowPitch == 0) && (slicePitch == 0)) ||
|
||||
((rowPitch == imgRowPitch) &&
|
||||
((slicePitch == 0) || (slicePitch == imgSlicePitch))))) {
|
||||
// First attempt to do this all with DMA,
|
||||
// but there are restriciton with older hardware
|
||||
if (dev().settings().imageDMA_) {
|
||||
|
||||
@@ -175,10 +175,10 @@ void NullDevice::fillDeviceInfo(
|
||||
|
||||
info_.maxWorkItemDimensions_ = 3;
|
||||
info_.maxComputeUnits_ =
|
||||
palProp.gfxipProperties.engineCore.numOfShaderEngines *
|
||||
palProp.gfxipProperties.engineCore.numOfShaderArrays *
|
||||
palProp.gfxipProperties.engineCore.numOfCUsPerShaderArray;
|
||||
info_.numberOfShaderEngines = palProp.gfxipProperties.engineCore.numOfShaderEngines;
|
||||
palProp.gfxipProperties.shaderCore.numShaderEngines *
|
||||
palProp.gfxipProperties.shaderCore.numShaderArrays *
|
||||
palProp.gfxipProperties.shaderCore.numCusPerShaderArray;
|
||||
info_.numberOfShaderEngines = palProp.gfxipProperties.shaderCore.numShaderEngines;
|
||||
|
||||
// SI parts are scalar. Also, reads don't need to be 128-bits to get peak rates.
|
||||
// For example, float4 is not faster than float as long as all threads fetch the same
|
||||
@@ -417,7 +417,7 @@ void NullDevice::fillDeviceInfo(
|
||||
info_.simdPerCU_ = hwInfo()->simdPerCU_;
|
||||
info_.simdWidth_ = hwInfo()->simdWidth_;
|
||||
info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_;
|
||||
info_.wavefrontWidth_ = palProp.gfxipProperties.engineCore.wavefrontSize;
|
||||
info_.wavefrontWidth_ = palProp.gfxipProperties.shaderCore.wavefrontSize;
|
||||
//info_.globalMemChannels_ = calAttr.memBusWidth / 32;
|
||||
//info_.globalMemChannelBanks_ = calAttr.numMemBanks;
|
||||
info_.globalMemChannelBankWidth_ = hwInfo()->memChannelBankWidth_;
|
||||
@@ -1541,35 +1541,34 @@ Device::createView(amd::Memory& owner, const device::Memory& parent) const
|
||||
|
||||
//! Attempt to bind with external graphics API's device/context
|
||||
bool
|
||||
Device::bindExternalDevice(intptr_t type, void* pDevice, void* pContext, bool validateOnly)
|
||||
Device::bindExternalDevice(uint flags, void* pDevice, void* pContext, bool validateOnly)
|
||||
{
|
||||
assert(pDevice);
|
||||
|
||||
switch (type) {
|
||||
#ifdef _WIN32
|
||||
case CL_CONTEXT_D3D10_DEVICE_KHR:
|
||||
if (flags & amd::Context::Flags::D3D10DeviceKhr) {
|
||||
if (!associateD3D10Device(pDevice)) {
|
||||
LogError("Failed gslD3D10Associate()");
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case CL_CONTEXT_D3D11_DEVICE_KHR:
|
||||
}
|
||||
else if (flags & amd::Context::Flags::D3D11DeviceKhr) {
|
||||
if (!associateD3D11Device(pDevice)) {
|
||||
LogError("Failed gslD3D11Associate()");
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case CL_CONTEXT_ADAPTER_D3D9_KHR:
|
||||
case CL_CONTEXT_ADAPTER_D3D9EX_KHR:
|
||||
}
|
||||
else if (flags & (amd::Context::Flags::D3D9DeviceKhr |
|
||||
amd::Context::Flags::D3D9DeviceEXKhr)) {
|
||||
if (!associateD3D9Device(pDevice)) {
|
||||
LogWarning("D3D9<->OpenCL adapter mismatch or D3D9Associate() failure");
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case CL_CONTEXT_ADAPTER_DXVA_KHR:
|
||||
break;
|
||||
}
|
||||
else if (flags & amd::Context::Flags::D3D9DeviceVAKhr) {
|
||||
}
|
||||
#endif //_WIN32
|
||||
case CL_GL_CONTEXT_KHR:
|
||||
if (flags & amd::Context::Flags::GLDeviceKhr) {
|
||||
// Attempt to associate GSL-OGL
|
||||
if (!glAssociate(pContext, pDevice)) {
|
||||
if (!validateOnly) {
|
||||
@@ -1577,20 +1576,15 @@ Device::bindExternalDevice(intptr_t type, void* pDevice, void* pContext, bool va
|
||||
}
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
LogError("Unknown external device!");
|
||||
return false;
|
||||
break;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
Device::unbindExternalDevice(intptr_t type, void* pDevice, void* pContext, bool validateOnly)
|
||||
Device::unbindExternalDevice(uint flags, void* pDevice, void* pContext, bool validateOnly)
|
||||
{
|
||||
if (type != CL_GL_CONTEXT_KHR) {
|
||||
if ((flags & amd::Context::Flags::GLDeviceKhr) == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1820,8 +1814,8 @@ Device::allocScratch(uint regNum, const VirtualGPU* vgpu)
|
||||
// Calculate the size of the scratch buffer for a queue
|
||||
uint32_t numTotalCUs = info().maxComputeUnits_;
|
||||
uint32_t numMaxWaves =
|
||||
properties().gfxipProperties.engineCore.maxScratchWavesPerCU * numTotalCUs;
|
||||
scratchBuf->size_ = properties().gfxipProperties.engineCore.wavefrontSize *
|
||||
properties().gfxipProperties.shaderCore.maxScratchWavesPerCu * numTotalCUs;
|
||||
scratchBuf->size_ = properties().gfxipProperties.shaderCore.wavefrontSize *
|
||||
scratchBuf->regNum_ * numMaxWaves * sizeof(uint32_t);
|
||||
scratchBuf->size_ = amd::alignUp(scratchBuf->size_, 0xFFFF);
|
||||
scratchBuf->offset_ = offset;
|
||||
@@ -1920,8 +1914,7 @@ Device::fillHwSampler(
|
||||
|
||||
samplerInfo.borderColorType = Pal::BorderColorType::TransparentBlack;
|
||||
|
||||
// Assign defaults
|
||||
samplerInfo.filter = Pal::TexFilter::MagPointMinPointMipBase;
|
||||
samplerInfo.filter.zFilter = Pal::XyFilterPoint;
|
||||
|
||||
samplerInfo.flags.unnormalizedCoords = !(state & amd::Sampler::StateNormalizedCoordsMask);
|
||||
|
||||
@@ -1956,24 +1949,16 @@ Device::fillHwSampler(
|
||||
|
||||
// Program texture filter mode
|
||||
if (state == amd::Sampler::StateFilterLinear) {
|
||||
samplerInfo.filter = Pal::TexFilter::MagLinearMinLinearMipBase;
|
||||
samplerInfo.filter.magnification = Pal::XyFilterLinear;
|
||||
samplerInfo.filter.minification = Pal::XyFilterLinear;
|
||||
samplerInfo.filter.zFilter = Pal::ZFilterLinear;
|
||||
}
|
||||
|
||||
if (mipFilter == CL_FILTER_NEAREST) {
|
||||
if (state == amd::Sampler::StateFilterLinear) {
|
||||
samplerInfo.filter = Pal::TexFilter::MagLinearMinLinearMipPoint;
|
||||
}
|
||||
else {
|
||||
samplerInfo.filter = Pal::TexFilter::MagPointMinPointMipPoint;
|
||||
}
|
||||
samplerInfo.filter.mipFilter = Pal::MipFilterPoint;
|
||||
}
|
||||
else if (mipFilter == CL_FILTER_LINEAR) {
|
||||
if (state == amd::Sampler::StateFilterLinear) {
|
||||
samplerInfo.filter = Pal::TexFilter::MagLinearMinLinearMipLinear;
|
||||
}
|
||||
else {
|
||||
samplerInfo.filter = Pal::TexFilter::MagPointMinPointMipLinear;
|
||||
}
|
||||
samplerInfo.filter.mipFilter = Pal::MipFilterLinear;
|
||||
}
|
||||
|
||||
iDev()->CreateSamplerSrds(1, &samplerInfo, hwState);
|
||||
|
||||
@@ -91,10 +91,10 @@ public:
|
||||
//! Needed for OpenGL objects on CPU device
|
||||
|
||||
virtual bool bindExternalDevice(
|
||||
intptr_t type, void* pDevice, void* pContext, bool validateOnly) { return true; }
|
||||
uint flags, void* pDevice, void* pContext, bool validateOnly) { return true; }
|
||||
|
||||
virtual bool unbindExternalDevice(
|
||||
intptr_t type, void* pDevice, void* pContext, bool validateOnly) { return true; }
|
||||
uint flags, void* pDevice, void* pContext, bool validateOnly) { return true; }
|
||||
|
||||
//! Releases non-blocking map target memory
|
||||
virtual void freeMapTarget(amd::Memory& mem, void* target) {}
|
||||
@@ -369,17 +369,11 @@ public:
|
||||
|
||||
//! Attempt to bind with external graphics API's device/context
|
||||
virtual bool bindExternalDevice(
|
||||
intptr_t type,
|
||||
void* pDevice,
|
||||
void* pContext,
|
||||
bool validateOnly);
|
||||
uint flags, void* pDevice, void* pContext, bool validateOnly);
|
||||
|
||||
//! Attempt to unbind with external graphics API's device/context
|
||||
virtual bool unbindExternalDevice(
|
||||
intptr_t type,
|
||||
void* pDevice,
|
||||
void* pContext,
|
||||
bool validateOnly);
|
||||
uint flags, void* pDevice, void* pContext, bool validateOnly);
|
||||
|
||||
//! Validates kernel before execution
|
||||
virtual bool validateKernel(
|
||||
|
||||
@@ -387,40 +387,49 @@ HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym)
|
||||
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, reinterpret_cast<void*>(&akc_align))) {
|
||||
return false;
|
||||
}
|
||||
code_ = new Memory(dev(), amd::alignUp(codeSize_, akc_align));
|
||||
Resource::MemoryType type = Resource::RemoteUSWC;
|
||||
if (flags_.internalKernel_) {
|
||||
type = Resource::RemoteUSWC;
|
||||
}
|
||||
// Initialize kernel ISA code
|
||||
if (code_ && code_->create(type)) {
|
||||
address cpuCodePtr = static_cast<address>(code_->map(nullptr, Resource::WriteOnly));
|
||||
// Copy only amd_kernel_code_t
|
||||
memcpy(cpuCodePtr, reinterpret_cast<address>(akc), codeSize_);
|
||||
code_->unmap(nullptr);
|
||||
}
|
||||
else {
|
||||
LogError("Failed to allocate ISA code!");
|
||||
return false;
|
||||
// Allocate HW resources for the real program only
|
||||
if (!prog().isNull()) {
|
||||
code_ = new Memory(dev(), amd::alignUp(codeSize_, akc_align));
|
||||
Resource::MemoryType type = Resource::RemoteUSWC;
|
||||
if (flags_.internalKernel_) {
|
||||
type = Resource::RemoteUSWC;
|
||||
}
|
||||
// Initialize kernel ISA code
|
||||
if (code_ && code_->create(type)) {
|
||||
address cpuCodePtr = static_cast<address>(code_->map(nullptr, Resource::WriteOnly));
|
||||
// Copy only amd_kernel_code_t
|
||||
memcpy(cpuCodePtr, reinterpret_cast<address>(akc), codeSize_);
|
||||
code_->unmap(nullptr);
|
||||
}
|
||||
else {
|
||||
LogError("Failed to allocate ISA code!");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
assert((akc->workitem_private_segment_byte_size & 3) == 0 &&
|
||||
"Scratch must be DWORD aligned");
|
||||
workGroupInfo_.scratchRegs_ =
|
||||
amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint);
|
||||
/*
|
||||
workGroupInfo_.availableSGPRs_ = dev().gslCtx()->getNumSGPRsAvailable();
|
||||
workGroupInfo_.availableVGPRs_ = dev().gslCtx()->getNumVGPRsAvailable();
|
||||
workGroupInfo_.preferredSizeMultiple_ = dev().getAttribs().wavefrontSize;
|
||||
workGroupInfo_.wavefrontPerSIMD_ = dev().getAttribs().wavefrontSize;
|
||||
*/
|
||||
workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size;
|
||||
workGroupInfo_.localMemSize_ =
|
||||
workGroupInfo_.usedLDSSize_ = akc->workgroup_group_segment_byte_size;
|
||||
workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count;
|
||||
workGroupInfo_.usedStackSize_ = 0;
|
||||
workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count;
|
||||
|
||||
|
||||
if (!prog().isNull()) {
|
||||
workGroupInfo_.availableSGPRs_ = dev().properties().gfxipProperties.shaderCore.numAvailableSgprs;
|
||||
workGroupInfo_.availableVGPRs_ = dev().properties().gfxipProperties.shaderCore.numAvailableVgprs;
|
||||
workGroupInfo_.preferredSizeMultiple_ =
|
||||
workGroupInfo_.wavefrontPerSIMD_ = dev().properties().gfxipProperties.shaderCore.wavefrontSize;
|
||||
}
|
||||
else {
|
||||
workGroupInfo_.availableSGPRs_ = 104;
|
||||
workGroupInfo_.availableVGPRs_ = 256;
|
||||
workGroupInfo_.preferredSizeMultiple_ =
|
||||
workGroupInfo_.wavefrontPerSIMD_ = 64;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -633,10 +642,7 @@ HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize)
|
||||
}
|
||||
}
|
||||
|
||||
// Allocate HW resources for the real program only
|
||||
if (!prog().isNull()) {
|
||||
aqlCreateHWInfo(sym);
|
||||
}
|
||||
aqlCreateHWInfo(sym);
|
||||
|
||||
// Pull out metadata from the ELF
|
||||
size_t sizeOfArgList;
|
||||
|
||||
@@ -153,7 +153,7 @@ public:
|
||||
{ return cpuAqlCode_->workgroup_group_segment_byte_size; }
|
||||
|
||||
//! Returns pointer on CPU to AQL code info
|
||||
const void* cpuAqlCode() const { return cpuAqlCode_; }
|
||||
const amd_kernel_code_t* cpuAqlCode() const { return cpuAqlCode_; }
|
||||
|
||||
//! Returns memory object with AQL code
|
||||
pal::Memory* gpuAqlCode() const { return code_; }
|
||||
|
||||
@@ -505,7 +505,7 @@ HSAILProgram::linkImpl(amd::option::Options* options)
|
||||
hsa_agent_t agent;
|
||||
agent.handle = 1;
|
||||
if (!isNull() && hsaLoad) {
|
||||
executable_ = loader_->CreateExecutable(HSA_PROFILE_BASE, nullptr);
|
||||
executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, NULL);
|
||||
if (executable_ == nullptr) {
|
||||
buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n";
|
||||
return false;
|
||||
|
||||
@@ -55,6 +55,11 @@ public:
|
||||
void* SegmentAddress(amdgpu_hsa_elf_segment_t segment,
|
||||
hsa_agent_t agent, void* seg, size_t offset) override;
|
||||
|
||||
void* SegmentHostAddress(amdgpu_hsa_elf_segment_t segment,
|
||||
hsa_agent_t agent, void* seg, size_t offset) override {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment,
|
||||
hsa_agent_t agent, void* seg, size_t size) override { return false; }
|
||||
|
||||
|
||||
@@ -43,6 +43,7 @@ VirtualGPU::Queue::Create(
|
||||
Pal::QueueCreateInfo qCreateInfo = {};
|
||||
qCreateInfo.engineType = queueType;
|
||||
qCreateInfo.engineIndex = engineIdx;
|
||||
qCreateInfo.aqlQueue = true;
|
||||
|
||||
// Find queue object size
|
||||
size_t qSize = palDev->GetQueueSize(qCreateInfo, &result);
|
||||
@@ -181,8 +182,10 @@ VirtualGPU::Queue::flush()
|
||||
memRef.push_back(it->first);
|
||||
}
|
||||
}
|
||||
|
||||
if (memRef.size() != 0) {
|
||||
iDev_->AddGpuMemoryReferences(memRef.size(), &memRef[0], iQueue_);
|
||||
iDev_->AddGpuMemoryReferences(memRef.size(), &memRef[0], iQueue_,
|
||||
Pal::GpuMemoryRefCantTrim);
|
||||
}
|
||||
|
||||
// Submit command buffer to OS
|
||||
@@ -1982,12 +1985,12 @@ VirtualGPU::submitKernelInternal(
|
||||
eventBegin(MainEngine);
|
||||
if (nullptr == scratch) {
|
||||
iCmd()->CmdDispatchAql(aqlPkt, 0, 0, 0,
|
||||
hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo, 0x3ff);
|
||||
hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), 0x3ff);
|
||||
}
|
||||
else {
|
||||
iCmd()->CmdDispatchAql(aqlPkt, scratch->memObj_->vmAddress(),
|
||||
scratch->size_, scratch->offset_,
|
||||
hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo, 0x3ff);
|
||||
hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), 0x3ff);
|
||||
}
|
||||
eventEnd(MainEngine, gpuEvent);
|
||||
|
||||
|
||||
@@ -69,7 +69,8 @@ public:
|
||||
|
||||
void addMemRef(Pal::IGpuMemory* iMem) const
|
||||
{
|
||||
iDev_->AddGpuMemoryReferences(1, &iMem, NULL);
|
||||
iDev_->AddGpuMemoryReferences(1, &iMem, NULL,
|
||||
Pal::GpuMemoryRefCantTrim);
|
||||
}
|
||||
void removeMemRef(Pal::IGpuMemory* iMem) const
|
||||
{
|
||||
|
||||
Fai riferimento in un nuovo problema
Block a user