From a340e36035115ced77bc4bc2acb2fb18c261b9c7 Mon Sep 17 00:00:00 2001 From: foreman Date: Wed, 23 Mar 2016 18:09:21 -0400 Subject: [PATCH] P4 to Git Change 1250684 by gandryey@gera-w8 on 2016/03/23 17:59:05 SWDEV-86035 - Add PAL backend to OpenCL - Update PAL backend to match the latest PAL interfaces Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/Makefile#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/build/Makefile.pal#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/build/Makefile#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/build/Makefile.palbe#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#2 edit ... 
//depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#2 edit --- rocclr/runtime/device/pal/palblit.cpp | 12 ++++- rocclr/runtime/device/pal/paldevice.cpp | 67 +++++++++--------------- rocclr/runtime/device/pal/paldevice.hpp | 14 ++--- rocclr/runtime/device/pal/palkernel.cpp | 58 +++++++++++--------- rocclr/runtime/device/pal/palkernel.hpp | 2 +- rocclr/runtime/device/pal/palprogram.cpp | 2 +- rocclr/runtime/device/pal/palprogram.hpp | 5 ++ rocclr/runtime/device/pal/palvirtual.cpp | 9 ++-- rocclr/runtime/device/pal/palvirtual.hpp | 3 +- 9 files changed, 87 insertions(+), 85 deletions(-) diff --git a/rocclr/runtime/device/pal/palblit.cpp b/rocclr/runtime/device/pal/palblit.cpp index 386926d714..ab5f713c8b 100644 --- a/rocclr/runtime/device/pal/palblit.cpp +++ b/rocclr/runtime/device/pal/palblit.cpp @@ -937,6 +937,8 @@ KernelBlitManager::copyBufferToImage( static const bool CopyRect = false; // Flush DMA for ASYNC copy static const bool FlushDMA = true; + size_t imgRowPitch = size[0] * gpuMem(dstMemory).elementSize(); + size_t imgSlicePitch = imgRowPitch * size[1]; if (setup_.disableCopyBufferToImage_) { result = DmaBlitManager::copyBufferToImage( @@ -947,7 +949,9 @@ KernelBlitManager::copyBufferToImage( } // Check if buffer is in system memory with direct access else if (gpuMem(srcMemory).isHostMemDirectAccess() && - (rowPitch == 0) && (slicePitch == 0)) { + (((rowPitch == 0) && (slicePitch == 0)) || + ((rowPitch == imgRowPitch) && + ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) { // First attempt to do this all with DMA, // but there are restriciton with older hardware if (dev().settings().imageDMA_) { @@ -1327,6 +1331,8 @@ KernelBlitManager::copyImageToBuffer( static const bool CopyRect = false; // Flush DMA for ASYNC copy static const bool FlushDMA = true; + size_t imgRowPitch = size[0] * gpuMem(srcMemory).elementSize(); + size_t imgSlicePitch = imgRowPitch * size[1]; if (setup_.disableCopyImageToBuffer_) { result = 
HostBlitManager::copyImageToBuffer( @@ -1337,7 +1343,9 @@ KernelBlitManager::copyImageToBuffer( } // Check if buffer is in system memory with direct access else if (gpuMem(dstMemory).isHostMemDirectAccess() && - (rowPitch == 0) && (slicePitch == 0)) { + (((rowPitch == 0) && (slicePitch == 0)) || + ((rowPitch == imgRowPitch) && + ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) { // First attempt to do this all with DMA, // but there are restriciton with older hardware if (dev().settings().imageDMA_) { diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp index 0937ed086a..f8b754fda3 100644 --- a/rocclr/runtime/device/pal/paldevice.cpp +++ b/rocclr/runtime/device/pal/paldevice.cpp @@ -175,10 +175,10 @@ void NullDevice::fillDeviceInfo( info_.maxWorkItemDimensions_ = 3; info_.maxComputeUnits_ = - palProp.gfxipProperties.engineCore.numOfShaderEngines * - palProp.gfxipProperties.engineCore.numOfShaderArrays * - palProp.gfxipProperties.engineCore.numOfCUsPerShaderArray; - info_.numberOfShaderEngines = palProp.gfxipProperties.engineCore.numOfShaderEngines; + palProp.gfxipProperties.shaderCore.numShaderEngines * + palProp.gfxipProperties.shaderCore.numShaderArrays * + palProp.gfxipProperties.shaderCore.numCusPerShaderArray; + info_.numberOfShaderEngines = palProp.gfxipProperties.shaderCore.numShaderEngines; // SI parts are scalar. Also, reads don't need to be 128-bits to get peak rates. 
// For example, float4 is not faster than float as long as all threads fetch the same @@ -417,7 +417,7 @@ void NullDevice::fillDeviceInfo( info_.simdPerCU_ = hwInfo()->simdPerCU_; info_.simdWidth_ = hwInfo()->simdWidth_; info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_; - info_.wavefrontWidth_ = palProp.gfxipProperties.engineCore.wavefrontSize; + info_.wavefrontWidth_ = palProp.gfxipProperties.shaderCore.wavefrontSize; //info_.globalMemChannels_ = calAttr.memBusWidth / 32; //info_.globalMemChannelBanks_ = calAttr.numMemBanks; info_.globalMemChannelBankWidth_ = hwInfo()->memChannelBankWidth_; @@ -1541,35 +1541,34 @@ Device::createView(amd::Memory& owner, const device::Memory& parent) const //! Attempt to bind with external graphics API's device/context bool -Device::bindExternalDevice(intptr_t type, void* pDevice, void* pContext, bool validateOnly) +Device::bindExternalDevice(uint flags, void* pDevice, void* pContext, bool validateOnly) { assert(pDevice); - switch (type) { #ifdef _WIN32 - case CL_CONTEXT_D3D10_DEVICE_KHR: + if (flags & amd::Context::Flags::D3D10DeviceKhr) { if (!associateD3D10Device(pDevice)) { LogError("Failed gslD3D10Associate()"); return false; } - break; - case CL_CONTEXT_D3D11_DEVICE_KHR: + } + else if (flags & amd::Context::Flags::D3D11DeviceKhr) { if (!associateD3D11Device(pDevice)) { LogError("Failed gslD3D11Associate()"); return false; } - break; - case CL_CONTEXT_ADAPTER_D3D9_KHR: - case CL_CONTEXT_ADAPTER_D3D9EX_KHR: + } + else if (flags & (amd::Context::Flags::D3D9DeviceKhr | + amd::Context::Flags::D3D9DeviceEXKhr)) { if (!associateD3D9Device(pDevice)) { LogWarning("D3D9<->OpenCL adapter mismatch or D3D9Associate() failure"); return false; } - break; - case CL_CONTEXT_ADAPTER_DXVA_KHR: - break; + } + else if (flags & amd::Context::Flags::D3D9DeviceVAKhr) { + } #endif //_WIN32 - case CL_GL_CONTEXT_KHR: + if (flags & amd::Context::Flags::GLDeviceKhr) { // Attempt to associate GSL-OGL if (!glAssociate(pContext, pDevice)) { if 
(!validateOnly) { @@ -1577,20 +1576,15 @@ Device::bindExternalDevice(intptr_t type, void* pDevice, void* pContext, bool va } return false; } - break; - default: - LogError("Unknown external device!"); - return false; - break; } return true; } bool -Device::unbindExternalDevice(intptr_t type, void* pDevice, void* pContext, bool validateOnly) +Device::unbindExternalDevice(uint flags, void* pDevice, void* pContext, bool validateOnly) { - if (type != CL_GL_CONTEXT_KHR) { + if ((flags & amd::Context::Flags::GLDeviceKhr) == 0) { return true; } @@ -1820,8 +1814,8 @@ Device::allocScratch(uint regNum, const VirtualGPU* vgpu) // Calculate the size of the scratch buffer for a queue uint32_t numTotalCUs = info().maxComputeUnits_; uint32_t numMaxWaves = - properties().gfxipProperties.engineCore.maxScratchWavesPerCU * numTotalCUs; - scratchBuf->size_ = properties().gfxipProperties.engineCore.wavefrontSize * + properties().gfxipProperties.shaderCore.maxScratchWavesPerCu * numTotalCUs; + scratchBuf->size_ = properties().gfxipProperties.shaderCore.wavefrontSize * scratchBuf->regNum_ * numMaxWaves * sizeof(uint32_t); scratchBuf->size_ = amd::alignUp(scratchBuf->size_, 0xFFFF); scratchBuf->offset_ = offset; @@ -1920,8 +1914,7 @@ Device::fillHwSampler( samplerInfo.borderColorType = Pal::BorderColorType::TransparentBlack; - // Assign defaults - samplerInfo.filter = Pal::TexFilter::MagPointMinPointMipBase; + samplerInfo.filter.zFilter = Pal::XyFilterPoint; samplerInfo.flags.unnormalizedCoords = !(state & amd::Sampler::StateNormalizedCoordsMask); @@ -1956,24 +1949,16 @@ Device::fillHwSampler( // Program texture filter mode if (state == amd::Sampler::StateFilterLinear) { - samplerInfo.filter = Pal::TexFilter::MagLinearMinLinearMipBase; + samplerInfo.filter.magnification = Pal::XyFilterLinear; + samplerInfo.filter.minification = Pal::XyFilterLinear; + samplerInfo.filter.zFilter = Pal::ZFilterLinear; } if (mipFilter == CL_FILTER_NEAREST) { - if (state == amd::Sampler::StateFilterLinear) { - 
samplerInfo.filter = Pal::TexFilter::MagLinearMinLinearMipPoint; - } - else { - samplerInfo.filter = Pal::TexFilter::MagPointMinPointMipPoint; - } + samplerInfo.filter.mipFilter = Pal::MipFilterPoint; } else if (mipFilter == CL_FILTER_LINEAR) { - if (state == amd::Sampler::StateFilterLinear) { - samplerInfo.filter = Pal::TexFilter::MagLinearMinLinearMipLinear; - } - else { - samplerInfo.filter = Pal::TexFilter::MagPointMinPointMipLinear; - } + samplerInfo.filter.mipFilter = Pal::MipFilterLinear; } iDev()->CreateSamplerSrds(1, &samplerInfo, hwState); diff --git a/rocclr/runtime/device/pal/paldevice.hpp b/rocclr/runtime/device/pal/paldevice.hpp index 8fe3347d46..7439682df1 100644 --- a/rocclr/runtime/device/pal/paldevice.hpp +++ b/rocclr/runtime/device/pal/paldevice.hpp @@ -91,10 +91,10 @@ public: //! Needed for OpenGL objects on CPU device virtual bool bindExternalDevice( - intptr_t type, void* pDevice, void* pContext, bool validateOnly) { return true; } + uint flags, void* pDevice, void* pContext, bool validateOnly) { return true; } virtual bool unbindExternalDevice( - intptr_t type, void* pDevice, void* pContext, bool validateOnly) { return true; } + uint flags, void* pDevice, void* pContext, bool validateOnly) { return true; } //! Releases non-blocking map target memory virtual void freeMapTarget(amd::Memory& mem, void* target) {} @@ -369,17 +369,11 @@ public: //! Attempt to bind with external graphics API's device/context virtual bool bindExternalDevice( - intptr_t type, - void* pDevice, - void* pContext, - bool validateOnly); + uint flags, void* pDevice, void* pContext, bool validateOnly); //! Attempt to unbind with external graphics API's device/context virtual bool unbindExternalDevice( - intptr_t type, - void* pDevice, - void* pContext, - bool validateOnly); + uint flags, void* pDevice, void* pContext, bool validateOnly); //! 
Validates kernel before execution virtual bool validateKernel( diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp index 268bb9eebc..3b2e7f5617 100644 --- a/rocclr/runtime/device/pal/palkernel.cpp +++ b/rocclr/runtime/device/pal/palkernel.cpp @@ -387,40 +387,49 @@ HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym) if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, reinterpret_cast(&akc_align))) { return false; } - code_ = new Memory(dev(), amd::alignUp(codeSize_, akc_align)); - Resource::MemoryType type = Resource::RemoteUSWC; - if (flags_.internalKernel_) { - type = Resource::RemoteUSWC; - } - // Initialize kernel ISA code - if (code_ && code_->create(type)) { - address cpuCodePtr = static_cast
(code_->map(nullptr, Resource::WriteOnly)); - // Copy only amd_kernel_code_t - memcpy(cpuCodePtr, reinterpret_cast
(akc), codeSize_); - code_->unmap(nullptr); - } - else { - LogError("Failed to allocate ISA code!"); - return false; + // Allocate HW resources for the real program only + if (!prog().isNull()) { + code_ = new Memory(dev(), amd::alignUp(codeSize_, akc_align)); + Resource::MemoryType type = Resource::RemoteUSWC; + if (flags_.internalKernel_) { + type = Resource::RemoteUSWC; + } + // Initialize kernel ISA code + if (code_ && code_->create(type)) { + address cpuCodePtr = static_cast
(code_->map(nullptr, Resource::WriteOnly)); + // Copy only amd_kernel_code_t + memcpy(cpuCodePtr, reinterpret_cast
(akc), codeSize_); + code_->unmap(nullptr); + } + else { + LogError("Failed to allocate ISA code!"); + return false; + } } assert((akc->workitem_private_segment_byte_size & 3) == 0 && "Scratch must be DWORD aligned"); workGroupInfo_.scratchRegs_ = amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint); -/* - workGroupInfo_.availableSGPRs_ = dev().gslCtx()->getNumSGPRsAvailable(); - workGroupInfo_.availableVGPRs_ = dev().gslCtx()->getNumVGPRsAvailable(); - workGroupInfo_.preferredSizeMultiple_ = dev().getAttribs().wavefrontSize; - workGroupInfo_.wavefrontPerSIMD_ = dev().getAttribs().wavefrontSize; -*/ workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size; workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = akc->workgroup_group_segment_byte_size; workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count; workGroupInfo_.usedStackSize_ = 0; workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count; - + + if (!prog().isNull()) { + workGroupInfo_.availableSGPRs_ = dev().properties().gfxipProperties.shaderCore.numAvailableSgprs; + workGroupInfo_.availableVGPRs_ = dev().properties().gfxipProperties.shaderCore.numAvailableVgprs; + workGroupInfo_.preferredSizeMultiple_ = + workGroupInfo_.wavefrontPerSIMD_ = dev().properties().gfxipProperties.shaderCore.wavefrontSize; + } + else { + workGroupInfo_.availableSGPRs_ = 104; + workGroupInfo_.availableVGPRs_ = 256; + workGroupInfo_.preferredSizeMultiple_ = + workGroupInfo_.wavefrontPerSIMD_ = 64; + } return true; } @@ -633,10 +642,7 @@ HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize) } } - // Allocate HW resources for the real program only - if (!prog().isNull()) { - aqlCreateHWInfo(sym); - } + aqlCreateHWInfo(sym); // Pull out metadata from the ELF size_t sizeOfArgList; diff --git a/rocclr/runtime/device/pal/palkernel.hpp b/rocclr/runtime/device/pal/palkernel.hpp index f2b6c870b3..73d9dbddff 100644 --- a/rocclr/runtime/device/pal/palkernel.hpp +++ 
b/rocclr/runtime/device/pal/palkernel.hpp @@ -153,7 +153,7 @@ public: { return cpuAqlCode_->workgroup_group_segment_byte_size; } //! Returns pointer on CPU to AQL code info - const void* cpuAqlCode() const { return cpuAqlCode_; } + const amd_kernel_code_t* cpuAqlCode() const { return cpuAqlCode_; } //! Returns memory object with AQL code pal::Memory* gpuAqlCode() const { return code_; } diff --git a/rocclr/runtime/device/pal/palprogram.cpp b/rocclr/runtime/device/pal/palprogram.cpp index 2384396b0e..383b170552 100644 --- a/rocclr/runtime/device/pal/palprogram.cpp +++ b/rocclr/runtime/device/pal/palprogram.cpp @@ -505,7 +505,7 @@ HSAILProgram::linkImpl(amd::option::Options* options) hsa_agent_t agent; agent.handle = 1; if (!isNull() && hsaLoad) { - executable_ = loader_->CreateExecutable(HSA_PROFILE_BASE, nullptr); + executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, NULL); if (executable_ == nullptr) { buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n"; return false; diff --git a/rocclr/runtime/device/pal/palprogram.hpp b/rocclr/runtime/device/pal/palprogram.hpp index e4f72d7bf3..e78782d01e 100644 --- a/rocclr/runtime/device/pal/palprogram.hpp +++ b/rocclr/runtime/device/pal/palprogram.hpp @@ -55,6 +55,11 @@ public: void* SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) override; + void* SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, + hsa_agent_t agent, void* seg, size_t offset) override { + return nullptr; + } + bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) override { return false; } diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp index ae642e1dc7..1cc00ef49f 100644 --- a/rocclr/runtime/device/pal/palvirtual.cpp +++ b/rocclr/runtime/device/pal/palvirtual.cpp @@ -43,6 +43,7 @@ VirtualGPU::Queue::Create( Pal::QueueCreateInfo qCreateInfo = {}; qCreateInfo.engineType = queueType; 
qCreateInfo.engineIndex = engineIdx; + qCreateInfo.aqlQueue = true; // Find queue object size size_t qSize = palDev->GetQueueSize(qCreateInfo, &result); @@ -181,8 +182,10 @@ VirtualGPU::Queue::flush() memRef.push_back(it->first); } } + if (memRef.size() != 0) { - iDev_->AddGpuMemoryReferences(memRef.size(), &memRef[0], iQueue_); + iDev_->AddGpuMemoryReferences(memRef.size(), &memRef[0], iQueue_, + Pal::GpuMemoryRefCantTrim); } // Submit command buffer to OS @@ -1982,12 +1985,12 @@ VirtualGPU::submitKernelInternal( eventBegin(MainEngine); if (nullptr == scratch) { iCmd()->CmdDispatchAql(aqlPkt, 0, 0, 0, - hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo, 0x3ff); + hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), 0x3ff); } else { iCmd()->CmdDispatchAql(aqlPkt, scratch->memObj_->vmAddress(), scratch->size_, scratch->offset_, - hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo, 0x3ff); + hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), 0x3ff); } eventEnd(MainEngine, gpuEvent); diff --git a/rocclr/runtime/device/pal/palvirtual.hpp b/rocclr/runtime/device/pal/palvirtual.hpp index 1f7ca1307b..8183687345 100644 --- a/rocclr/runtime/device/pal/palvirtual.hpp +++ b/rocclr/runtime/device/pal/palvirtual.hpp @@ -69,7 +69,8 @@ public: void addMemRef(Pal::IGpuMemory* iMem) const { - iDev_->AddGpuMemoryReferences(1, &iMem, NULL); + iDev_->AddGpuMemoryReferences(1, &iMem, NULL, + Pal::GpuMemoryRefCantTrim); } void removeMemRef(Pal::IGpuMemory* iMem) const {