From 5ea54a902aa22e513dc6697352ef3761ee5b9b2d Mon Sep 17 00:00:00 2001 From: foreman Date: Wed, 8 May 2019 19:22:02 -0400 Subject: [PATCH] P4 to Git Change 1780358 by gandryey@gera-win10 on 2019/05/08 18:46:22 SWDEV-79445 - OCL generic changes and code clean-up - Run google autoformat over the PAL backend. It will allow to enable autoformat in VS for the future changes. - No functional changes Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palappprofile.cpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palappprofile.hpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#29 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.hpp#8 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.cpp#12 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.hpp#10 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcounters.cpp#20 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcounters.hpp#10 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugger.hpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugmanager.cpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldefs.hpp#52 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#133 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#37 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d10.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d11.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d9.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevicegl.cpp#11 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#13 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.hpp#9 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#78 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#28 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#24 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.hpp#11 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprintf.hpp#6 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#93 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#38 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#73 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#27 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#79 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#22 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltimestamp.hpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#132 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#60 edit [ROCm/clr commit: 699a12bfa29aaecd187031e974b015c5176a356b] --- .../runtime/device/pal/palappprofile.cpp | 5 +- .../runtime/device/pal/palappprofile.hpp | 2 +- .../clr/rocclr/runtime/device/pal/palblit.cpp | 106 +- .../clr/rocclr/runtime/device/pal/palblit.hpp | 9 +- .../rocclr/runtime/device/pal/palconstbuf.cpp | 41 +- .../rocclr/runtime/device/pal/palconstbuf.hpp | 81 +- .../rocclr/runtime/device/pal/palcounters.cpp | 12 +- .../rocclr/runtime/device/pal/palcounters.hpp | 3 +- .../rocclr/runtime/device/pal/paldebugger.hpp | 8 +- .../runtime/device/pal/paldebugmanager.cpp | 2 +- .../clr/rocclr/runtime/device/pal/paldefs.hpp | 174 +-- .../rocclr/runtime/device/pal/paldevice.cpp | 410 ++++--- .../rocclr/runtime/device/pal/paldevice.hpp | 98 +- .../runtime/device/pal/paldeviced3d10.cpp | 18 +- .../runtime/device/pal/paldeviced3d11.cpp | 18 +- .../runtime/device/pal/paldeviced3d9.cpp | 18 +- .../rocclr/runtime/device/pal/paldevicegl.cpp | 1037 +++++++++-------- .../rocclr/runtime/device/pal/palgpuopen.cpp | 296 +++-- .../rocclr/runtime/device/pal/palgpuopen.hpp | 310 +++-- .../rocclr/runtime/device/pal/palkernel.cpp | 98 +- .../rocclr/runtime/device/pal/palkernel.hpp | 36 +- .../rocclr/runtime/device/pal/palmemory.cpp | 30 +- .../rocclr/runtime/device/pal/palmemory.hpp | 32 +- .../rocclr/runtime/device/pal/palprintf.hpp | 35 +- .../rocclr/runtime/device/pal/palprogram.cpp | 69 +- .../rocclr/runtime/device/pal/palprogram.hpp | 59 +- .../rocclr/runtime/device/pal/palresource.cpp | 348 +++--- .../rocclr/runtime/device/pal/palresource.hpp | 140 ++- .../rocclr/runtime/device/pal/palsettings.cpp | 12 +- .../rocclr/runtime/device/pal/palsettings.hpp | 107 +- .../runtime/device/pal/paltimestamp.hpp | 5 +- .../rocclr/runtime/device/pal/palvirtual.cpp | 443 ++++--- .../rocclr/runtime/device/pal/palvirtual.hpp | 203 ++-- 33 files changed, 2119 insertions(+), 2146 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/pal/palappprofile.cpp b/projects/clr/rocclr/runtime/device/pal/palappprofile.cpp index e703204719..8f804911a7 100644 --- a/projects/clr/rocclr/runtime/device/pal/palappprofile.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palappprofile.cpp @@ -11,8 +11,9 @@ namespace pal { AppProfile::AppProfile() : amd::AppProfile(), enableHighPerformanceState_(true), reportAsOCL12Device_(false) { - propertyDataMap_.insert({"HighPerfState", PropertyData(DataType_Boolean, &enableHighPerformanceState_)}); + propertyDataMap_.insert( + {"HighPerfState", PropertyData(DataType_Boolean, &enableHighPerformanceState_)}); propertyDataMap_.insert({"OCL12Device", PropertyData(DataType_Boolean, &reportAsOCL12Device_)}); } -} +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palappprofile.hpp b/projects/clr/rocclr/runtime/device/pal/palappprofile.hpp index a337517cd6..3b7f3e441d 100644 --- a/projects/clr/rocclr/runtime/device/pal/palappprofile.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palappprofile.hpp @@ -20,4 +20,4 @@ class AppProfile : public amd::AppProfile { bool enableHighPerformanceState_; bool reportAsOCL12Device_; }; -} +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.cpp b/projects/clr/rocclr/runtime/device/pal/palblit.cpp index 524979ee97..4370f46317 100644 --- a/projects/clr/rocclr/runtime/device/pal/palblit.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palblit.cpp @@ -280,8 +280,8 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M amd::Coord3D copySize(tmpSize, 0, 0); // Copy data into the temporary buffer, using CPU - if (!xferBuf.hostWrite(&gpu(), reinterpret_cast(srcHost) + offset, - src, copySize, flags)) { + if (!xferBuf.hostWrite(&gpu(), reinterpret_cast(srcHost) + offset, src, copySize, + flags)) { return false; } @@ -296,7 +296,7 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M srcOffset += tmpSize; if ((srcOffset + tmpSize) > gpu().xferWrite().MaxSize()) { srcOffset = 0; - flags = 0; + flags = 0; } else { flags = Resource::NoWait; } @@ -310,7 +310,7 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, // Use host copy if memory has direct access or it's persistent if (setup_.disableWriteBuffer_ || (gpuMem(dstMemory).isHostMemDirectAccess() && - (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) || + (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) || gpuMem(dstMemory).isPersistentDirectMap()) { return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire); } else { @@ -335,7 +335,7 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, // Copy memory, using pinning while (dstSize > 0) { size_t tmpSize; - // If it's the first iterarion, then readjust the copy size + // If it's the first iterarion, then readjust the copy size // to include alignment if (first) { pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment); @@ -398,7 +398,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem // Use host copy if memory has direct access or it's persistent if (setup_.disableWriteBufferRect_ || (dstMemory.isHostMemDirectAccess() && - (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) || + (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) || gpuMem(dstMemory).isPersistentDirectMap()) { return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire); } else { @@ -586,8 +586,8 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory entire, rowPitch, slicePitch); } else { // Use PAL path for a transfer - result = gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin, - size, gpuMem(dstMemory)); + result = + gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory)); // Check if a HostBlit transfer is required if (completeOperation_ && !result) { @@ -947,8 +947,8 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo void* param = kernel->parameters().values() + desc.offset_; assert((desc.type_ == T_POINTER || value != NULL || - (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL)) && - "not a valid local mem arg"); + (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL)) && + "not a valid local mem arg"); uint32_t uint32_value = 0; uint64_t uint64_value = 0; @@ -957,14 +957,15 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo if (desc.type_ == T_POINTER && (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) { if ((value == NULL) || (static_cast(value) == NULL)) { reinterpret_cast(kernel->parameters().values() + - kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr; + kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = + nullptr; } else { // convert cl_mem to amd::Memory*, return false if invalid. - LP64_SWITCH(uint32_value, uint64_value) = static_cast(( - *static_cast(value))->virtualAddress()); + LP64_SWITCH(uint32_value, uint64_value) = + static_cast((*static_cast(value))->virtualAddress()); reinterpret_cast(kernel->parameters().values() + - kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = - *static_cast(value); + kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = + *static_cast(value); // Note: Special case for image SRD, which is 64 bit always if (LP64_SWITCH(true, false) && (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject)) { @@ -1018,8 +1019,8 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory, bool releaseView = false; bool result = false; amd::Image::Format newFormat(gpuMem(dstMemory).desc().format_); - bool swapLayer = (dstView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && - dev().settings().gfx10Plus_; + bool swapLayer = + (dstView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && dev().settings().gfx10Plus_; // Find unsupported formats for (uint i = 0; i < RejectedFormatDataTotal; ++i) { @@ -1078,10 +1079,10 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory, // Swap the Y and Z components, apparently gfx10 HW expects // layer in Z if (swapLayer) { - globalWorkSize[2] = globalWorkSize[1]; - globalWorkSize[1] = 1; - localWorkSize[2] = localWorkSize[1]; - localWorkSize[1] = 1; + globalWorkSize[2] = globalWorkSize[1]; + globalWorkSize[1] = 1; + localWorkSize[2] = localWorkSize[1]; + localWorkSize[1] = 1; } } else { globalWorkSize[0] = amd::alignUp(size[0], 8); @@ -1114,10 +1115,10 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory, cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0}; if (swapLayer) { - dstOrg[2] = dstOrg[1]; - dstOrg[1] = 0; - copySize[2] = copySize[1]; - copySize[1] = 1; + dstOrg[2] = dstOrg[1]; + dstOrg[1] = 0; + copySize[2] = copySize[1]; + copySize[1] = 1; } setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg); @@ -1338,8 +1339,8 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory, bool releaseView = false; bool result = false; amd::Image::Format newFormat(gpuMem(srcMemory).desc().format_); - bool swapLayer = (srcView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && - dev().settings().gfx10Plus_; + bool swapLayer = + (srcView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && dev().settings().gfx10Plus_; // Find unsupported formats for (uint i = 0; i < RejectedFormatDataTotal; ++i) { @@ -1398,10 +1399,10 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory, // Swap the Y and Z components, apparently gfx10 HW expects // layer in Z if (swapLayer) { - globalWorkSize[2] = globalWorkSize[1]; - globalWorkSize[1] = 1; - localWorkSize[2] = localWorkSize[1]; - localWorkSize[1] = 1; + globalWorkSize[2] = globalWorkSize[1]; + globalWorkSize[1] = 1; + localWorkSize[2] = localWorkSize[1]; + localWorkSize[1] = 1; } } else { globalWorkSize[0] = amd::alignUp(size[0], 8); @@ -1426,10 +1427,10 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory, cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1], (cl_int)srcOrigin[2], 0}; cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0}; if (swapLayer) { - srcOrg[2] = srcOrg[1]; - srcOrg[1] = 0; - copySize[2] = copySize[1]; - copySize[1] = 1; + srcOrg[2] = srcOrg[1]; + srcOrg[1] = 0; + copySize[2] = copySize[1]; + copySize[1] = 1; } setArgument(kernels_[blitType], 4, sizeof(srcOrg), srcOrg); uint32_t memFmtSize = gpuMem(srcMemory).elementSize(); @@ -1570,7 +1571,7 @@ bool KernelBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dst // Program source origin cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1], (cl_int)srcOrigin[2], 0}; if ((gpuMem(srcMemory).desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && - dev().settings().gfx10Plus_) { + dev().settings().gfx10Plus_) { srcOrg[3] = 1; } setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg); @@ -1578,7 +1579,7 @@ bool KernelBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dst // Program destinaiton origin cl_int dstOrg[4] = {(cl_int)dstOrigin[0], (cl_int)dstOrigin[1], (cl_int)dstOrigin[2], 0}; if ((gpuMem(dstMemory).desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && - dev().settings().gfx10Plus_) { + dev().settings().gfx10Plus_) { dstOrg[3] = 1; } setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg); @@ -1700,16 +1701,15 @@ bool KernelBlitManager::writeImage(const void* srcHost, device::Memory& dstMemor amdMemory = pinHostMemory(srcHost, pinSize, partial); if (amdMemory == nullptr) { // Force SW copy - result = HostBlitManager::writeImage(srcHost, dstMemory, - origin, size, rowPitch, slicePitch, entire); + result = HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, + entire); synchronize(); return result; } // Get device memory for this virtual device srcMemory = dev().getGpuMemory(amdMemory); pinned = true; - } - else { + } else { srcMemory = &gpu().xferWrite().Acquire(pinSize); srcMemory->hostWrite(&gpu(), srcHost, 0, pinSize, Resource::NoWait); pinned = false; @@ -1951,7 +1951,7 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo // Use host copy if memory has direct access or it's persistent if (setup_.disableWriteBuffer_ || (gpuMem(dstMemory).isHostMemDirectAccess() && - (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) || + (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) || (gpuMem(dstMemory).memoryType() == Resource::Persistent)) { result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire); synchronize(); @@ -2002,7 +2002,7 @@ bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dst // Use host copy if memory has direct access or it's persistent if (setup_.disableWriteBufferRect_ || (gpuMem(dstMemory).isHostMemDirectAccess() && - (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) || + (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) || gpuMem(dstMemory).isPersistentDirectMap()) { result = HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire); synchronize(); @@ -2206,8 +2206,8 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern, size_t localWorkSize[3]; Memory* memView = &gpuMem(memory); amd::Image::Format newFormat(gpuMem(memory).owner()->asImage()->getImageFormat()); - bool swapLayer = (memView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && - dev().settings().gfx10Plus_; + bool swapLayer = + (memView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && dev().settings().gfx10Plus_; // Program the kernels workload depending on the fill dimensions fillType = FillImage; @@ -2274,10 +2274,10 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern, // Swap the Y and Z components, apparently gfx10 HW expects // layer in Z if (swapLayer) { - globalWorkSize[2] = globalWorkSize[1]; - globalWorkSize[1] = 1; - localWorkSize[2] = localWorkSize[1]; - localWorkSize[1] = 1; + globalWorkSize[2] = globalWorkSize[1]; + globalWorkSize[1] = 1; + localWorkSize[2] = localWorkSize[1]; + localWorkSize[1] = 1; } } else { globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 8); @@ -2297,10 +2297,10 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern, cl_int fillOrigin[4] = {(cl_int)origin[0], (cl_int)origin[1], (cl_int)origin[2], 0}; cl_int fillSize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0}; if (swapLayer) { - fillOrigin[2] = fillOrigin[1]; - fillOrigin[1] = 0; - fillSize[2] = fillSize[1]; - fillSize[1] = 1; + fillOrigin[2] = fillOrigin[1]; + fillOrigin[1] = 0; + fillSize[2] = fillSize[1]; + fillSize[1] = 1; } setArgument(kernels_[fillType], 4, sizeof(fillOrigin), fillOrigin); setArgument(kernels_[fillType], 5, sizeof(fillSize), fillSize); diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.hpp b/projects/clr/rocclr/runtime/device/pal/palblit.hpp index fe52ac2a59..4c9769d678 100644 --- a/projects/clr/rocclr/runtime/device/pal/palblit.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palblit.hpp @@ -27,7 +27,7 @@ class DmaBlitManager : public device::HostBlitManager { //! Constructor DmaBlitManager(VirtualGPU& gpu, //!< Virtual GPU to be used for blits Setup setup = Setup() //!< Specifies HW accelerated blits - ); + ); //! Destructor virtual ~DmaBlitManager() {} @@ -211,7 +211,7 @@ class KernelBlitManager : public DmaBlitManager { //! Constructor KernelBlitManager(VirtualGPU& gpu, //!< Virtual GPU to be used for blits Setup setup = Setup() //!< Specifies HW accelerated blits - ); + ); //! Destructor virtual ~KernelBlitManager(); @@ -382,7 +382,7 @@ class KernelBlitManager : public DmaBlitManager { //! Creates a program for all blit operations bool createProgram(Device& device //!< Device object - ); + ); //! Creates a view memory object Memory* createView(const Memory& parent, //!< Parent memory object @@ -409,4 +409,5 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = { "fillImage", "scheduler", }; -/*@}*/} // namespace pal +/*@}*/ // namespace pal +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp index ccd6dfb583..3bf5be1fd0 100644 --- a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp @@ -11,12 +11,12 @@ namespace pal { // ================================================================================================ ManagedBuffer::ManagedBuffer(VirtualGPU& gpu, uint32_t size) - : gpu_(gpu) - , pool_(MaxNumberOfBuffers) - , activeBuffer_(0) - , size_(size) - , wrtOffset_(0) - , wrtAddress_(nullptr) {} + : gpu_(gpu), + pool_(MaxNumberOfBuffers), + activeBuffer_(0), + size_(size), + wrtOffset_(0), + wrtAddress_(nullptr) {} // ================================================================================================ void ManagedBuffer::release() { @@ -40,8 +40,8 @@ bool ManagedBuffer::create(Resource::MemoryType type) { pool_[i].buf->memRef()->gpu_ = &gpu_; void* wrtAddress = pool_[i].buf->map(&gpu_); if (wrtAddress == nullptr) { - LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_); - return false; + LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_); + return false; } // Make sure OCL touches every buffer in the queue to avoid delays on the first submit uint dummy = 0; @@ -94,15 +94,10 @@ void ManagedBuffer::pinGpuEvent() { // ================================================================================================ ConstantBuffer::ConstantBuffer(ManagedBuffer& mbuf, uint32_t size) - : mbuf_(mbuf) - , sys_mem_copy_(nullptr) - , size_(size) -{} + : mbuf_(mbuf), sys_mem_copy_(nullptr), size_(size) {} // ================================================================================================ -ConstantBuffer::~ConstantBuffer() { - amd::AlignedMemory::deallocate(sys_mem_copy_); -} +ConstantBuffer::~ConstantBuffer() { amd::AlignedMemory::deallocate(sys_mem_copy_); } // ================================================================================================ bool ConstantBuffer::Create() { @@ -118,8 +113,8 @@ bool ConstantBuffer::Create() { // ================================================================================================ uint64_t ConstantBuffer::UploadDataToHw(uint32_t size) const { - uint64_t vm_address; - address cpu_address = mbuf_.reserve(size, &vm_address); + uint64_t vm_address; + address cpu_address = mbuf_.reserve(size, &vm_address); // Update memory with new CB data memcpy(cpu_address, sys_mem_copy_, size); return vm_address; @@ -127,8 +122,8 @@ uint64_t ConstantBuffer::UploadDataToHw(uint32_t size) const { // ================================================================================================ uint64_t ConstantBuffer::UploadDataToHw(const void* sysmem, uint32_t size) const { - uint64_t vm_address; - address cpu_address = mbuf_.reserve(size, &vm_address); + uint64_t vm_address; + address cpu_address = mbuf_.reserve(size, &vm_address); // Update memory with new CB data memcpy(cpu_address, sysmem, size); return vm_address; @@ -136,9 +131,7 @@ uint64_t ConstantBuffer::UploadDataToHw(const void* sysmem, uint32_t size) const // ================================================================================================ XferBuffer::XferBuffer(const Device& device, ManagedBuffer& mbuf, uint32_t size) - : buffer_view_(device, size) - , mbuf_(mbuf) - , size_(size) { + : buffer_view_(device, size), mbuf_(mbuf), size_(size) { // Create a view for access Resource::ViewParams params = {}; params.gpu_ = &mbuf_.gpu(); @@ -151,9 +144,9 @@ XferBuffer::XferBuffer(const Device& device, ManagedBuffer& mbuf, uint32_t size) // ================================================================================================ Memory& XferBuffer::Acquire(uint32_t size) { - uint64_t vm_address; + uint64_t vm_address; // Reserve space in the managed buffer - address cpu_address = mbuf_.reserve(size, &vm_address); + address cpu_address = mbuf_.reserve(size, &vm_address); // Update a view for access buffer_view_.updateView(mbuf_.activeMemory(), vm_address - mbuf_.vmAddress(), size); return buffer_view_; diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp index c1853b0537..5a2279eec5 100644 --- a/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp @@ -12,9 +12,9 @@ namespace pal { class ManagedBuffer : public amd::EmbeddedObject { public: //! Constructor for the ConstBuffer class - ManagedBuffer(VirtualGPU& gpu, //!< Virtual GPU device object - uint32_t size //!< size of the managed buffers in bytes - ); + ManagedBuffer(VirtualGPU& gpu, //!< Virtual GPU device object + uint32_t size //!< size of the managed buffers in bytes + ); ~ManagedBuffer() {} //! Creates the managed buffers @@ -50,8 +50,8 @@ class ManagedBuffer : public amd::EmbeddedObject { private: struct TimeStampedBuffer { - Memory* buf; - GpuEvent events[AllEngines]; + Memory* buf; + GpuEvent events[AllEngines]; }; //! The maximum number of the managed buffers @@ -63,21 +63,21 @@ class ManagedBuffer : public amd::EmbeddedObject { //! Disable operator= ManagedBuffer& operator=(const ManagedBuffer&) = delete; - VirtualGPU& gpu_; //!< Virtual GPU object - std::vector pool_; //!< Buffers for management - uint32_t activeBuffer_; //!< Current active buffer - uint32_t size_; //!< Constant buffer size - uint32_t wrtOffset_; //!< Current write offset - address wrtAddress_; //!< Write address in CB + VirtualGPU& gpu_; //!< Virtual GPU object + std::vector pool_; //!< Buffers for management + uint32_t activeBuffer_; //!< Current active buffer + uint32_t size_; //!< Constant buffer size + uint32_t wrtOffset_; //!< Current write offset + address wrtAddress_; //!< Write address in CB }; //! Constant buffer class ConstantBuffer : public amd::HeapObject { -public: + public: //! Constructor for the ConstBuffer class ConstantBuffer(ManagedBuffer& mbuf, //!< Managed buffer - uint32_t size //!< Max size of the constant buffer - ); + uint32_t size //!< Max size of the constant buffer + ); //! Destructor for the ConstBuffer class ~ConstantBuffer(); @@ -86,18 +86,18 @@ public: bool Create(); /*! \brief Uploads current constant buffer data from sysMemCopy_ to HW - * - * \return GPU address for the uploaded data - */ + * + * \return GPU address for the uploaded data + */ uint64_t UploadDataToHw(uint32_t size //!< real data size for upload ) const; /*! \brief Uploads current constant buffer data from sysMemCopy_ to HW - * - * \return GPU address for the uploaded data - */ + * + * \return GPU address for the uploaded data + */ uint64_t UploadDataToHw(const void* sysmem, //!< Pointer to the data for upload - uint32_t size //!< Real data size for upload + uint32_t size //!< Real data size for upload ) const; //! Returns a pointer to the system memory copy for CB @@ -106,52 +106,55 @@ public: //! Returns active GPU buffer Memory* ActiveMemory() const { return mbuf_.activeMemory(); } -private: + private: //! Disable copy constructor ConstantBuffer(const ConstantBuffer&) = delete; //! Disable operator= ConstantBuffer& operator=(const ConstantBuffer&) = delete; - ManagedBuffer& mbuf_; //!< Managed buffer on GPU - address sys_mem_copy_; //!< System memory copy - uint32_t size_; //!< Constant buffer size + ManagedBuffer& mbuf_; //!< Managed buffer on GPU + address sys_mem_copy_; //!< System memory copy + uint32_t size_; //!< Constant buffer size }; //! Staging buffer class XferBuffer : public amd::EmbeddedObject { -public: + public: //! Constructor for the ConstBuffer class - XferBuffer(const Device& device, //!< Active GPU device + XferBuffer(const Device& device, //!< Active GPU device ManagedBuffer& mbuf, //!< Managed buffer - uint32_t size //!< Maximum size of the transfer buffer + uint32_t size //!< Maximum size of the transfer buffer ); //! Destructor for the ConstBuffer class ~XferBuffer() {} /*! \brief Acquires free memory from the managed buffer - * - * \return GPU memory object associated with free memory - */ - Memory& Acquire(uint32_t size //!< data size for transfers - ); + * + * \return GPU memory object associated with free memory + */ + Memory& Acquire(uint32_t size //!< data size for transfers + ); //! Releases memory object used in the staging transfer void Release(Memory& mem //!< Memory object for release - ) { buffer_view_.updateView(nullptr, 0, 0); } + ) { + buffer_view_.updateView(nullptr, 0, 0); + } size_t MaxSize() const { return static_cast(size_); } -private: + private: //! Disable copy constructor XferBuffer(const XferBuffer&) = delete; //! Disable operator= XferBuffer& operator=(const XferBuffer&) = delete; - Memory buffer_view_; //!< Buffer view returned in the acquire - ManagedBuffer& mbuf_; //!< Managed buffer on GPU - uint32_t size_; //!< Mx staging buffer size + Memory buffer_view_; //!< Buffer view returned in the acquire + ManagedBuffer& mbuf_; //!< Managed buffer on GPU + uint32_t size_; //!< Mx staging buffer size }; -/*@}*/} // namespace pal +/*@}*/ // namespace pal +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palcounters.cpp b/projects/clr/rocclr/runtime/device/pal/palcounters.cpp index 2be9c3d50e..3af5ca0cf2 100644 --- a/projects/clr/rocclr/runtime/device/pal/palcounters.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palcounters.cpp @@ -676,12 +676,12 @@ void PerfCounter::convertInfo() { break; case Pal::GfxIpLevel::GfxIp10: case Pal::GfxIpLevel::GfxIp10_1: - if (info_.blockIndex_ < gfx10BlockIdPal.size()) { - auto p = gfx10BlockIdPal[info_.blockIndex_]; - info_.blockIndex_ = std::get<0>(p); - info_.counterIndex_ = std::get<1>(p); - } - break; + if (info_.blockIndex_ < gfx10BlockIdPal.size()) { + auto p = gfx10BlockIdPal[info_.blockIndex_]; + info_.blockIndex_ = std::get<0>(p); + info_.counterIndex_ = std::get<1>(p); + } + break; default: Unimplemented(); break; diff --git a/projects/clr/rocclr/runtime/device/pal/palcounters.hpp b/projects/clr/rocclr/runtime/device/pal/palcounters.hpp index ea55cc1600..4632c8b277 100644 --- a/projects/clr/rocclr/runtime/device/pal/palcounters.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palcounters.hpp @@ -84,8 +84,7 @@ class PerfCounter : public device::PerfCounter { cl_uint blockIndex, //!< HW block index cl_uint counterIndex, //!< Counter index within the block cl_uint eventIndex) //!< Event index for profiling - : gpuDevice_(device), - palRef_(palRef) { + : gpuDevice_(device), palRef_(palRef) { info_.blockIndex_ = blockIndex; info_.counterIndex_ = counterIndex; info_.eventIndex_ = eventIndex; diff --git a/projects/clr/rocclr/runtime/device/pal/paldebugger.hpp b/projects/clr/rocclr/runtime/device/pal/paldebugger.hpp index cb1d4dd981..70812b4028 100644 --- a/projects/clr/rocclr/runtime/device/pal/paldebugger.hpp +++ b/projects/clr/rocclr/runtime/device/pal/paldebugger.hpp @@ -98,10 +98,10 @@ struct HwDebugWaveAddr { }; /*! \brief Kernel code information -* -* This structure contains the pointer of mapped kernel code for host access -* and its size (in bytes) -*/ + * + * This structure contains the pointer of mapped kernel code for host access + * and its size (in bytes) + */ struct AqlCodeInfo { amd_kernel_code_t* aqlCode_; //! pointer of AQL code to allow host access uint32_t aqlCodeSize_; //! size of AQL code diff --git a/projects/clr/rocclr/runtime/device/pal/paldebugmanager.cpp b/projects/clr/rocclr/runtime/device/pal/paldebugmanager.cpp index 124de40991..f8fdac9d0e 100644 --- a/projects/clr/rocclr/runtime/device/pal/paldebugmanager.cpp +++ b/projects/clr/rocclr/runtime/device/pal/paldebugmanager.cpp @@ -143,7 +143,7 @@ void GpuDebugManager::unregisterDebugger() { void GpuDebugManager::flushCache(uint32_t mask) { HwDbgGpuCacheMask cacheMask(mask); - //device()->xferQueue()->flushCuCaches(cacheMask); + // device()->xferQueue()->flushCuCaches(cacheMask); } diff --git a/projects/clr/rocclr/runtime/device/pal/paldefs.hpp b/projects/clr/rocclr/runtime/device/pal/paldefs.hpp index 989efc51d9..fdd8213cee 100644 --- a/projects/clr/rocclr/runtime/device/pal/paldefs.hpp +++ b/projects/clr/rocclr/runtime/device/pal/paldefs.hpp @@ -47,9 +47,9 @@ struct GpuEvent { static constexpr uint32_t InvalidID = ((1 << 30) - 1); struct { - uint32_t id_ : 30; ///< Actual event id - uint32_t modified_ : 1; ///< Resource associated with the event was modified - uint32_t engineId_ : 1; ///< Type of the id + uint32_t id_ : 30; ///< Actual event id + uint32_t modified_ : 1; ///< Resource associated with the event was modified + uint32_t engineId_ : 1; ///< Type of the id }; //! GPU event default constructor GpuEvent() : id_(InvalidID), modified_(false), engineId_(MainEngine) {} @@ -63,8 +63,11 @@ struct GpuEvent { void invalidate() { id_ = InvalidID; } // Overwrite default assign operator to preserve modified_ field - GpuEvent& operator=(const GpuEvent& evt) - { id_ = evt.id_; engineId_ = evt.engineId_; return *this; } + GpuEvent& operator=(const GpuEvent& evt) { + id_ = evt.id_; + engineId_ = evt.engineId_; + return *this; + } }; /*! \addtogroup PAL @@ -113,87 +116,110 @@ const static uint HsaSamplerObjectAlignment = 16; const static uint DeviceQueueMaskSize = 32; struct AMDDeviceInfo { - const char* targetName_; //!< Target name - const char* machineTarget_; //!< Machine target - const char* machineTargetLC_;//!< Machine target for LC - uint simdPerCU_; //!< Number of SIMDs per CU - uint simdWidth_; //!< Number of workitems processed per SIMD - uint simdInstructionWidth_; //!< Number of instructions processed per SIMD - uint memChannelBankWidth_; //!< Memory channel bank width - uint localMemSizePerCU_; //!< Local memory size per CU - uint localMemBanks_; //!< Number of banks of local memory - uint gfxipVersionLC_; //!< The core engine GFXIP version for LC - uint gfxipVersion_; //!< The core engine GFXIP version - bool xnackEnabled_; //!< Enable XNACK feature + const char* targetName_; //!< Target name + const char* machineTarget_; //!< Machine target + const char* machineTargetLC_; //!< Machine target for LC + uint simdPerCU_; //!< Number of SIMDs per CU + uint simdWidth_; //!< Number of workitems processed per SIMD + uint simdInstructionWidth_; //!< Number of instructions processed per SIMD + uint memChannelBankWidth_; //!< Memory channel bank width + uint localMemSizePerCU_; //!< Local memory size per CU + uint localMemBanks_; //!< Number of banks of local memory + uint gfxipVersionLC_; //!< The core engine GFXIP version for LC + uint gfxipVersion_; //!< The core engine GFXIP version + bool xnackEnabled_; //!< Enable XNACK feature }; static const AMDDeviceInfo DeviceInfo[] = { - /* Unknown */ {"", "unknown", "", 4, 16, 1, 256, 64 * Ki, 32, 0, 0, false}, - /* Tahiti */ {"", "tahiti", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false}, - /* Pitcairn */ {"", "pitcairn", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false}, - /* Capeverde */ {"", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false}, - /* Oland */ {"", "oland", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false}, - /* Hainan */ {"", "hainan", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false}, + /* Unknown */ {"", "unknown", "", 4, 16, 1, 256, 64 * Ki, 32, 0, 0, false}, + /* Tahiti */ {"", "tahiti", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false}, + /* Pitcairn */ {"", "pitcairn", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false}, + /* Capeverde */ {"", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false}, + /* Oland */ {"", "oland", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false}, + /* Hainan */ {"", "hainan", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false}, - /* Bonaire */ {"Bonaire", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false}, - /* Hawaii */ {"Hawaii", "hawaii", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false}, - /* Hawaii */ {"", "grenada", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false}, - /* Hawaii */ {"", "maui", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false}, + /* Bonaire */ {"Bonaire", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false}, + /* Hawaii */ {"Hawaii", "hawaii", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false}, + /* Hawaii */ {"", "grenada", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false}, + /* Hawaii */ {"", "maui", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false}, - /* Kalindi */ {"Kalindi", "kalindi", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false}, - /* Godavari */ {"Mullins", "mullins", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false}, - /* Spectre */ {"Spectre", "spectre", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false}, - /* Spooky */ {"Spooky", "spooky", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false}, + /* Kalindi */ {"Kalindi", "kalindi", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false}, + /* Godavari */ {"Mullins", "mullins", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false}, + /* Spectre */ {"Spectre", "spectre", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false}, + /* Spooky */ {"Spooky", "spooky", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false}, - /* Carrizo */ {"Carrizo", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801,false}, - /* Bristol */ {"Bristol Ridge", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801, false}, - /* Stoney */ {"Stoney", "stoney", "", 4, 16, 1, 256, 64 * Ki, 32, 810, 810, false}, + /* Carrizo */ {"Carrizo", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801, false}, + /* Bristol */ {"Bristol Ridge", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801, false}, + /* Stoney */ {"Stoney", "stoney", "", 4, 16, 1, 256, 64 * Ki, 32, 810, 810, false}, - /* Iceland */ {"Iceland", "iceland", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false}, - /* Tonga */ {"Tonga", "tonga", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false}, - /* Fiji */ {"Fiji", "fiji", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false}, - /* Ellesmere */ {"Ellesmere", "ellesmere", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false}, - /* Baffin */ {"Baffin", "baffin", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false}, - /* Lexa */ {"gfx804", "gfx804", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false}, + /* Iceland */ {"Iceland", "iceland", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false}, + /* Tonga */ {"Tonga", "tonga", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false}, + /* Fiji */ {"Fiji", "fiji", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false}, + /* Ellesmere */ + {"Ellesmere", "ellesmere", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false}, + /* Baffin */ {"Baffin", "baffin", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false}, + /* Lexa */ {"gfx804", "gfx804", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false}, }; // Ordering as per AsicRevision# in //depot/stg/pal/inc/core/palDevice.h and // http://confluence.amd.com/pages/viewpage.action?spaceKey=ASLC&title=AMDGPU+Target+Names static const AMDDeviceInfo Gfx9PlusSubDeviceInfo[] = { - /* Vega10 */{"gfx900", "gfx900", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 900, 900, false}, - /* Vega10 XNACK */{ LIGHTNING_SWITCH("gfx900","gfx901"), "gfx901", "gfx900", - 4, 16, 1, 256, 64 * Ki, 32, 900, 901, true}, - /* Vega12 */{"gfx904", "gfx904", "gfx904", 4, 16, 1, 256, 64 * Ki, 32, 904, 904, false}, - /* Vega12 XNACK */{ LIGHTNING_SWITCH("gfx904","gfx905"), "gfx905", "gfx904", - 4, 16, 1, 256, 64 * Ki, 32, 904, 905, true}, - /* Vega20 */{"gfx906", "gfx906", "gfx906", 4, 16, 1, 256, 64 * Ki, 32, 906, 906, false}, - /* Vega20 XNACK */{ LIGHTNING_SWITCH("gfx906","gfx907"), "gfx907", "gfx906", - 4, 16, 1, 256, 64 * Ki, 32, 906, 907, true}, - /* Raven */{"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false}, - /* Raven XNACK */{ LIGHTNING_SWITCH("gfx902","gfx903"), "gfx903", "gfx902", - 4, 16, 1, 256, 64 * Ki, 32, 902, 903, true}, - /* Raven2 */{"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false}, - /* Raven2 XNACK */{ LIGHTNING_SWITCH("gfx902","gfx903"), "gfx903", "gfx902", - 4, 16, 1, 256, 64 * Ki, 32, 902, 903, true}, - /* Renoir */{"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false}, - /* Renoir XNACK */{ LIGHTNING_SWITCH("gfx902","gfx903"), "gfx903", "gfx902", - 4, 16, 1, 256, 64 * Ki, 32, 902, 903, true}, - /* Navi10_A0 */{ "gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, false }, - /* Navi10_A0 XNACK */{ "gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, true }, - /* Navi10 */{"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, false}, - /* Navi10 XNACK */{"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, true}, - /* Navi10Lite */{"gfx1000", "gfx1000","gfx1000", 2, 32, 1, 256, 64 * Ki, 32, 1000, 1000, false}, - /* Navi10Lite XNACK */{"gfx1000", "gfx1000", "gfx1000", 2, 32, 1, 256, 64 * Ki, 32, 1000, 1000, true}, - /* Navi12 */{ "gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, false }, - /* Navi12 XNACK */{ "gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, true }, - /* Navi12Lite */{ "gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, false }, - /* Navi12Lite XNACK */{ "gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, true }, - /* Navi14 */{ "gfx1012", "gfx1012", "gfx1012", 2, 32, 1, 256, 64 * Ki, 32, 1012, 1012, false }, - /* Navi14 XNACK */{ "gfx1012", "gfx1012", "gfx1012", 2, 32, 1, 256, 64 * Ki, 32, 1012, 1012, true }, - /* UnknownDevice3 */{ "gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, false }, - /* UnknownDevice3 XNACK */{ "gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, true }, - /* UnknownDevice2 */{ "gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, false }, - /* UnknownDevice2 XNACK */{ "gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, true }, + /* Vega10 */ {"gfx900", "gfx900", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 900, 900, false}, + /* Vega10 XNACK */ + {LIGHTNING_SWITCH("gfx900", "gfx901"), "gfx901", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 900, 901, + true}, + /* Vega12 */ {"gfx904", "gfx904", "gfx904", 4, 16, 1, 256, 64 * Ki, 32, 904, 904, false}, + /* Vega12 XNACK */ + {LIGHTNING_SWITCH("gfx904", "gfx905"), "gfx905", "gfx904", 4, 16, 1, 256, 64 * Ki, 32, 904, 905, + true}, + /* Vega20 */ {"gfx906", "gfx906", "gfx906", 4, 16, 1, 256, 64 * Ki, 32, 906, 906, false}, + /* Vega20 XNACK */ + {LIGHTNING_SWITCH("gfx906", "gfx907"), "gfx907", "gfx906", 4, 16, 1, 256, 64 * Ki, 32, 906, 907, + true}, + /* Raven */ {"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false}, + /* Raven XNACK */ + {LIGHTNING_SWITCH("gfx902", "gfx903"), "gfx903", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 903, + true}, + /* Raven2 */ {"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false}, + /* Raven2 XNACK */ + {LIGHTNING_SWITCH("gfx902", "gfx903"), "gfx903", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 903, + true}, + /* Renoir */ {"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false}, + /* Renoir XNACK */ + {LIGHTNING_SWITCH("gfx902", "gfx903"), "gfx903", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 903, + true}, + /* Navi10_A0 */ + {"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, false}, + /* Navi10_A0 XNACK */ + {"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, true}, + /* Navi10 */ + {"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, false}, + /* Navi10 XNACK */ + {"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, true}, + /* Navi10Lite */ + {"gfx1000", "gfx1000", "gfx1000", 2, 32, 1, 256, 64 * Ki, 32, 1000, 1000, false}, + /* Navi10Lite XNACK */ + {"gfx1000", "gfx1000", "gfx1000", 2, 32, 1, 256, 64 * Ki, 32, 1000, 1000, true}, + /* Navi12 */ + {"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, false}, + /* Navi12 XNACK */ + {"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, true}, + /* Navi12Lite */ + {"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, false}, + /* Navi12Lite XNACK */ + {"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, true}, + /* Navi14 */ + {"gfx1012", "gfx1012", "gfx1012", 2, 32, 1, 256, 64 * Ki, 32, 1012, 1012, false}, + /* Navi14 XNACK */ + {"gfx1012", "gfx1012", "gfx1012", 2, 32, 1, 256, 64 * Ki, 32, 1012, 1012, true}, + /* UnknownDevice3 */ + {"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, false}, + /* UnknownDevice3 XNACK */ + {"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, true}, + /* UnknownDevice2 */ + {"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, false}, + /* UnknownDevice2 XNACK */ + {"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, true}, }; diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp index c34f2ab003..4132b19f78 100644 --- a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp @@ -53,15 +53,14 @@ void PalDeviceUnload() { pal::Device::tearDown(); } namespace pal { -Util::GenericAllocator NullDevice::allocator_; +Util::GenericAllocator NullDevice::allocator_; char* Device::platformObj_; -Pal::IPlatform* Device::platform_; +Pal::IPlatform* Device::platform_; NullDevice::Compiler* NullDevice::compiler_; AppProfile Device::appProfile_; -NullDevice::NullDevice() - : amd::Device(), ipLevel_(Pal::GfxIpLevel::None), hwInfo_(nullptr) {} +NullDevice::NullDevice() : amd::Device(), ipLevel_(Pal::GfxIpLevel::None), hwInfo_(nullptr) {} bool NullDevice::init() { std::vector devices; @@ -89,8 +88,8 @@ bool NullDevice::init() { driverVersion = static_cast(devices[i])->info().driverVersion_; if (driverVersion.find("PAL") != std::string::npos) { if (static_cast(devices[i])->asicRevision() == revision) { - foundActive = true; - break; + foundActive = true; + break; } } } @@ -109,132 +108,130 @@ bool NullDevice::init() { } } } -#endif // defined(WITH_COMPILER_LIB) +#endif // defined(WITH_COMPILER_LIB) // Loop through all supported devices and create each of them - for (uint id = 0; - id < sizeof(Gfx9PlusSubDeviceInfo)/sizeof(AMDDeviceInfo); ++id) { - bool foundActive = false; - bool foundDuplicate = false; - uint gfxipVersion = IS_LIGHTNING ? pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_ : - pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_; + for (uint id = 0; id < sizeof(Gfx9PlusSubDeviceInfo) / sizeof(AMDDeviceInfo); ++id) { + bool foundActive = false; + bool foundDuplicate = false; + uint gfxipVersion = IS_LIGHTNING ? pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_ + : pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_; - if (pal::Gfx9PlusSubDeviceInfo[id].targetName_[0] == '\0') { - continue; - } + if (pal::Gfx9PlusSubDeviceInfo[id].targetName_[0] == '\0') { + continue; + } - // Loop through all active PAL devices and see if we match one - for (uint i = 0; i < devices.size(); ++i) { - driverVersion = static_cast(devices[i])->info().driverVersion_; - if (driverVersion.find("PAL") != std::string::npos) { - gfxipVersion = devices[i]->settings().useLightning_ ? - pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_ : - pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_; - uint gfxIpCurrent = devices[i]->settings().useLightning_ ? - static_cast(devices[i])->hwInfo()->gfxipVersionLC_ : - static_cast(devices[i])->hwInfo()->gfxipVersion_; - if (gfxIpCurrent == gfxipVersion) { - foundActive = true; - break; - } + // Loop through all active PAL devices and see if we match one + for (uint i = 0; i < devices.size(); ++i) { + driverVersion = static_cast(devices[i])->info().driverVersion_; + if (driverVersion.find("PAL") != std::string::npos) { + gfxipVersion = devices[i]->settings().useLightning_ + ? pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_ + : pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_; + uint gfxIpCurrent = devices[i]->settings().useLightning_ + ? static_cast(devices[i])->hwInfo()->gfxipVersionLC_ + : static_cast(devices[i])->hwInfo()->gfxipVersion_; + if (gfxIpCurrent == gfxipVersion) { + foundActive = true; + break; } } + } - // Don't report an offline device if it's active - if (foundActive) { - continue; + // Don't report an offline device if it's active + if (foundActive) { + continue; + } + + // Loop through all previous devices in the Gfx9PlusSubDeviceInfo list + // and compare them with the current entry to see if the current entry + // was listed previously in the Gfx9PlusSubDeviceInfo, if so, then it + // means the current entry already has been added in the offline device list + for (uint j = 0; j < id; ++j) { + if (pal::Gfx9PlusSubDeviceInfo[j].targetName_[0] == '\0') { + continue; } - - // Loop through all previous devices in the Gfx9PlusSubDeviceInfo list - // and compare them with the current entry to see if the current entry - // was listed previously in the Gfx9PlusSubDeviceInfo, if so, then it - // means the current entry already has been added in the offline device list - for (uint j = 0; j < id; ++j) { - if (pal::Gfx9PlusSubDeviceInfo[j].targetName_[0] == '\0') { - continue; - } - if (strcmp(pal::Gfx9PlusSubDeviceInfo[j].targetName_, - pal::Gfx9PlusSubDeviceInfo[id].targetName_) == 0) { - foundDuplicate = true; - break; - } + if (strcmp(pal::Gfx9PlusSubDeviceInfo[j].targetName_, + pal::Gfx9PlusSubDeviceInfo[id].targetName_) == 0) { + foundDuplicate = true; + break; } + } - // Don't report an offline device twice - if (foundDuplicate) { - continue; - } + // Don't report an offline device twice + if (foundDuplicate) { + continue; + } - Pal::GfxIpLevel ipLevel = Pal::GfxIpLevel::_None; - uint ipLevelMajor = round(gfxipVersion / 100); - uint ipLevelMinor = round(gfxipVersion / 10 % 10); - switch (ipLevelMajor) { + Pal::GfxIpLevel ipLevel = Pal::GfxIpLevel::_None; + uint ipLevelMajor = round(gfxipVersion / 100); + uint ipLevelMinor = round(gfxipVersion / 10 % 10); + switch (ipLevelMajor) { case 9: - ipLevel = Pal::GfxIpLevel::GfxIp9; - break; + ipLevel = Pal::GfxIpLevel::GfxIp9; + break; case 10: switch (ipLevelMinor) { - case 0: - ipLevel = Pal::GfxIpLevel::GfxIp10; - break; - case 1: - ipLevel = Pal::GfxIpLevel::GfxIp10_1; - break; - case 2: - ipLevel = Pal::GfxIpLevel::GfxIp10_2; - break; - case 3: - ipLevel = Pal::GfxIpLevel::GfxIp10_3; - break; + case 0: + ipLevel = Pal::GfxIpLevel::GfxIp10; + break; + case 1: + ipLevel = Pal::GfxIpLevel::GfxIp10_1; + break; + case 2: + ipLevel = Pal::GfxIpLevel::GfxIp10_2; + break; + case 3: + ipLevel = Pal::GfxIpLevel::GfxIp10_3; + break; } - } + } - Pal::AsicRevision revision = Pal::AsicRevision::Unknown; - uint xNACKSupported = pal::Gfx9PlusSubDeviceInfo[id].xnackEnabled_ ? 1 : 0; + Pal::AsicRevision revision = Pal::AsicRevision::Unknown; + uint xNACKSupported = pal::Gfx9PlusSubDeviceInfo[id].xnackEnabled_ ? 1 : 0; - switch (gfxipVersion) { + switch (gfxipVersion) { case 901: case 900: - revision = Pal::AsicRevision::Vega10; - break; + revision = Pal::AsicRevision::Vega10; + break; case 903: case 902: - revision = Pal::AsicRevision::Raven; - break; + revision = Pal::AsicRevision::Raven; + break; case 905: case 904: - revision = Pal::AsicRevision::Vega12; - break; + revision = Pal::AsicRevision::Vega12; + break; case 907: case 906: - revision = Pal::AsicRevision::Vega20; - break; + revision = Pal::AsicRevision::Vega20; + break; case 1000: - revision = Pal::AsicRevision::Navi10Lite; - break; + revision = Pal::AsicRevision::Navi10Lite; + break; case 1010: - revision = Pal::AsicRevision::Navi10; - break; + revision = Pal::AsicRevision::Navi10; + break; case 1011: - revision = Pal::AsicRevision::Navi12; - break; + revision = Pal::AsicRevision::Navi12; + break; case 1012: - revision = Pal::AsicRevision::Navi14; - break; + revision = Pal::AsicRevision::Navi14; + break; case 1030: - ShouldNotReachHere(); - break; - } + ShouldNotReachHere(); + break; + } - NullDevice* dev = new NullDevice(); - if (nullptr != dev) { - if (!dev->create(revision, ipLevel, xNACKSupported)) { - delete dev; - } - else { - dev->registerDevice(); - } + NullDevice* dev = new NullDevice(); + if (nullptr != dev) { + if (!dev->create(revision, ipLevel, xNACKSupported)) { + delete dev; + } else { + dev->registerDevice(); } + } } return true; @@ -257,10 +254,10 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel, if ((GPU_ENABLE_PAL == 1) && (ipLevel == Pal::GfxIpLevel::_None)) { hwInfo_ = &DeviceInfo[static_cast(asicRevision)]; } else if (ipLevel >= Pal::GfxIpLevel::GfxIp9) { - subtarget = (static_cast(asicRevision_) % - static_cast(Pal::AsicRevision::Vega10)) - << 1 | xNACKSupported; - hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget]; + subtarget = (static_cast(asicRevision_) % static_cast(Pal::AsicRevision::Vega10)) + << 1 | + xNACKSupported; + hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget]; } else { return false; @@ -271,8 +268,7 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel, // Report 512MB for all offline devices Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount]; - heaps[Pal::GpuHeapLocal].heapSize = - heaps[Pal::GpuHeapLocal].physicalHeapSize = 512 * Mi; + heaps[Pal::GpuHeapLocal].heapSize = heaps[Pal::GpuHeapLocal].physicalHeapSize = 512 * Mi; Pal::WorkStationCaps wscaps = {}; @@ -295,7 +291,7 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel, info_.wavefrontWidth_ = settings().enableWave32Mode_ ? 32 : 64; if (settings().useLightning_) { -#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY) +#if defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY) // create compilation object with cache support int gfxipMajor = hwInfo_->gfxipVersionLC_ / 100; int gfxipMinor = hwInfo_->gfxipVersionLC_ / 10 % 10; @@ -323,16 +319,16 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel, cacheCompilation_.reset(compObj); #endif } else { -#if defined(WITH_COMPILER_LIB) +#if defined(WITH_COMPILER_LIB) const char* library = getenv("HSA_COMPILER_LIBRARY"); - aclCompilerOptions opts = { sizeof(aclCompilerOptions_0_8), - library, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - AMD_OCL_SC_LIB }; + aclCompilerOptions opts = {sizeof(aclCompilerOptions_0_8), + library, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + AMD_OCL_SC_LIB}; // Initialize the compiler handle acl_error error; compiler_ = aclCompilerInit(&opts, &error); @@ -370,9 +366,9 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, info_.maxWorkItemDimensions_ = 3; - info_.maxComputeUnits_ = settings().enableWgpMode_ ? - palProp.gfxipProperties.shaderCore.numAvailableCus / 2 : - palProp.gfxipProperties.shaderCore.numAvailableCus; + info_.maxComputeUnits_ = settings().enableWgpMode_ + ? palProp.gfxipProperties.shaderCore.numAvailableCus / 2 + : palProp.gfxipProperties.shaderCore.numAvailableCus; info_.numberOfShaderEngines = palProp.gfxipProperties.shaderCore.numShaderEngines; @@ -427,7 +423,8 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, if (GPU_ADD_HBCC_SIZE) { localRAM = heaps[Pal::GpuHeapLocal].heapSize + heaps[Pal::GpuHeapInvisible].heapSize; } else { - localRAM = heaps[Pal::GpuHeapLocal].physicalHeapSize + heaps[Pal::GpuHeapInvisible].physicalHeapSize; + localRAM = + heaps[Pal::GpuHeapLocal].physicalHeapSize + heaps[Pal::GpuHeapInvisible].physicalHeapSize; } info_.globalMemSize_ = (static_cast(std::min(GPU_MAX_HEAP_SIZE, 100u)) * @@ -445,10 +442,10 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, // Find the largest heap form FB memory if (GPU_ADD_HBCC_SIZE) { info_.maxMemAllocSize_ = std::max(cl_ulong(heaps[Pal::GpuHeapLocal].heapSize), - cl_ulong(heaps[Pal::GpuHeapInvisible].heapSize)); + cl_ulong(heaps[Pal::GpuHeapInvisible].heapSize)); } else { info_.maxMemAllocSize_ = std::max(cl_ulong(heaps[Pal::GpuHeapLocal].physicalHeapSize), - cl_ulong(heaps[Pal::GpuHeapInvisible].physicalHeapSize)); + cl_ulong(heaps[Pal::GpuHeapInvisible].physicalHeapSize)); } #if defined(ATI_OS_WIN) @@ -561,7 +558,7 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, ::strcpy(info_.vendor_, "Advanced Micro Devices, Inc."); ::snprintf(info_.driverVersion_, sizeof(info_.driverVersion_) - 1, AMD_BUILD_STRING " (PAL%s)", - settings().useLightning_ ? ",LC" : ",HSAIL"); + settings().useLightning_ ? ",LC" : ",HSAIL"); info_.profile_ = "FULL_PROFILE"; if (settings().oclVersion_ >= OpenCL20) { @@ -640,15 +637,16 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, info_.cuPerShaderArray_ = palProp.gfxipProperties.shaderCore.numCusPerShaderArray; info_.simdWidth_ = hwInfo()->simdWidth_; info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_; - info_.wavefrontWidth_ = settings().enableWave32Mode_ ? 32: - palProp.gfxipProperties.shaderCore.nativeWavefrontSize; + info_.wavefrontWidth_ = + settings().enableWave32Mode_ ? 32 : palProp.gfxipProperties.shaderCore.nativeWavefrontSize; info_.availableSGPRs_ = palProp.gfxipProperties.shaderCore.numAvailableSgprs; info_.globalMemChannelBanks_ = 4; info_.globalMemChannelBankWidth_ = hwInfo()->memChannelBankWidth_; info_.localMemSizePerCU_ = hwInfo()->localMemSizePerCU_; info_.localMemBanks_ = hwInfo()->localMemBanks_; - info_.gfxipVersion_ = settings().useLightning_ ? hwInfo()->gfxipVersionLC_ : hwInfo()->gfxipVersion_; + info_.gfxipVersion_ = + settings().useLightning_ ? hwInfo()->gfxipVersionLC_ : hwInfo()->gfxipVersion_; info_.timeStampFrequency_ = 1000000; info_.numAsyncQueues_ = numComputeRings; @@ -661,7 +659,7 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, info_.pcieDeviceId_ = palProp.deviceId; info_.pcieRevisionId_ = palProp.revisionId; info_.maxThreadsPerCU_ = info_.wavefrontWidth_ * hwInfo()->simdPerCU_ * - palProp.gfxipProperties.shaderCore.numWavefrontsPerSimd; + palProp.gfxipProperties.shaderCore.numWavefrontsPerSimd; } } @@ -789,8 +787,7 @@ Device::Device() globalScratchBuf_(nullptr), srdManager_(nullptr), resourceList_(nullptr), - rgpCaptureMgr_(nullptr) - {} + rgpCaptureMgr_(nullptr) {} Device::~Device() { // remove the HW debug manager @@ -803,8 +800,8 @@ Device::~Device() { } if (glb_ctx_ != nullptr) { - glb_ctx_->release(); - glb_ctx_ = nullptr; + glb_ctx_->release(); + glb_ctx_ = nullptr; } delete srdManager_; @@ -878,19 +875,21 @@ bool Device::create(Pal::IDevice* device) { ipLevel_ = properties().gfxLevel; asicRevision_ = properties().revision; - // XNACK flag should be set for PageMigration | IOMMUv2 Support - uint isXNACKSupported = static_cast(properties_.gpuMemoryProperties.flags.pageMigrationEnabled - || properties_.gpuMemoryProperties.flags.iommuv2Support); + // XNACK flag should be set for PageMigration | IOMMUv2 Support + uint isXNACKSupported = + static_cast(properties_.gpuMemoryProperties.flags.pageMigrationEnabled || + properties_.gpuMemoryProperties.flags.iommuv2Support); uint subtarget = isXNACKSupported; // Update HW info for the device if ((GPU_ENABLE_PAL == 1) && (properties().revision <= Pal::AsicRevision::Polaris12)) { hwInfo_ = &DeviceInfo[static_cast(properties().revision)]; } else if (ipLevel_ >= Pal::GfxIpLevel::GfxIp9) { - // For compiler sub targets - subtarget = (static_cast(asicRevision_) % static_cast(Pal::AsicRevision::Vega10)) << 1 | - subtarget; - hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget]; + // For compiler sub targets + subtarget = (static_cast(asicRevision_) % static_cast(Pal::AsicRevision::Vega10)) + << 1 | + subtarget; + hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget]; } else { return false; } @@ -995,7 +994,7 @@ bool Device::create(Pal::IDevice* device) { } if (settings().useLightning_) { -#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY) +#if defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY) // create compilation object with cache support int gfxipMajor = hwInfo()->gfxipVersionLC_ / 100; int gfxipMinor = hwInfo()->gfxipVersionLC_ / 10 % 10; @@ -1013,7 +1012,7 @@ bool Device::create(Pal::IDevice* device) { } amd::CacheCompilation* compObj = new amd::CacheCompilation( - cacheTarget.str(), "_pal", OCL_CODE_CACHE_ENABLE, OCL_CODE_CACHE_RESET); + cacheTarget.str(), "_pal", OCL_CODE_CACHE_ENABLE, OCL_CODE_CACHE_RESET); if (!compObj) { LogError("Unable to create cache compilation object!"); return false; @@ -1021,18 +1020,17 @@ bool Device::create(Pal::IDevice* device) { cacheCompilation_.reset(compObj); #endif - } - else { -#if defined(WITH_COMPILER_LIB) + } else { +#if defined(WITH_COMPILER_LIB) const char* library = getenv("HSA_COMPILER_LIBRARY"); - aclCompilerOptions opts = { sizeof(aclCompilerOptions_0_8), - library, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - AMD_OCL_SC_LIB }; + aclCompilerOptions opts = {sizeof(aclCompilerOptions_0_8), + library, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + AMD_OCL_SC_LIB}; // Initialize the compiler handle acl_error error; compiler_ = aclCompilerInit(&opts, &error); @@ -1056,7 +1054,7 @@ bool Device::create(Pal::IDevice* device) { if ((glb_ctx_ == nullptr) && (gNumDevices > 1) && (device == gDeviceList[gNumDevices - 1])) { std::vector devices; - uint32_t numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, true); + uint32_t numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, true); // Add all PAL devices for (uint32_t i = gStartDevice; i < numDevices; ++i) { devices.push_back(amd::Device::devices()[i]); @@ -1070,8 +1068,8 @@ bool Device::create(Pal::IDevice* device) { if (glb_ctx_ == nullptr) { return false; } - amd::Buffer* buf = - new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize); + amd::Buffer* buf = + new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize); if ((buf != nullptr) && buf->create()) { p2p_stage_ = buf; } else { @@ -1086,11 +1084,8 @@ bool Device::create(Pal::IDevice* device) { // ===================================================================================================================== // Master function that handles developer callbacks from PAL. -void PAL_STDCALL Device::PalDeveloperCallback( - void* pPrivateData, - const Pal::uint32 deviceIndex, - Pal::Developer::CallbackType type, - void* pCbData) { +void PAL_STDCALL Device::PalDeveloperCallback(void* pPrivateData, const Pal::uint32 deviceIndex, + Pal::Developer::CallbackType type, void* pCbData) { Device* device = static_cast(pPrivateData); const auto& barrier = *static_cast(pCbData); @@ -1099,7 +1094,7 @@ void PAL_STDCALL Device::PalDeveloperCallback( VirtualGPU* gpu = nullptr; if (pBarrierData->pCmdBuffer != nullptr) { // Find which queue the current command buffer belongs - for (const auto& it: device->vgpus()) { + for (const auto& it : device->vgpus()) { if (it->isActiveCmd(pBarrierData->pCmdBuffer)) { gpu = it; break; @@ -1112,18 +1107,18 @@ void PAL_STDCALL Device::PalDeveloperCallback( } switch (type) { - case Pal::Developer::CallbackType::BarrierBegin: - device->rgpCaptureMgr()->WriteBarrierStartMarker(gpu, barrier); - break; - case Pal::Developer::CallbackType::BarrierEnd: - device->rgpCaptureMgr()->WriteBarrierEndMarker(gpu, barrier); - break; - case Pal::Developer::CallbackType::ImageBarrier: - assert(false); - break; - case Pal::Developer::CallbackType::DrawDispatch: + case Pal::Developer::CallbackType::BarrierBegin: + device->rgpCaptureMgr()->WriteBarrierStartMarker(gpu, barrier); break; - default: + case Pal::Developer::CallbackType::BarrierEnd: + device->rgpCaptureMgr()->WriteBarrierEndMarker(gpu, barrier); + break; + case Pal::Developer::CallbackType::ImageBarrier: + assert(false); + break; + case Pal::Developer::CallbackType::DrawDispatch: + break; + default: break; } } @@ -1136,15 +1131,16 @@ bool Device::initializeHeapResources() { // Request all compute engines finalizeInfo.requestedEngineCounts[Pal::EngineTypeCompute].engines = ((1 << numComputeEngines_) - 1); - for (const auto& it: exclusiveComputeEnginesId_) { + for (const auto& it : exclusiveComputeEnginesId_) { // Request real time compute engines - finalizeInfo.requestedEngineCounts[Pal::EngineTypeExclusiveCompute].engines |= (1 << it.second); + finalizeInfo.requestedEngineCounts[Pal::EngineTypeExclusiveCompute].engines |= + (1 << it.second); } // Request all SDMA engines finalizeInfo.requestedEngineCounts[Pal::EngineTypeDma].engines = (1 << numDmaEngines_) - 1; if (iDev()->Finalize(finalizeInfo) != Pal::Result::Success) { - return false; + return false; } heapInitComplete_ = true; @@ -1201,7 +1197,8 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) { if (queue != nullptr) { profiling = queue->properties().test(CL_QUEUE_PROFILING_ENABLE); if (queue->asHostQueue() != nullptr) { - bool interopQueue = (0 != (queue->context().info().flags_ & + bool interopQueue = (0 != + (queue->context().info().flags_ & (amd::Context::GLDeviceKhr | amd::Context::D3D10DeviceKhr | amd::Context::D3D11DeviceKhr))); rtCUs = queue->rtCUs(); @@ -1233,8 +1230,7 @@ device::Program* Device::createProgram(amd::option::Options* options) { device::Program* program; if (settings().useLightning_) { program = new LightningProgram(*this); - } - else { + } else { program = new HSAILProgram(*this); } if (program == nullptr) { @@ -1249,9 +1245,7 @@ typedef std::unordered_map requestedDevices_t; //! Parses the requested list of devices to be exposed to the user. static void parseRequestedDeviceList(const char* requestedDeviceList, - requestedDevices_t& requestedDevices, - uint32_t numDevices) { - + requestedDevices_t& requestedDevices, uint32_t numDevices) { char* pch = strtok(const_cast(requestedDeviceList), ","); while (pch != nullptr) { bool deviceIdValid = true; @@ -1263,8 +1257,7 @@ static void parseRequestedDeviceList(const char* requestedDeviceList, break; } } - if (currentDeviceIndex < 0 || - static_cast(currentDeviceIndex) >= numDevices) { + if (currentDeviceIndex < 0 || static_cast(currentDeviceIndex) >= numDevices) { deviceIdValid = false; } // Get next token. @@ -1310,9 +1303,9 @@ bool Device::init() { // Count up all the devices in the system. platform_->EnumerateDevices(&gNumDevices, &gDeviceList[0]); - const char* requestedDeviceList = amd::IS_HIP ? ((HIP_VISIBLE_DEVICES[0] != '\0') ? - HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES) - : GPU_DEVICE_ORDINAL; + const char* requestedDeviceList = amd::IS_HIP + ? ((HIP_VISIBLE_DEVICES[0] != '\0') ? HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES) + : GPU_DEVICE_ORDINAL; if (requestedDeviceList[0] != '\0') { useDeviceList = true; @@ -1465,8 +1458,8 @@ pal::Memory* Device::createBuffer(amd::Memory& owner, bool directAccess) const { if (result) { // Disallow permanent map for Win7 only, since OS will move buffer to sysmem if (IS_LINUX || - // Or Win10 - (properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs == false)) { + // Or Win10 + (properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs == false)) { void* address = gpuMemory->map(nullptr); CondLog(address == nullptr, "PAL failed lock of persistent memory!"); } @@ -1697,9 +1690,9 @@ device::Memory* Device::createMemory(amd::Memory& owner) const { (memory->memoryType() != Resource::ExternalPhysical) && ((owner.getHostMem() != nullptr) || ((nullptr != owner.parent()) && (owner.getHostMem() != nullptr)))) { - bool ok = memory->pinSystemMemory(owner.getHostMem(), (owner.getHostMemRef()->size()) - ? owner.getHostMemRef()->size() - : owner.getSize()); + bool ok = memory->pinSystemMemory( + owner.getHostMem(), + (owner.getHostMemRef()->size()) ? owner.getHostMemRef()->size() : owner.getSize()); //! \note: Ignore the pinning result for now } @@ -1720,9 +1713,9 @@ bool Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler) device::Memory* Device::createView(amd::Memory& owner, const device::Memory& parent) const { assert((owner.asImage() != nullptr) && "View supports images only"); const amd::Image& image = *owner.asImage(); - pal::Memory* gpuImage = new pal::Image( - *this, owner, image.getWidth(), image.getHeight(), image.getDepth(), - image.getImageFormat(), image.getType(), image.getMipLevels()); + pal::Memory* gpuImage = + new pal::Image(*this, owner, image.getWidth(), image.getHeight(), image.getDepth(), + image.getImageFormat(), image.getType(), image.getMipLevels()); // Create resource if (nullptr != gpuImage) { @@ -1827,19 +1820,18 @@ bool Device::globalFreeMemory(size_t* freeMemory) const { Pal::gpusize invisible = allocedMem[Pal::GpuHeapInvisible] - resourceCache().lclCacheSize(); // Fill free memory info - freeMemory[TotalFreeMemory] = static_cast((info().globalMemSize_ - - (local + invisible)) / Ki); + freeMemory[TotalFreeMemory] = + static_cast((info().globalMemSize_ - (local + invisible)) / Ki); if (invisible >= heaps_[Pal::GpuHeapInvisible].heapSize) { invisible = 0; - } - else { + } else { invisible = heaps_[Pal::GpuHeapInvisible].heapSize - invisible; } freeMemory[LargestFreeBlock] = static_cast(invisible) / Ki; if (settings().apuSystem_) { Pal::gpusize sysMem = allocedMem[Pal::GpuHeapGartCacheable] + allocedMem[Pal::GpuHeapGartUswc] - - resourceCache().cacheSize() + resourceCache().lclCacheSize(); + resourceCache().cacheSize() + resourceCache().lclCacheSize(); sysMem /= Ki; if (sysMem >= freeMemory[TotalFreeMemory]) { freeMemory[TotalFreeMemory] = 0; @@ -1945,8 +1937,7 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu) { amd::ScopedLock lk(scratchAlloc_); uint sb = vgpu->hwRing(); static const uint WaveSizeLimit = ((1 << 21) - 256); - const uint threadSizeLimit = - WaveSizeLimit / info().wavefrontWidth_; + const uint threadSizeLimit = WaveSizeLimit / info().wavefrontWidth_; if (regNum > threadSizeLimit) { LogError("Requested private memory is bigger than HW supports!"); regNum = threadSizeLimit; @@ -1968,9 +1959,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu) { // Calculate the size of the scratch buffer for a queue uint32_t numTotalCUs = info().maxComputeUnits_; uint32_t numMaxWaves = settings().numScratchWavesPerCu_ * numTotalCUs; - scratchBuf->size_ = - static_cast(info().wavefrontWidth_) * - scratchBuf->regNum_ * numMaxWaves * sizeof(uint32_t); + scratchBuf->size_ = static_cast(info().wavefrontWidth_) * scratchBuf->regNum_ * + numMaxWaves * sizeof(uint32_t); scratchBuf->size_ = std::min(scratchBuf->size_, info().maxMemAllocSize_); scratchBuf->size_ = std::min(scratchBuf->size_, uint64_t(3 * Gi)); // Note: Generic address space setup in HW requires 64KB alignment for scratch @@ -2280,7 +2270,7 @@ void Device::SrdManager::freeSrdSlot(uint64_t addr) { void Device::updateAllocedMemory(Pal::GpuHeap heap, Pal::gpusize size, bool free) const { if (free) { allocedMem[heap] -= size; - } else { + } else { allocedMem[heap] += size; } } @@ -2337,12 +2327,18 @@ cl_int Device::hwDebugManagerInit(amd::Context* context, uintptr_t messageStorag return status; } -bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { +bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, + cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { bool result = false; Pal::SetClockModeInput setClockMode = {}; - Pal::DeviceClockMode palClockMode = static_cast(setClockModeInput.clock_mode); + Pal::DeviceClockMode palClockMode = + static_cast(setClockModeInput.clock_mode); setClockMode.clockMode = palClockMode; - result = (Pal::Result::Success == (iDev()->SetClockMode(setClockMode, reinterpret_cast(pSetClockModeOutput))))? true : false; + result = (Pal::Result::Success == + (iDev()->SetClockMode(setClockMode, + reinterpret_cast(pSetClockModeOutput)))) + ? true + : false; return result; } diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp index 4528954dc2..5420c8202a 100644 --- a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp +++ b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp @@ -49,7 +49,7 @@ class NullDevice : public amd::Device { bool create(Pal::AsicRevision asicRevision, //!< GPU ASIC revision Pal::GfxIpLevel ipLevel, //!< GPU ip level uint xNACKSupported = 0 //!< GPU xNACKSupported - ); + ); //! Instantiate a new virtual device virtual device::VirtualDevice* createVirtualDevice(amd::CommandQueue* queue = NULL) { @@ -111,11 +111,14 @@ class NullDevice : public amd::Device { virtual void svmFree(void* ptr) const { return; } void* Alloc(const Util::AllocInfo& allocInfo) { return allocator_.Alloc(allocInfo); } - void Free(const Util::FreeInfo& freeInfo) { allocator_.Free(freeInfo); } - virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { return true; } + void Free(const Util::FreeInfo& freeInfo) { allocator_.Free(freeInfo); } + virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, + cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { + return true; + } protected: - static Util::GenericAllocator allocator_; //!< Generic memory allocator in PAL + static Util::GenericAllocator allocator_; //!< Generic memory allocator in PAL Pal::AsicRevision asicRevision_; //!< ASIC revision Pal::GfxIpLevel ipLevel_; //!< Device IP level @@ -127,7 +130,7 @@ class NullDevice : public amd::Device { size_t maxTextureSize, //!< Maximum texture size supported in HW uint numComputeRings, //!< Number of compute rings uint numExclusiveComputeRings //!< Number of exclusive compute rings - ); + ); }; //! Forward declarations @@ -148,26 +151,22 @@ class ThreadTrace; #ifndef CL_FILTER_NONE #define CL_FILTER_NONE 0x1142 #endif -enum class ExclusiveQueueType : uint32_t { - RealTime0 = 0, - RealTime1, - Medium -}; +enum class ExclusiveQueueType : uint32_t { RealTime0 = 0, RealTime1, Medium }; class Sampler : public device::Sampler { public: //! Constructor - Sampler(const Device& dev) : dev_(dev) {} + Sampler(const Device& dev) : dev_(dev) {} //! Default destructor for the device memory object virtual ~Sampler(); //! Creates a device sampler from the OCL sampler state bool create(uint32_t oclSamplerState //!< OCL sampler state - ); + ); //! Creates a device sampler from the OCL sampler state bool create(const amd::Sampler& owner //!< AMD sampler object - ); + ); private: //! Disable default copy constructor @@ -216,7 +215,7 @@ class Device : public NullDevice { //! Releases transfer buffer void release(VirtualGPU& gpu, //!< Virual GPU object used with the buffer Memory& buffer //!< Transfer buffer for release - ); + ); //! Returns the buffer's size for transfer size_t bufSize() const { return bufSize_; } @@ -308,7 +307,7 @@ class Device : public NullDevice { //! Initialise a device (i.e. all parts of the constructor that could //! potentially fail) bool create(Pal::IDevice* device //!< PAL device interface object - ); + ); //! Destructor for the physical GPU device virtual ~Device(); @@ -346,7 +345,8 @@ class Device : public NullDevice { virtual bool validateKernel(const amd::Kernel& kernel, //!< AMD kernel object const device::VirtualDevice* vdev); - virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput); + virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, + cl_set_device_clock_mode_output_amd* pSetClockModeOutput); //! Retrieves information about free memory on a GPU device virtual bool globalFreeMemory(size_t* freeMemory) const; @@ -398,9 +398,10 @@ class Device : public NullDevice { //! Returns the number of available compute rings uint numExclusiveComputeEngines() const { return exclusiveComputeEnginesId_.size(); } - //! Returns the map of available exclusive compute rings with the engine index - const std::map& exclusiveComputeEnginesId() const - { return exclusiveComputeEnginesId_; } + //! Returns the map of available exclusive compute rings with the engine index + const std::map& exclusiveComputeEnginesId() const { + return exclusiveComputeEnginesId_; + } //! Returns the number of available DMA engines uint numDMAEngines() const { return numDmaEngines_; } @@ -526,11 +527,8 @@ class Device : public NullDevice { } private: - static void PAL_STDCALL PalDeveloperCallback( - void* pPrivateData, - const Pal::uint32 deviceIndex, - Pal::Developer::CallbackType type, - void* pCbData); + static void PAL_STDCALL PalDeveloperCallback(void* pPrivateData, const Pal::uint32 deviceIndex, + Pal::Developer::CallbackType type, void* pCbData); //! Disable copy constructor Device(const Device&); @@ -554,36 +552,37 @@ class Device : public NullDevice { //! Allocates/reallocates the scratch buffer, according to the usage bool allocScratch(uint regNum, //!< Number of the scratch registers const VirtualGPU* vgpu //!< Virtual GPU for the allocation - ); + ); //! Interop for D3D devices bool associateD3D11Device(void* d3d11Device //!< void* is of type ID3D11Device* - ); + ); bool associateD3D10Device(void* d3d10Device //!< void* is of type ID3D10Device* - ); + ); bool associateD3D9Device(void* d3d9Device //!< void* is of type IDirect3DDevice9* - ); + ); //! Interop for GL device bool glAssociate(void* GLplatformContext, void* GLdeviceContext) const; bool glDissociate(void* GLplatformContext, void* GLdeviceContext) const; - static char* platformObj_; //!< Memory allocated for PAL platform object - static Pal::IPlatform* platform_; //!< Pointer to the PAL platform object + static char* platformObj_; //!< Memory allocated for PAL platform object + static Pal::IPlatform* platform_; //!< Pointer to the PAL platform object - amd::Context* context_; //!< A dummy context for internal allocations - mutable amd::Monitor lockAsyncOps_; //!< Lock to serialise all async ops on this device + amd::Context* context_; //!< A dummy context for internal allocations + mutable amd::Monitor lockAsyncOps_; //!< Lock to serialise all async ops on this device //! Lock to serialise all async ops on initialization heap operation - mutable amd::Monitor lockForInitHeap_; - mutable amd::Monitor lockPAL_; //!< Lock to serialise PAL access - mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access - mutable amd::Monitor scratchAlloc_; //!< Lock to serialise scratch allocation - mutable amd::Monitor mapCacheOps_; //!< Lock to serialise cache for the map resources - mutable amd::Monitor lockResourceOps_; //!< Lock to serialise resource access - XferBuffers* xferRead_; //!< Transfer buffers read - std::vector* mapCache_; //!< Map cache info structure - ResourceCache* resourceCache_; //!< Resource cache - uint numComputeEngines_; //!< The number of available compute engines - std::map exclusiveComputeEnginesId_;//!< The number of available compute engines + mutable amd::Monitor lockForInitHeap_; + mutable amd::Monitor lockPAL_; //!< Lock to serialise PAL access + mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access + mutable amd::Monitor scratchAlloc_; //!< Lock to serialise scratch allocation + mutable amd::Monitor mapCacheOps_; //!< Lock to serialise cache for the map resources + mutable amd::Monitor lockResourceOps_; //!< Lock to serialise resource access + XferBuffers* xferRead_; //!< Transfer buffers read + std::vector* mapCache_; //!< Map cache info structure + ResourceCache* resourceCache_; //!< Resource cache + uint numComputeEngines_; //!< The number of available compute engines + std::map + exclusiveComputeEnginesId_; //!< The number of available compute engines uint numDmaEngines_; //!< The number of available compute engines bool heapInitComplete_; //!< Keep track of initialization status of heap resources VirtualGPU* xferQueue_; //!< Transfer queue @@ -594,10 +593,13 @@ class Device : public NullDevice { mutable bool freeCPUMem_; //!< flag to mark GPU free SVM CPU mem Pal::DeviceProperties properties_; //!< PAL device properties Pal::IDevice* device_; //!< PAL device object - mutable std::atomic allocedMem[Pal::GpuHeap::GpuHeapCount]; //!< Free memory counter - std::unordered_set* resourceList_; //!< Active resource list - RgpCaptureMgr* rgpCaptureMgr_; //!< RGP capture manager - Pal::GpuMemoryHeapProperties heaps_[Pal::GpuHeapCount]; //!< Information about heaps, returned from PAL + mutable std::atomic + allocedMem[Pal::GpuHeap::GpuHeapCount]; //!< Free memory counter + std::unordered_set* resourceList_; //!< Active resource list + RgpCaptureMgr* rgpCaptureMgr_; //!< RGP capture manager + Pal::GpuMemoryHeapProperties + heaps_[Pal::GpuHeapCount]; //!< Information about heaps, returned from PAL }; -/*@}*/} // namespace pal +/*@}*/ // namespace pal +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/paldeviced3d10.cpp b/projects/clr/rocclr/runtime/device/pal/paldeviced3d10.cpp index e7d31a9d86..202fca7ef6 100644 --- a/projects/clr/rocclr/runtime/device/pal/paldeviced3d10.cpp +++ b/projects/clr/rocclr/runtime/device/pal/paldeviced3d10.cpp @@ -3,19 +3,19 @@ #if defined(ATI_OS_LINUX) namespace pal { bool Device::associateD3D10Device(void* d3d10Device) { return false; } -} // pal +} // namespace pal #else // !ATI_OS_WIN #include /************************************************************************************************************** -* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch. -* This means OCL client spec will need to change to include headers directly from the DXX perforce -*tree. -* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change -* without notification. So it is safe to use a local copy of the relevant DXX extension interface -*classes. -**************************************************************************************************************/ + * Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch. + * This means OCL client spec will need to change to include headers directly from the DXX perforce + *tree. + * However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change + * without notification. So it is safe to use a local copy of the relevant DXX extension interface + *classes. + **************************************************************************************************************/ #include "DxxOpenCLInteropExt.h" namespace pal { @@ -127,6 +127,6 @@ bool Device::associateD3D10Device(void* d3d10Device) { return canInteroperate; } -} // pal +} // namespace pal #endif // !ATI_OS_WIN diff --git a/projects/clr/rocclr/runtime/device/pal/paldeviced3d11.cpp b/projects/clr/rocclr/runtime/device/pal/paldeviced3d11.cpp index 025b8ed9a5..00d852d80e 100644 --- a/projects/clr/rocclr/runtime/device/pal/paldeviced3d11.cpp +++ b/projects/clr/rocclr/runtime/device/pal/paldeviced3d11.cpp @@ -3,19 +3,19 @@ #if defined(ATI_OS_LINUX) namespace pal { bool Device::associateD3D11Device(void* d3d11Device) { return false; } -} +} // namespace pal #else // !ATI_OS_LINUX #include /************************************************************************************************************** -* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch. -* This means OCL client spec will need to change to include headers directly from the DXX perforce -*tree. -* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change -* without notification. So it is safe to use a local copy of the relevant DXX extension interface -*classes. -**************************************************************************************************************/ + * Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch. + * This means OCL client spec will need to change to include headers directly from the DXX perforce + *tree. + * However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change + * without notification. So it is safe to use a local copy of the relevant DXX extension interface + *classes. + **************************************************************************************************************/ #include "DxxOpenCLInteropExt.h" namespace pal { @@ -128,6 +128,6 @@ bool Device::associateD3D11Device(void* d3d11Device) { return canInteroperate; } -} // pal +} // namespace pal #endif // !ATI_OS_LINUX diff --git a/projects/clr/rocclr/runtime/device/pal/paldeviced3d9.cpp b/projects/clr/rocclr/runtime/device/pal/paldeviced3d9.cpp index a589d2abcf..cf2ee5303c 100644 --- a/projects/clr/rocclr/runtime/device/pal/paldeviced3d9.cpp +++ b/projects/clr/rocclr/runtime/device/pal/paldeviced3d9.cpp @@ -3,20 +3,20 @@ #if defined(ATI_OS_LINUX) namespace pal { bool Device::associateD3D9Device(void* d3dDevice) { return false; } -} +} // namespace pal #else // !ATI_OS_LINUX #include #include /************************************************************************************************************** -* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch. -* This means OCL client spec will need to change to include headers directly from the DXX perforce -*tree. -* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change -* without notification. So it is safe to use a local copy of the relevant DXX extension interface -*classes. -**************************************************************************************************************/ + * Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch. + * This means OCL client spec will need to change to include headers directly from the DXX perforce + *tree. + * However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change + * without notification. So it is safe to use a local copy of the relevant DXX extension interface + *classes. + **************************************************************************************************************/ #include "DxxOpenCLInteropExt.h" namespace pal { @@ -44,5 +44,5 @@ bool Device::associateD3D9Device(void* d3d9Device) { return canInteroperate; } -} // pal +} // namespace pal #endif // !ATI_OS_WIN diff --git a/projects/clr/rocclr/runtime/device/pal/paldevicegl.cpp b/projects/clr/rocclr/runtime/device/pal/paldevicegl.cpp index ac209191ca..1d8e9df9e7 100644 --- a/projects/clr/rocclr/runtime/device/pal/paldevicegl.cpp +++ b/projects/clr/rocclr/runtime/device/pal/paldevicegl.cpp @@ -45,8 +45,8 @@ typedef struct _mesa_glinterop_device_info { #ifdef ATI_OS_LINUX typedef void* (*PFNGlxGetProcAddress)(const GLubyte* procName); static PFNGlxGetProcAddress pfnGlxGetProcAddress = nullptr; -typedef int(APIENTRYP PFNMesaGLInteropGLXQueryDeviceInfo)( - Display* dpy, GLXContext context, mesa_glinterop_device_info* out); +typedef int(APIENTRYP PFNMesaGLInteropGLXQueryDeviceInfo)(Display* dpy, GLXContext context, + mesa_glinterop_device_info* out); static PFNMesaGLInteropGLXQueryDeviceInfo pfnMesaGLInteropGLXQueryDeviceInfo = nullptr; static PFNGLXBEGINCLINTEROPAMD glXBeginCLInteropAMD = nullptr; static PFNGLXENDCLINTEROPAMD glXEndCLInteropAMD = nullptr; @@ -68,480 +68,579 @@ static PFNWGLGETCONTEXTGPUINFOAMD wglGetContextGPUInfoAMD = nullptr; namespace pal { // -/// GSL Surface Formats as per defined in cmSurfFmtEnum enum in //depot/stg/ugl/drivers/ugl/src/include/cm_enum.h +/// GSL Surface Formats as per defined in cmSurfFmtEnum enum in +/// //depot/stg/ugl/drivers/ugl/src/include/cm_enum.h // typedef enum cmSurfFmtEnum { - CM_SURF_FMT_NOOVERRIDE = -1, - CM_SURF_FMT_LUMINANCE8, ///< Luminance, 8 bits per element packed as (@c LLLLLLLL) - CM_SURF_FMT_LUMINANCE16, ///< Luminance, 16 bits per element packed as (@c LLLLLLLLLLLLLLLL) - CM_SURF_FMT_LUMINANCE16F, ///< Luminance, 16 bits per element packed as (@c LLLLLLLLLLLLLLLL) - CM_SURF_FMT_LUMINANCE32F, ///< Luminance, 32 bits per element packed as (@c LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL) - CM_SURF_FMT_INTENSITY8, ///< Intensity, 8 bits per element packed as (@c IIIIIIII) - CM_SURF_FMT_INTENSITY16, ///< Intensity, 16 bits per element packed as (@c IIIIIIIIIIIIIIII) - CM_SURF_FMT_INTENSITY16F, ///< Intensity, 16 bits per element packed as (@c IIIIIIIIIIIIIIII) - CM_SURF_FMT_INTENSITY32F, ///< Intensity, 32 bits per element packed as (@c IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII) - CM_SURF_FMT_ALPHA8, ///< Alpha, 8 bits per element packed as (@c AAAAAAAA) - CM_SURF_FMT_ALPHA16, ///< Alpha, 16 bits per element packed as (@c AAAAAAAAAAAAAAAA) - CM_SURF_FMT_ALPHA16F, ///< Alpha, 16 bits per element packed as (@c AAAAAAAAAAAAAAAA) - CM_SURF_FMT_ALPHA32F, ///< Alpha, 32 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA) - CM_SURF_FMT_LUMINANCE8_ALPHA8, ///< Luminance Alpha, 16 bits per element packed as (@c AAAAAAAALLLLLLLL) - CM_SURF_FMT_LUMINANCE16_ALPHA16, ///< Luminance Alpha, 32 bits per element packed as (@c AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL) - CM_SURF_FMT_LUMINANCE16F_ALPHA16F, ///< Luminance Alpha, 32 bits per element packed as (@c AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL) - CM_SURF_FMT_LUMINANCE32F_ALPHA32F, ///< Luminance Alpha, 64 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL) - CM_SURF_FMT_B2_G3_R3, ///< RGB, 8 bits per element packed as (@c RRRGGGBB) - CM_SURF_FMT_B5_G6_R5, ///< RGB, 16 bits per element packed as (@c RRRRRGGGGGGBBBBB) - CM_SURF_FMT_BGRX4, ///< RGB, 16 bits per element packed as (@c XXXXRRRRGGGGBBBB) - CM_SURF_FMT_BGR5_X1, ///< RGB, 16 bits per element packed as (@c XRRRRRGGGGGBBBBB) - CM_SURF_FMT_BGRX8, ///< RGB, 32 bits per element packed as (@c XXXXXXXXRRRRRRRRGGGGGGGGBBBBBBBB) - XXX unused by current driver - CM_SURF_FMT_BGR10_X2, ///< RGB, 32 bits per element packed as (@c XXRRRRRRRRRRGGGGGGGGGGBBBBBBBBBB) - CM_SURF_FMT_BGRX16, ///< RGB, 64 bits per element packed as (@c XXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBB) - CM_SURF_FMT_BGRX16F, ///< RGB, 64 bits per element packed as (@c XXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBB) - CM_SURF_FMT_BGRX32F, ///< RGB, 128 bits per element packed as (@c XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB) - CM_SURF_FMT_RGBX4, ///< RGB, 16 bits per element packed as (@c XXXXBBBBGGGGRRRR) - CM_SURF_FMT_RGB5_X1, ///< RGB, 16 bits per element packed as (@c XBBBBBGGGGGRRRRR) - CM_SURF_FMT_RGBX8, ///< RGB, 32 bits per element packed as (@c XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR) - CM_SURF_FMT_RGB10_X2, ///< RGB, 32 bits per element packed as (@c XXBBBBBBBBBBGGGGGGGGGGRRRRRRRRRR) - CM_SURF_FMT_RGBX16, ///< RGB, 64 bits per element packed as (@c XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) - CM_SURF_FMT_RGBX16F, ///< RGB, 64 bits per element packed as (@c XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) - CM_SURF_FMT_RGBX32F, ///< RGB, 128 bits per element packed as (@c XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR) - CM_SURF_FMT_BGRA4, ///< RGBA, 16 bits per element packed as (@c AAAARRRRGGGGBBBB) - CM_SURF_FMT_BGR5_A1, ///< RGBA, 16 bits per element packed as (@c ARRRRRGGGGGBBBBB) - CM_SURF_FMT_BGRA8, ///< RGBA, 32 bits per element packed as (@c AAAAAAAARRRRRRRRGGGGGGGGBBBBBBBB) - CM_SURF_FMT_BGR10_A2, ///< RGBA, 32 bits per element packed as (@c AARRRRRRRRRRGGGGGGGGGGBBBBBBBBBB) - CM_SURF_FMT_BGRA16, ///< RGBA, 64 bits per element packed as (@c AAAAAAAAAAAAAAAARRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBB) - CM_SURF_FMT_BGRA16F, ///< RGBA, 64 bits per element packed as (@c AAAAAAAAAAAAAAAARRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBB) - CM_SURF_FMT_BGRA32F, ///< RGBA, 128 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAARRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB) - CM_SURF_FMT_RGBA4, ///< RGBA, 16 bits per element packed as (@c AAAABBBBGGGGRRRR) - CM_SURF_FMT_RGB5_A1, ///< RGBA, 16 bits per element packed as (@c ABBBBBGGGGGRRRRR) - CM_SURF_FMT_RGBA8, ///< RGBA, 32 bits per element packed as (@c AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR) - CM_SURF_FMT_RGB10_A2, ///< RGBA, 32 bits per element packed as (@c AABBBBBBBBBBGGGGGGGGGGRRRRRRRRRR) - CM_SURF_FMT_RGBA16, ///< RGBA, 64 bits per element packed as (@c AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) - CM_SURF_FMT_RGBA16F, ///< RGBA, 64 bits per element packed as (@c AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) - CM_SURF_FMT_RGBA32I, ///< RGBA, 128 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR) - CM_SURF_FMT_RGBA32F, ///< RGBA, 128 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR) - CM_SURF_FMT_DUDV8, ///< DUDV 16 bits per element packed as (@c VVVVVVVVUUUUUUUU) - CM_SURF_FMT_DXT1, ///< compressed, DXT1 - CM_SURF_FMT_DXT2_3, ///< compressed, DXT2_3 - CM_SURF_FMT_DXT4_5, ///< compressed, DXT4_5 - CM_SURF_FMT_ATI1N, ///< compressed, 1 component - CM_SURF_FMT_ATI2N, ///< compressed, 2 component - CM_SURF_FMT_DEPTH16, ///< depth, 16 bits per element packed as (@c DDDDDDDDDDDDDDDD) - CM_SURF_FMT_DEPTH16F, ///< depth, 16 bits per element packed as (@c DDDDDDDDDDDDDDDD) - CM_SURF_FMT_DEPTH24_X8, ///< depth, 32 bits per element packed as (@c XXXXXXXXDDDDDDDDDDDDDDDDDDDDDDDD) - CM_SURF_FMT_DEPTH24F_X8, ///< depth, 32 bits per element packed as (@c SSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDD) - CM_SURF_FMT_DEPTH24_STEN8, ///< depth + stencil, 32 bits per element packed as (@c SSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDD) - CM_SURF_FMT_DEPTH24F_STEN8, ///< depth + stencil, 32 bits per element packed as (@c SSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDD) - CM_SURF_FMT_DEPTH32F_X24_STEN8, ///< depth + stencil, 64 bits per element packed as (@c XXXXXXXXXXXXXXXXXXXXXXXXSSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD) - CM_SURF_FMT_DEPTH32F, ///< depth, 32 bits per element packed as (@c DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD) - CM_SURF_FMT_sR11_sG11_sB10, ///< RGB, 32 bits per element packed as (@c RRRRRRRRRRRGGGGGGGGGGGBBBBBBBBBB) - CM_SURF_FMT_sU16, ///< - CM_SURF_FMT_sUV16, ///< - CM_SURF_FMT_sUVWQ16, ///< - CM_SURF_FMT_RG16, ///< RG, 32 bits per element packed as (@c RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG) - CM_SURF_FMT_RG16F, ///< RG, 32 bits per element packed as (@c RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG) - CM_SURF_FMT_RG32F, ///< RG, 64 bits per element packed as (@c RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG) - CM_SURF_FMT_ABGR4, ///< RGBA, 16 bits per element packed as (@c RRRRGGGGBBBBAAAA) - CM_SURF_FMT_A1_BGR5, ///< RGBA, 16 bits per element packed as (@c RRRRRGGGGGBBBBBA) - CM_SURF_FMT_ABGR8, ///< RGBA, 32 bits per element packed as (@c RRRRRRRRGGGGGGGGBBBBBBBBAAAAAAAA) - CM_SURF_FMT_A2_BGR10, ///< RGBA, 32 bits per element packed as (@c RRRRRRRRRRGGGGGGGGGGBBBBBBBBBBAA) - CM_SURF_FMT_ABGR16, ///< RGBA, 64 bits per element packed as (@c RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBAAAAAAAAAAAAAAAA) - CM_SURF_FMT_ABGR16F, ///< RGBA, 64 bits per element packed as (@c RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBAAAAAAAAAAAAAAAA) - CM_SURF_FMT_ABGR32F, ///< RGBA, 128 bits per element packed as (@c RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA) - CM_SURF_FMT_DXT1A, - CM_SURF_FMT_sRGB10_A2, ///< RGBA, 32 bits per element packed as signed (@c AABBBBBBBBBBGGGGGGGGGGRRRRRRRRRR) - CM_SURF_FMT_sR8, ///< R, 8 bits per element packed as signed (@c RRRRRRRR) - CM_SURF_FMT_sRG8, ///< RG, 16 bits per element packed as signed (@c RRRRRRRRGGGGGGGG) - CM_SURF_FMT_sR32I, ///< R, 32 bits per element packed as signed (@c RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR) - CM_SURF_FMT_sRG32I, ///< RG, 64 bits per element packed as signed (@c RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG) - CM_SURF_FMT_sRGBA32I, ///< RGBA, 128 bits per element packed as signed (@c RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA) - CM_SURF_FMT_R32I, ///< R, 32 bits per element packed as (@c RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR) - CM_SURF_FMT_RG32I, ///< RG, 64 bits per element packed as (@c RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG) - CM_SURF_FMT_RG8, ///< RG8, 16 bits per element packed as (@c RRRRRRRRGGGGGGGG) - CM_SURF_FMT_sRGBA8, ///< RGBA8, 32 bits per element packed as signed (@c RRRRRRRRGGGGGGGGBBBBBBBBAAAAAAAA) - CM_SURF_FMT_R11F_G11F_B10F, ///< RGB, 32 bits per element packed as (@c BBBBBBBBBBGGGGGGGGGGGRRRRRRRRRRR) - CM_SURF_FMT_RGB9_E5, ///< RGB, 32 bits per element packed as (@c EEEEEBBBBBBBBBGGGGGGGGGRRRRRRRRR) - CM_SURF_FMT_LUMINANCE_LATC1, ///< compressed LATC1 - CM_SURF_FMT_SIGNED_LUMINANCE_LATC1, ///< compressed signed LATC1 - CM_SURF_FMT_LUMINANCE_ALPHA_LATC2, ///< compressed LATC2 - CM_SURF_FMT_SIGNED_LUMINANCE_ALPHA_LATC2, ///< compressed signed LATC2 - CM_SURF_FMT_RED_RGTC1, ///< compressed RGTC1 - CM_SURF_FMT_SIGNED_RED_RGTC1, ///< compressed signed RGTC1 - CM_SURF_FMT_RED_GREEN_RGTC2, ///< compressed RGTC2 - CM_SURF_FMT_SIGNED_RED_GREEN_RGTC2, ///< compressed signed RGTC2 - CM_SURF_FMT_R8, ///< R, 8 bits per element packed (@c RRRRRRRR) - CM_SURF_FMT_R16, ///< R, 16 bits per element packed (@c RRRRRRRRRRRRRRRR) - CM_SURF_FMT_R16F, ///< R, 16 bits per element packed (@c RRRRRRRRRRRRRRRR) - CM_SURF_FMT_R32F, ///< R, 32 bits per element packed (@c RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR) - CM_SURF_FMT_R8I, ///< R, 8 bits per element packed (@c RRRRRRRR) - CM_SURF_FMT_sR8I, ///< R, 8 bits per element packed as signed (@c RRRRRRRR) - CM_SURF_FMT_RG8I, ///< RG, 16 bits per element packed (@c RRRRRRRRGGGGGGGG) - CM_SURF_FMT_sRG8I, ///< RG, 16 bits per element packed as signed (@c RRRRRRRRGGGGGGGG) - CM_SURF_FMT_R16I, ///< R, 16 bits per element packed (@c RRRRRRRRRRRRRRRR) - CM_SURF_FMT_sR16I, ///< R, 16 bits per element packed as signed (@c RRRRRRRRRRRRRRRR) - CM_SURF_FMT_RG16I, ///< RG, 32 bits per element packed (@c RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG) - CM_SURF_FMT_sRG16I, ///< RG, 32 bits per element packed as signed (@c RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG) - CM_SURF_FMT_RGBA32UI, ///< RGBA, 128 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAARRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB) - CM_SURF_FMT_RGBX32UI, ///< RGBX, 128 bits per element packed as(@c XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB) - CM_SURF_FMT_ALPHA32UI, ///< Alpha, 32 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA) - CM_SURF_FMT_INTENSITY32UI, ///< Intensity, 32 bits per element packed as (@c IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII) - CM_SURF_FMT_LUMINANCE32UI, ///< Luminance, 32 bits per element packed as (@c LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL) - CM_SURF_FMT_LUMINANCE_ALPHA32UI, ///< Luminance Alpha, 64 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL) - CM_SURF_FMT_RGBA16UI, ///< RGBA, 64 bits per element packed as (@c AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) - CM_SURF_FMT_RGBX16UI, ///< RGB, 64 bits per element packed as (@c XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) - CM_SURF_FMT_ALPHA16UI, ///< Alpha, 16 bits per element packed as (@c AAAAAAAAAAAAAAAA) - CM_SURF_FMT_INTENSITY16UI, ///< Intensity, 16 bits per element packed as (@c IIIIIIIIIIIIIIII) - CM_SURF_FMT_LUMINANCE16UI, ///< Luminance, 16 bits per element packed as (@c LLLLLLLLLLLLLLLL) - CM_SURF_FMT_LUMINANCE_ALPHA16UI, ///< Luminance Alpha, 32 bits per element packed as (@c AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL) - CM_SURF_FMT_RGBA8UI, ///< RGBA, 32 bits per element packed as (@c AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR) - CM_SURF_FMT_RGBX8UI, ///< RGB, 32 bits per element packed as (@c XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR) - CM_SURF_FMT_ALPHA8UI, ///< Alpha, 8 bits per element packed as (@c AAAAAAAA) - CM_SURF_FMT_INTENSITY8UI, ///< Intensity, 8 bits per element packed as (@c IIIIIIII) - CM_SURF_FMT_LUMINANCE8UI, ///< Luminance, 8 bits per element packed as (@c LLLLLLLL) - CM_SURF_FMT_LUMINANCE_ALPHA8UI, ///< Luminance Alpha, 32 bits per element packed as (@c AAAAAAAALLLLLLLL) - CM_SURF_FMT_sRGBX32I, ///< RGBX, 128 bits per element packed as(@c XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB) - CM_SURF_FMT_sALPHA32I, ///< Alpha, 32 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA) - CM_SURF_FMT_sINTENSITY32I, ///< Intensity, 32 bits per element packed as (@c IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII) - CM_SURF_FMT_sLUMINANCE32I, ///< Luminance, 32 bits per element packed as (@c LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL) - CM_SURF_FMT_sLUMINANCE_ALPHA32I, ///< Luminance Alpha, 64 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL) - CM_SURF_FMT_sRGBA16I, ///< RGBA, 64 bits per element packed as (@c AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) - CM_SURF_FMT_sRGBX16I, ///< RGB, 64 bits per element packed as (@c XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) - CM_SURF_FMT_sALPHA16I, ///< Alpha, 16 bits per element packed as (@c AAAAAAAAAAAAAAAA) - CM_SURF_FMT_sINTENSITY16I, ///< Intensity, 16 bits per element packed as (@c IIIIIIIIIIIIIIII) - CM_SURF_FMT_sLUMINANCE16I, ///< Luminance, 16 bits per element packed as (@c LLLLLLLLLLLLLLLL) - CM_SURF_FMT_sLUMINANCE_ALPHA16I, ///< Luminance Alpha, 32 bits per element packed as (@c AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL) - CM_SURF_FMT_sRGBA8I, ///< RGBA, 32 bits per element packed as (@c AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR) - CM_SURF_FMT_sRGBX8I, ///< RGB, 32 bits per element packed as (@c XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR) - CM_SURF_FMT_sALPHA8I, ///< Alpha, 8 bits per element packed as (@c AAAAAAAA) - CM_SURF_FMT_sINTENSITY8I, ///< Intensity, 8 bits per element packed as (@c IIIIIIII) - CM_SURF_FMT_sLUMINANCE8I, ///< Luminance, 8 bits per element packed as (@c LLLLLLLL) - CM_SURF_FMT_sLUMINANCE_ALPHA8I, ///< Alpha, 8 bits per element packed as (@c AAAAAAAA) - CM_SURF_FMT_sDXT6, ///< compressed, CM_SURF_FMT_sDXT6 - CM_SURF_FMT_DXT6, ///< compressed, CM_SURF_FMT_DXT6 - CM_SURF_FMT_DXT7, ///< compressed, DXT7 - CM_SURF_FMT_LUMINANCE8_SNORM, ///< Luminance, 8 bits per element packed as signed (@c LLLLLLLL) - CM_SURF_FMT_LUMINANCE16_SNORM, ///< Luminance, 16 bits per element packed as signed (@c LLLLLLLLLLLLLLLL) - CM_SURF_FMT_INTENSITY8_SNORM, ///< Intensity, 8 bits per element packed as signed (@c IIIIIIII) - CM_SURF_FMT_INTENSITY16_SNORM, ///< Intensity, 16 bits per element packed as signed (@c IIIIIIIIIIIIIIII) - CM_SURF_FMT_ALPHA8_SNORM, ///< Alpha, 8 bits per element packed as signed (@c AAAAAAAA) - CM_SURF_FMT_ALPHA16_SNORM, ///< Alpha, 16 bits per element packed as signed (@c AAAAAAAAAAAAAAAA) - CM_SURF_FMT_LUMINANCE_ALPHA8_SNORM, ///< Luminance Alpha, 16 bits per element packed as signed (@c AAAAAAAALLLLLLLL) - CM_SURF_FMT_LUMINANCE_ALPHA16_SNORM, ///< Luminance Alpha, 32 bits per element packed as signed (@c AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL) - CM_SURF_FMT_R8_SNORM, ///< R, 8 bits per element packed as signed (@c RRRRRRRR) - CM_SURF_FMT_R16_SNORM, ///< R, 16 bits per element packed as signed (@c RRRRRRRRRRRRRRRR) - CM_SURF_FMT_RG8_SNORM, ///< RG8, 16 bits per element packed as signed (@c RRRRRRRRGGGGGGGG) - CM_SURF_FMT_RG16_SNORM, ///< RG, 32 bits per element packed as signed (@c RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG) - CM_SURF_FMT_RGBX8_SNORM, ///< RGB, 32 bits per element packed as signed (@c XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR) - CM_SURF_FMT_RGBX16_SNORM, ///< RGB, 64 bits per element packed as signed (@c XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) - CM_SURF_FMT_RGBA8_SNORM, ///< RGBA, 32 bits per element packed as signed (@c AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR) - CM_SURF_FMT_RGBA16_SNORM, ///< RGBA, 64 bits per element packed as signed (@c AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) - CM_SURF_FMT_RGB10_A2UI, ///< RGBA, 32 bits per element packed as (@c AABBBBBBBBBBGGGGGGGGGGRRRRRRRRRR) - CM_SURF_FMT_RGB32F, ///< RGB, float, 96 bits per element packed as (@c BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR) - CM_SURF_FMT_RGB32I, ///< RGB, unnormalized int, 96 bits per element packed as (@c BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR) - CM_SURF_FMT_RGB32UI, ///< RGB, unnormalized uint, 96 bits per element packed as (@c BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR) - CM_SURF_FMT_RGBX8_SRGB, ///< RGB, 32 bits per element packed as (@c XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR) - CM_SURF_FMT_RGBA8_SRGB, ///< RGBA, 32 bits per element packed as (@c AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR) - CM_SURF_FMT_DXT1_SRGB, ///< compressed, DXT1 - CM_SURF_FMT_DXT1A_SRGB, ///< - CM_SURF_FMT_DXT2_3_SRGB, ///< compressed, DXT2_3 - CM_SURF_FMT_DXT4_5_SRGB, ///< compressed, DXT4_5 - CM_SURF_FMT_DXT7_SRGB, ///< compressed, DXT7 - CM_SURF_FMT_RGB8_ETC2, ///< ETC2 compressed, RGB8 in 64 bits - CM_SURF_FMT_SRGB8_ETC2, ///< ETC2 compressed, SRGB8 in 64 bits - CM_SURF_FMT_RGB8_PT_ALPHA1_ETC2, ///< ETC2 compressed, RGB8 in 64 bits - CM_SURF_FMT_SRGB8_PT_ALPHA1_ETC2, ///< ETC2 compressed, sRGB8A1 in 64 bits - CM_SURF_FMT_RGBA8_ETC2_EAC, ///< ETC2 compressed, RGBA8 in 128 bits - CM_SURF_FMT_SRGB8_ALPHA8_ETC2_EAC, ///< ETC2 compressed, sRGBA8 in 128 bits - CM_SURF_FMT_R11_EAC, ///< EAC compressed, R11 in 64 bits - CM_SURF_FMT_SIGNED_R11_EAC, ///< EAC compressed, signed R11 in 64 bits - CM_SURF_FMT_RG11_EAC, ///< EAC compressed, RG11 in 128 bits - CM_SURF_FMT_SIGNED_RG11_EAC, ///< EAC compressed, signed RG11 in 128 bits + CM_SURF_FMT_NOOVERRIDE = -1, + CM_SURF_FMT_LUMINANCE8, ///< Luminance, 8 bits per element packed as (@c LLLLLLLL) + CM_SURF_FMT_LUMINANCE16, ///< Luminance, 16 bits per element packed as (@c LLLLLLLLLLLLLLLL) + CM_SURF_FMT_LUMINANCE16F, ///< Luminance, 16 bits per element packed as (@c LLLLLLLLLLLLLLLL) + CM_SURF_FMT_LUMINANCE32F, ///< Luminance, 32 bits per element packed as (@c + ///< LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL) + CM_SURF_FMT_INTENSITY8, ///< Intensity, 8 bits per element packed as (@c IIIIIIII) + CM_SURF_FMT_INTENSITY16, ///< Intensity, 16 bits per element packed as (@c IIIIIIIIIIIIIIII) + CM_SURF_FMT_INTENSITY16F, ///< Intensity, 16 bits per element packed as (@c IIIIIIIIIIIIIIII) + CM_SURF_FMT_INTENSITY32F, ///< Intensity, 32 bits per element packed as (@c + ///< IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII) + CM_SURF_FMT_ALPHA8, ///< Alpha, 8 bits per element packed as (@c AAAAAAAA) + CM_SURF_FMT_ALPHA16, ///< Alpha, 16 bits per element packed as (@c AAAAAAAAAAAAAAAA) + CM_SURF_FMT_ALPHA16F, ///< Alpha, 16 bits per element packed as (@c AAAAAAAAAAAAAAAA) + CM_SURF_FMT_ALPHA32F, ///< Alpha, 32 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA) + CM_SURF_FMT_LUMINANCE8_ALPHA8, ///< Luminance Alpha, 16 bits per element packed as (@c + ///< AAAAAAAALLLLLLLL) + CM_SURF_FMT_LUMINANCE16_ALPHA16, ///< Luminance Alpha, 32 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL) + CM_SURF_FMT_LUMINANCE16F_ALPHA16F, ///< Luminance Alpha, 32 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL) + CM_SURF_FMT_LUMINANCE32F_ALPHA32F, ///< Luminance Alpha, 64 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL) + CM_SURF_FMT_B2_G3_R3, ///< RGB, 8 bits per element packed as (@c RRRGGGBB) + CM_SURF_FMT_B5_G6_R5, ///< RGB, 16 bits per element packed as (@c RRRRRGGGGGGBBBBB) + CM_SURF_FMT_BGRX4, ///< RGB, 16 bits per element packed as (@c XXXXRRRRGGGGBBBB) + CM_SURF_FMT_BGR5_X1, ///< RGB, 16 bits per element packed as (@c XRRRRRGGGGGBBBBB) + CM_SURF_FMT_BGRX8, ///< RGB, 32 bits per element packed as (@c + ///< XXXXXXXXRRRRRRRRGGGGGGGGBBBBBBBB) - XXX unused by current driver + CM_SURF_FMT_BGR10_X2, ///< RGB, 32 bits per element packed as (@c + ///< XXRRRRRRRRRRGGGGGGGGGGBBBBBBBBBB) + CM_SURF_FMT_BGRX16, ///< RGB, 64 bits per element packed as (@c + ///< XXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBB) + CM_SURF_FMT_BGRX16F, ///< RGB, 64 bits per element packed as (@c + ///< XXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBB) + CM_SURF_FMT_BGRX32F, ///< RGB, 128 bits per element packed as (@c + ///< XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB) + CM_SURF_FMT_RGBX4, ///< RGB, 16 bits per element packed as (@c XXXXBBBBGGGGRRRR) + CM_SURF_FMT_RGB5_X1, ///< RGB, 16 bits per element packed as (@c XBBBBBGGGGGRRRRR) + CM_SURF_FMT_RGBX8, ///< RGB, 32 bits per element packed as (@c + ///< XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR) + CM_SURF_FMT_RGB10_X2, ///< RGB, 32 bits per element packed as (@c + ///< XXBBBBBBBBBBGGGGGGGGGGRRRRRRRRRR) + CM_SURF_FMT_RGBX16, ///< RGB, 64 bits per element packed as (@c + ///< XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) + CM_SURF_FMT_RGBX16F, ///< RGB, 64 bits per element packed as (@c + ///< XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) + CM_SURF_FMT_RGBX32F, ///< RGB, 128 bits per element packed as (@c + ///< XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR) + CM_SURF_FMT_BGRA4, ///< RGBA, 16 bits per element packed as (@c AAAARRRRGGGGBBBB) + CM_SURF_FMT_BGR5_A1, ///< RGBA, 16 bits per element packed as (@c ARRRRRGGGGGBBBBB) + CM_SURF_FMT_BGRA8, ///< RGBA, 32 bits per element packed as (@c + ///< AAAAAAAARRRRRRRRGGGGGGGGBBBBBBBB) + CM_SURF_FMT_BGR10_A2, ///< RGBA, 32 bits per element packed as (@c + ///< AARRRRRRRRRRGGGGGGGGGGBBBBBBBBBB) + CM_SURF_FMT_BGRA16, ///< RGBA, 64 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAARRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBB) + CM_SURF_FMT_BGRA16F, ///< RGBA, 64 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAARRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBB) + CM_SURF_FMT_BGRA32F, ///< RGBA, 128 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAARRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB) + CM_SURF_FMT_RGBA4, ///< RGBA, 16 bits per element packed as (@c AAAABBBBGGGGRRRR) + CM_SURF_FMT_RGB5_A1, ///< RGBA, 16 bits per element packed as (@c ABBBBBGGGGGRRRRR) + CM_SURF_FMT_RGBA8, ///< RGBA, 32 bits per element packed as (@c + ///< AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR) + CM_SURF_FMT_RGB10_A2, ///< RGBA, 32 bits per element packed as (@c + ///< AABBBBBBBBBBGGGGGGGGGGRRRRRRRRRR) + CM_SURF_FMT_RGBA16, ///< RGBA, 64 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) + CM_SURF_FMT_RGBA16F, ///< RGBA, 64 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) + CM_SURF_FMT_RGBA32I, ///< RGBA, 128 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR) + CM_SURF_FMT_RGBA32F, ///< RGBA, 128 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR) + CM_SURF_FMT_DUDV8, ///< DUDV 16 bits per element packed as (@c VVVVVVVVUUUUUUUU) + CM_SURF_FMT_DXT1, ///< compressed, DXT1 + CM_SURF_FMT_DXT2_3, ///< compressed, DXT2_3 + CM_SURF_FMT_DXT4_5, ///< compressed, DXT4_5 + CM_SURF_FMT_ATI1N, ///< compressed, 1 component + CM_SURF_FMT_ATI2N, ///< compressed, 2 component + CM_SURF_FMT_DEPTH16, ///< depth, 16 bits per element packed as (@c DDDDDDDDDDDDDDDD) + CM_SURF_FMT_DEPTH16F, ///< depth, 16 bits per element packed as (@c DDDDDDDDDDDDDDDD) + CM_SURF_FMT_DEPTH24_X8, ///< depth, 32 bits per element packed as (@c + ///< XXXXXXXXDDDDDDDDDDDDDDDDDDDDDDDD) + CM_SURF_FMT_DEPTH24F_X8, ///< depth, 32 bits per element packed as (@c + ///< SSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDD) + CM_SURF_FMT_DEPTH24_STEN8, ///< depth + stencil, 32 bits per element packed as (@c + ///< SSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDD) + CM_SURF_FMT_DEPTH24F_STEN8, ///< depth + stencil, 32 bits per element packed as (@c + ///< SSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDD) + CM_SURF_FMT_DEPTH32F_X24_STEN8, ///< depth + stencil, 64 bits per element packed as (@c + ///< XXXXXXXXXXXXXXXXXXXXXXXXSSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD) + CM_SURF_FMT_DEPTH32F, ///< depth, 32 bits per element packed as (@c + ///< DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD) + CM_SURF_FMT_sR11_sG11_sB10, ///< RGB, 32 bits per element packed as (@c + ///< RRRRRRRRRRRGGGGGGGGGGGBBBBBBBBBB) + CM_SURF_FMT_sU16, ///< + CM_SURF_FMT_sUV16, ///< + CM_SURF_FMT_sUVWQ16, ///< + CM_SURF_FMT_RG16, ///< RG, 32 bits per element packed as (@c RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG) + CM_SURF_FMT_RG16F, ///< RG, 32 bits per element packed as (@c + ///< RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG) + CM_SURF_FMT_RG32F, ///< RG, 64 bits per element packed as (@c + ///< RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG) + CM_SURF_FMT_ABGR4, ///< RGBA, 16 bits per element packed as (@c RRRRGGGGBBBBAAAA) + CM_SURF_FMT_A1_BGR5, ///< RGBA, 16 bits per element packed as (@c RRRRRGGGGGBBBBBA) + CM_SURF_FMT_ABGR8, ///< RGBA, 32 bits per element packed as (@c + ///< RRRRRRRRGGGGGGGGBBBBBBBBAAAAAAAA) + CM_SURF_FMT_A2_BGR10, ///< RGBA, 32 bits per element packed as (@c + ///< RRRRRRRRRRGGGGGGGGGGBBBBBBBBBBAA) + CM_SURF_FMT_ABGR16, ///< RGBA, 64 bits per element packed as (@c + ///< RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBAAAAAAAAAAAAAAAA) + CM_SURF_FMT_ABGR16F, ///< RGBA, 64 bits per element packed as (@c + ///< RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBAAAAAAAAAAAAAAAA) + CM_SURF_FMT_ABGR32F, ///< RGBA, 128 bits per element packed as (@c + ///< RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA) + CM_SURF_FMT_DXT1A, + CM_SURF_FMT_sRGB10_A2, ///< RGBA, 32 bits per element packed as signed (@c + ///< AABBBBBBBBBBGGGGGGGGGGRRRRRRRRRR) + CM_SURF_FMT_sR8, ///< R, 8 bits per element packed as signed (@c RRRRRRRR) + CM_SURF_FMT_sRG8, ///< RG, 16 bits per element packed as signed (@c RRRRRRRRGGGGGGGG) + CM_SURF_FMT_sR32I, ///< R, 32 bits per element packed as signed (@c + ///< RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR) + CM_SURF_FMT_sRG32I, ///< RG, 64 bits per element packed as signed (@c + ///< RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG) + CM_SURF_FMT_sRGBA32I, ///< RGBA, 128 bits per element packed as signed (@c + ///< RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA) + CM_SURF_FMT_R32I, ///< R, 32 bits per element packed as (@c + ///< RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR) + CM_SURF_FMT_RG32I, ///< RG, 64 bits per element packed as (@c + ///< RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG) + CM_SURF_FMT_RG8, ///< RG8, 16 bits per element packed as (@c RRRRRRRRGGGGGGGG) + CM_SURF_FMT_sRGBA8, ///< RGBA8, 32 bits per element packed as signed (@c + ///< RRRRRRRRGGGGGGGGBBBBBBBBAAAAAAAA) + CM_SURF_FMT_R11F_G11F_B10F, ///< RGB, 32 bits per element packed as (@c + ///< BBBBBBBBBBGGGGGGGGGGGRRRRRRRRRRR) + CM_SURF_FMT_RGB9_E5, ///< RGB, 32 bits per element packed as (@c + ///< EEEEEBBBBBBBBBGGGGGGGGGRRRRRRRRR) + CM_SURF_FMT_LUMINANCE_LATC1, ///< compressed LATC1 + CM_SURF_FMT_SIGNED_LUMINANCE_LATC1, ///< compressed signed LATC1 + CM_SURF_FMT_LUMINANCE_ALPHA_LATC2, ///< compressed LATC2 + CM_SURF_FMT_SIGNED_LUMINANCE_ALPHA_LATC2, ///< compressed signed LATC2 + CM_SURF_FMT_RED_RGTC1, ///< compressed RGTC1 + CM_SURF_FMT_SIGNED_RED_RGTC1, ///< compressed signed RGTC1 + CM_SURF_FMT_RED_GREEN_RGTC2, ///< compressed RGTC2 + CM_SURF_FMT_SIGNED_RED_GREEN_RGTC2, ///< compressed signed RGTC2 + CM_SURF_FMT_R8, ///< R, 8 bits per element packed (@c RRRRRRRR) + CM_SURF_FMT_R16, ///< R, 16 bits per element packed (@c RRRRRRRRRRRRRRRR) + CM_SURF_FMT_R16F, ///< R, 16 bits per element packed (@c RRRRRRRRRRRRRRRR) + CM_SURF_FMT_R32F, ///< R, 32 bits per element packed (@c RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR) + CM_SURF_FMT_R8I, ///< R, 8 bits per element packed (@c RRRRRRRR) + CM_SURF_FMT_sR8I, ///< R, 8 bits per element packed as signed (@c RRRRRRRR) + CM_SURF_FMT_RG8I, ///< RG, 16 bits per element packed (@c RRRRRRRRGGGGGGGG) + CM_SURF_FMT_sRG8I, ///< RG, 16 bits per element packed as signed (@c RRRRRRRRGGGGGGGG) + CM_SURF_FMT_R16I, ///< R, 16 bits per element packed (@c RRRRRRRRRRRRRRRR) + CM_SURF_FMT_sR16I, ///< R, 16 bits per element packed as signed (@c RRRRRRRRRRRRRRRR) + CM_SURF_FMT_RG16I, ///< RG, 32 bits per element packed (@c RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG) + CM_SURF_FMT_sRG16I, ///< RG, 32 bits per element packed as signed (@c + ///< RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG) + CM_SURF_FMT_RGBA32UI, ///< RGBA, 128 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAARRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB) + CM_SURF_FMT_RGBX32UI, ///< RGBX, 128 bits per element packed as(@c + ///< XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB) + CM_SURF_FMT_ALPHA32UI, ///< Alpha, 32 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA) + CM_SURF_FMT_INTENSITY32UI, ///< Intensity, 32 bits per element packed as (@c + ///< IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII) + CM_SURF_FMT_LUMINANCE32UI, ///< Luminance, 32 bits per element packed as (@c + ///< LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL) + CM_SURF_FMT_LUMINANCE_ALPHA32UI, ///< Luminance Alpha, 64 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL) + CM_SURF_FMT_RGBA16UI, ///< RGBA, 64 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) + CM_SURF_FMT_RGBX16UI, ///< RGB, 64 bits per element packed as (@c + ///< XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) + CM_SURF_FMT_ALPHA16UI, ///< Alpha, 16 bits per element packed as (@c AAAAAAAAAAAAAAAA) + CM_SURF_FMT_INTENSITY16UI, ///< Intensity, 16 bits per element packed as (@c IIIIIIIIIIIIIIII) + CM_SURF_FMT_LUMINANCE16UI, ///< Luminance, 16 bits per element packed as (@c LLLLLLLLLLLLLLLL) + CM_SURF_FMT_LUMINANCE_ALPHA16UI, ///< Luminance Alpha, 32 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL) + CM_SURF_FMT_RGBA8UI, ///< RGBA, 32 bits per element packed as (@c + ///< AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR) + CM_SURF_FMT_RGBX8UI, ///< RGB, 32 bits per element packed as (@c + ///< XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR) + CM_SURF_FMT_ALPHA8UI, ///< Alpha, 8 bits per element packed as (@c AAAAAAAA) + CM_SURF_FMT_INTENSITY8UI, ///< Intensity, 8 bits per element packed as (@c IIIIIIII) + CM_SURF_FMT_LUMINANCE8UI, ///< Luminance, 8 bits per element packed as (@c LLLLLLLL) + CM_SURF_FMT_LUMINANCE_ALPHA8UI, ///< Luminance Alpha, 32 bits per element packed as (@c + ///< AAAAAAAALLLLLLLL) + CM_SURF_FMT_sRGBX32I, ///< RGBX, 128 bits per element packed as(@c + ///< XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB) + CM_SURF_FMT_sALPHA32I, ///< Alpha, 32 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA) + CM_SURF_FMT_sINTENSITY32I, ///< Intensity, 32 bits per element packed as (@c + ///< IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII) + CM_SURF_FMT_sLUMINANCE32I, ///< Luminance, 32 bits per element packed as (@c + ///< LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL) + CM_SURF_FMT_sLUMINANCE_ALPHA32I, ///< Luminance Alpha, 64 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL) + CM_SURF_FMT_sRGBA16I, ///< RGBA, 64 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) + CM_SURF_FMT_sRGBX16I, ///< RGB, 64 bits per element packed as (@c + ///< XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) + CM_SURF_FMT_sALPHA16I, ///< Alpha, 16 bits per element packed as (@c AAAAAAAAAAAAAAAA) + CM_SURF_FMT_sINTENSITY16I, ///< Intensity, 16 bits per element packed as (@c IIIIIIIIIIIIIIII) + CM_SURF_FMT_sLUMINANCE16I, ///< Luminance, 16 bits per element packed as (@c LLLLLLLLLLLLLLLL) + CM_SURF_FMT_sLUMINANCE_ALPHA16I, ///< Luminance Alpha, 32 bits per element packed as (@c + ///< AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL) + CM_SURF_FMT_sRGBA8I, ///< RGBA, 32 bits per element packed as (@c + ///< AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR) + CM_SURF_FMT_sRGBX8I, ///< RGB, 32 bits per element packed as (@c + ///< XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR) + CM_SURF_FMT_sALPHA8I, ///< Alpha, 8 bits per element packed as (@c AAAAAAAA) + CM_SURF_FMT_sINTENSITY8I, ///< Intensity, 8 bits per element packed as (@c IIIIIIII) + CM_SURF_FMT_sLUMINANCE8I, ///< Luminance, 8 bits per element packed as (@c LLLLLLLL) + CM_SURF_FMT_sLUMINANCE_ALPHA8I, ///< Alpha, 8 bits per element packed as (@c AAAAAAAA) + CM_SURF_FMT_sDXT6, ///< compressed, CM_SURF_FMT_sDXT6 + CM_SURF_FMT_DXT6, ///< compressed, CM_SURF_FMT_DXT6 + CM_SURF_FMT_DXT7, ///< compressed, DXT7 + CM_SURF_FMT_LUMINANCE8_SNORM, ///< Luminance, 8 bits per element packed as signed (@c LLLLLLLL) + CM_SURF_FMT_LUMINANCE16_SNORM, ///< Luminance, 16 bits per element packed as signed (@c + ///< LLLLLLLLLLLLLLLL) + CM_SURF_FMT_INTENSITY8_SNORM, ///< Intensity, 8 bits per element packed as signed (@c IIIIIIII) + CM_SURF_FMT_INTENSITY16_SNORM, ///< Intensity, 16 bits per element packed as signed (@c + ///< IIIIIIIIIIIIIIII) + CM_SURF_FMT_ALPHA8_SNORM, ///< Alpha, 8 bits per element packed as signed (@c AAAAAAAA) + CM_SURF_FMT_ALPHA16_SNORM, ///< Alpha, 16 bits per element packed as signed (@c + ///< AAAAAAAAAAAAAAAA) + CM_SURF_FMT_LUMINANCE_ALPHA8_SNORM, ///< Luminance Alpha, 16 bits per element packed as signed + ///< (@c AAAAAAAALLLLLLLL) + CM_SURF_FMT_LUMINANCE_ALPHA16_SNORM, ///< Luminance Alpha, 32 bits per element packed as signed + ///< (@c AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL) + CM_SURF_FMT_R8_SNORM, ///< R, 8 bits per element packed as signed (@c RRRRRRRR) + CM_SURF_FMT_R16_SNORM, ///< R, 16 bits per element packed as signed (@c RRRRRRRRRRRRRRRR) + CM_SURF_FMT_RG8_SNORM, ///< RG8, 16 bits per element packed as signed (@c RRRRRRRRGGGGGGGG) + CM_SURF_FMT_RG16_SNORM, ///< RG, 32 bits per element packed as signed (@c + ///< RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG) + CM_SURF_FMT_RGBX8_SNORM, ///< RGB, 32 bits per element packed as signed (@c + ///< XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR) + CM_SURF_FMT_RGBX16_SNORM, ///< RGB, 64 bits per element packed as signed (@c + ///< XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) + CM_SURF_FMT_RGBA8_SNORM, ///< RGBA, 32 bits per element packed as signed (@c + ///< AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR) + CM_SURF_FMT_RGBA16_SNORM, ///< RGBA, 64 bits per element packed as signed (@c + ///< AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR) + CM_SURF_FMT_RGB10_A2UI, ///< RGBA, 32 bits per element packed as (@c + ///< AABBBBBBBBBBGGGGGGGGGGRRRRRRRRRR) + CM_SURF_FMT_RGB32F, ///< RGB, float, 96 bits per element packed as (@c + ///< BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR) + CM_SURF_FMT_RGB32I, ///< RGB, unnormalized int, 96 bits per element packed as (@c + ///< BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR) + CM_SURF_FMT_RGB32UI, ///< RGB, unnormalized uint, 96 bits per element packed as (@c + ///< BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR) + CM_SURF_FMT_RGBX8_SRGB, ///< RGB, 32 bits per element packed as (@c + ///< XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR) + CM_SURF_FMT_RGBA8_SRGB, ///< RGBA, 32 bits per element packed as (@c + ///< AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR) + CM_SURF_FMT_DXT1_SRGB, ///< compressed, DXT1 + CM_SURF_FMT_DXT1A_SRGB, ///< + CM_SURF_FMT_DXT2_3_SRGB, ///< compressed, DXT2_3 + CM_SURF_FMT_DXT4_5_SRGB, ///< compressed, DXT4_5 + CM_SURF_FMT_DXT7_SRGB, ///< compressed, DXT7 + CM_SURF_FMT_RGB8_ETC2, ///< ETC2 compressed, RGB8 in 64 bits + CM_SURF_FMT_SRGB8_ETC2, ///< ETC2 compressed, SRGB8 in 64 bits + CM_SURF_FMT_RGB8_PT_ALPHA1_ETC2, ///< ETC2 compressed, RGB8 in 64 bits + CM_SURF_FMT_SRGB8_PT_ALPHA1_ETC2, ///< ETC2 compressed, sRGB8A1 in 64 bits + CM_SURF_FMT_RGBA8_ETC2_EAC, ///< ETC2 compressed, RGBA8 in 128 bits + CM_SURF_FMT_SRGB8_ALPHA8_ETC2_EAC, ///< ETC2 compressed, sRGBA8 in 128 bits + CM_SURF_FMT_R11_EAC, ///< EAC compressed, R11 in 64 bits + CM_SURF_FMT_SIGNED_R11_EAC, ///< EAC compressed, signed R11 in 64 bits + CM_SURF_FMT_RG11_EAC, ///< EAC compressed, RG11 in 128 bits + CM_SURF_FMT_SIGNED_RG11_EAC, ///< EAC compressed, signed RG11 in 128 bits - CM_SURF_FMT_RGBA8_ASTC_4x4, ///< ASTC compressed RGBA8 in 128 bits block - CM_SURF_FMT_RGBA8_ASTC_5x4, ///< ASTC compressed RGBA8 in 128 bits block - CM_SURF_FMT_RGBA8_ASTC_5x5, ///< ASTC compressed RGBA8 in 128 bits block - CM_SURF_FMT_RGBA8_ASTC_6x5, ///< ASTC compressed RGBA8 in 128 bits block - CM_SURF_FMT_RGBA8_ASTC_6x6, ///< ASTC compressed RGBA8 in 128 bits block - CM_SURF_FMT_RGBA8_ASTC_8x5, ///< ASTC compressed RGBA8 in 128 bits block - CM_SURF_FMT_RGBA8_ASTC_8x6, ///< ASTC compressed RGBA8 in 128 bits block - CM_SURF_FMT_RGBA8_ASTC_8x8, ///< ASTC compressed RGBA8 in 128 bits block - CM_SURF_FMT_RGBA8_ASTC_10x5, ///< ASTC compressed RGBA8 in 128 bits block - CM_SURF_FMT_RGBA8_ASTC_10x6, ///< ASTC compressed RGBA8 in 128 bits block - CM_SURF_FMT_RGBA8_ASTC_10x8, ///< ASTC compressed RGBA8 in 128 bits block - CM_SURF_FMT_RGBA8_ASTC_10x10, ///< ASTC compressed RGBA8 in 128 bits block - CM_SURF_FMT_RGBA8_ASTC_12x10, ///< ASTC compressed RGBA8 in 128 bits block - CM_SURF_FMT_RGBA8_ASTC_12x12, ///< ASTC compressed RGBA8 in 128 bits block + CM_SURF_FMT_RGBA8_ASTC_4x4, ///< ASTC compressed RGBA8 in 128 bits block + CM_SURF_FMT_RGBA8_ASTC_5x4, ///< ASTC compressed RGBA8 in 128 bits block + CM_SURF_FMT_RGBA8_ASTC_5x5, ///< ASTC compressed RGBA8 in 128 bits block + CM_SURF_FMT_RGBA8_ASTC_6x5, ///< ASTC compressed RGBA8 in 128 bits block + CM_SURF_FMT_RGBA8_ASTC_6x6, ///< ASTC compressed RGBA8 in 128 bits block + CM_SURF_FMT_RGBA8_ASTC_8x5, ///< ASTC compressed RGBA8 in 128 bits block + CM_SURF_FMT_RGBA8_ASTC_8x6, ///< ASTC compressed RGBA8 in 128 bits block + CM_SURF_FMT_RGBA8_ASTC_8x8, ///< ASTC compressed RGBA8 in 128 bits block + CM_SURF_FMT_RGBA8_ASTC_10x5, ///< ASTC compressed RGBA8 in 128 bits block + CM_SURF_FMT_RGBA8_ASTC_10x6, ///< ASTC compressed RGBA8 in 128 bits block + CM_SURF_FMT_RGBA8_ASTC_10x8, ///< ASTC compressed RGBA8 in 128 bits block + CM_SURF_FMT_RGBA8_ASTC_10x10, ///< ASTC compressed RGBA8 in 128 bits block + CM_SURF_FMT_RGBA8_ASTC_12x10, ///< ASTC compressed RGBA8 in 128 bits block + CM_SURF_FMT_RGBA8_ASTC_12x12, ///< ASTC compressed RGBA8 in 128 bits block - CM_SURF_FMT_SRGBA8_ASTC_4x4, ///< ASTC compressed SRGBA8 in 128 bits block - CM_SURF_FMT_SRGBA8_ASTC_5x4, ///< ASTC compressed SRGBA8 in 128 bits block - CM_SURF_FMT_SRGBA8_ASTC_5x5, ///< ASTC compressed SRGBA8 in 128 bits block - CM_SURF_FMT_SRGBA8_ASTC_6x5, ///< ASTC compressed SRGBA8 in 128 bits block - CM_SURF_FMT_SRGBA8_ASTC_6x6, ///< ASTC compressed SRGBA8 in 128 bits block - CM_SURF_FMT_SRGBA8_ASTC_8x5, ///< ASTC compressed SRGBA8 in 128 bits block - CM_SURF_FMT_SRGBA8_ASTC_8x6, ///< ASTC compressed SRGBA8 in 128 bits block - CM_SURF_FMT_SRGBA8_ASTC_8x8, ///< ASTC compressed SRGBA8 in 128 bits block - CM_SURF_FMT_SRGBA8_ASTC_10x5, ///< ASTC compressed SRGBA8 in 128 bits block - CM_SURF_FMT_SRGBA8_ASTC_10x6, ///< ASTC compressed SRGBA8 in 128 bits block - CM_SURF_FMT_SRGBA8_ASTC_10x8, ///< ASTC compressed SRGBA8 in 128 bits block - CM_SURF_FMT_SRGBA8_ASTC_10x10, ///< ASTC compressed SRGBA8 in 128 bits block - CM_SURF_FMT_SRGBA8_ASTC_12x10, ///< ASTC compressed SRGBA8 in 128 bits block - CM_SURF_FMT_SRGBA8_ASTC_12x12, ///< ASTC compressed SRGBA8 in 128 bits block + CM_SURF_FMT_SRGBA8_ASTC_4x4, ///< ASTC compressed SRGBA8 in 128 bits block + CM_SURF_FMT_SRGBA8_ASTC_5x4, ///< ASTC compressed SRGBA8 in 128 bits block + CM_SURF_FMT_SRGBA8_ASTC_5x5, ///< ASTC compressed SRGBA8 in 128 bits block + CM_SURF_FMT_SRGBA8_ASTC_6x5, ///< ASTC compressed SRGBA8 in 128 bits block + CM_SURF_FMT_SRGBA8_ASTC_6x6, ///< ASTC compressed SRGBA8 in 128 bits block + CM_SURF_FMT_SRGBA8_ASTC_8x5, ///< ASTC compressed SRGBA8 in 128 bits block + CM_SURF_FMT_SRGBA8_ASTC_8x6, ///< ASTC compressed SRGBA8 in 128 bits block + CM_SURF_FMT_SRGBA8_ASTC_8x8, ///< ASTC compressed SRGBA8 in 128 bits block + CM_SURF_FMT_SRGBA8_ASTC_10x5, ///< ASTC compressed SRGBA8 in 128 bits block + CM_SURF_FMT_SRGBA8_ASTC_10x6, ///< ASTC compressed SRGBA8 in 128 bits block + CM_SURF_FMT_SRGBA8_ASTC_10x8, ///< ASTC compressed SRGBA8 in 128 bits block + CM_SURF_FMT_SRGBA8_ASTC_10x10, ///< ASTC compressed SRGBA8 in 128 bits block + CM_SURF_FMT_SRGBA8_ASTC_12x10, ///< ASTC compressed SRGBA8 in 128 bits block + CM_SURF_FMT_SRGBA8_ASTC_12x12, ///< ASTC compressed SRGBA8 in 128 bits block - CM_SURF_FMT_BGR10_A2UI, ///< RGBA, 32 bits per element packed as (@c AARRRRRRRRRRGGGGGGGGGGBBBBBBBBBB) - CM_SURF_FMT_A2_BGR10UI, ///< RGBA, 32 bits per element packed as (@c RRRRRRRRRRGGGGGGGGGGBBBBBBBBBBAA) - CM_SURF_FMT_A2_RGB10UI, ///< RGBA, 32 bits per element packed as (@c BBBBBBBBBBGGGGGGGGGGRRRRRRRRRRAA) - CM_SURF_FMT_B5_G6_R5UI, ///< RGB, 16 bits per element packed as (@c BBBBBGGGGGGRRRRR) - CM_SURF_FMT_R5_G6_B5UI, ///< RGB, 16 bits per element packed as (@c RRRRRGGGGGGBBBBB) + CM_SURF_FMT_BGR10_A2UI, ///< RGBA, 32 bits per element packed as (@c + ///< AARRRRRRRRRRGGGGGGGGGGBBBBBBBBBB) + CM_SURF_FMT_A2_BGR10UI, ///< RGBA, 32 bits per element packed as (@c + ///< RRRRRRRRRRGGGGGGGGGGBBBBBBBBBBAA) + CM_SURF_FMT_A2_RGB10UI, ///< RGBA, 32 bits per element packed as (@c + ///< BBBBBBBBBBGGGGGGGGGGRRRRRRRRRRAA) + CM_SURF_FMT_B5_G6_R5UI, ///< RGB, 16 bits per element packed as (@c BBBBBGGGGGGRRRRR) + CM_SURF_FMT_R5_G6_B5UI, ///< RGB, 16 bits per element packed as (@c RRRRRGGGGGGBBBBB) - CM_SURF_FMT_DEPTH32F_X24_STEN8_UNCLAMPED, ///< depth + stencil, 64 bits per element packed as (@c XXXXXXXXXXXXXXXXXXXXXXXXSSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD) - CM_SURF_FMT_DEPTH32F_UNCLAMPED, ///< depth, 32 bits per element packed as (@c DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD) + CM_SURF_FMT_DEPTH32F_X24_STEN8_UNCLAMPED, ///< depth + stencil, 64 bits per element packed as (@c + ///< XXXXXXXXXXXXXXXXXXXXXXXXSSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD) + CM_SURF_FMT_DEPTH32F_UNCLAMPED, ///< depth, 32 bits per element packed as (@c + ///< DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD) - CM_SURF_FMT_L8_X16_A8_SRGB, ///< Sluminance Alpha, 32 bits per element packed as (@c AAAAAAAAXXXXXXXXXXXXXXXXLLLLLLLL) - CM_SURF_FMT_L8_X24_SRGB, ///< Sluminance, 32 bits per element packed as (@c XXXXXXXXXXXXXXXXXXXXXXXXLLLLLLLL) + CM_SURF_FMT_L8_X16_A8_SRGB, ///< Sluminance Alpha, 32 bits per element packed as (@c + ///< AAAAAAAAXXXXXXXXXXXXXXXXLLLLLLLL) + CM_SURF_FMT_L8_X24_SRGB, ///< Sluminance, 32 bits per element packed as (@c + ///< XXXXXXXXXXXXXXXXXXXXXXXXLLLLLLLL) - CM_SURF_FMT_STENCIL8, ///< stencil, 32 bits per element packed as (@c SSSSSSSSXXXXXXXXXXXXXXXXXXXXXXXX) + CM_SURF_FMT_STENCIL8, ///< stencil, 32 bits per element packed as (@c + ///< SSSSSSSSXXXXXXXXXXXXXXXXXXXXXXXX) - // non-native surface formats after this line, will be ignored by HWL - // all non-native surface formats should use the _NN suffix to distinguish - // them from potential corresponding native formats added in the future - CM_SURF_FMT_I420_NN, ///< 4:2:0 Planar Y-U-V format - CM_SURF_FMT_YV12_NN, ///< 4:2:0 Planar Y-V-U format - CM_SURF_FMT_NV12_NN, ///< 4:2:0 Semi-planar Y-UV format - CM_SURF_FMT_NV21_NN, ///< 4:2:0 Semi-planar Y-VU format - cmSurfFmt_FIRST = CM_SURF_FMT_LUMINANCE8, ///< First surface format - cmSurfFmt_LAST = CM_SURF_FMT_STENCIL8, ///< Last native surface format - cmSurfFmt_LAST_NON_NATIVE = CM_SURF_FMT_NV21_NN,///< Last non-native surface format + // non-native surface formats after this line, will be ignored by HWL + // all non-native surface formats should use the _NN suffix to distinguish + // them from potential corresponding native formats added in the future + CM_SURF_FMT_I420_NN, ///< 4:2:0 Planar Y-U-V format + CM_SURF_FMT_YV12_NN, ///< 4:2:0 Planar Y-V-U format + CM_SURF_FMT_NV12_NN, ///< 4:2:0 Semi-planar Y-UV format + CM_SURF_FMT_NV21_NN, ///< 4:2:0 Semi-planar Y-VU format + cmSurfFmt_FIRST = CM_SURF_FMT_LUMINANCE8, ///< First surface format + cmSurfFmt_LAST = CM_SURF_FMT_STENCIL8, ///< Last native surface format + cmSurfFmt_LAST_NON_NATIVE = CM_SURF_FMT_NV21_NN, ///< Last non-native surface format } cmSurfFmt; typedef struct cmFormatXlateRec { - cmSurfFmt raw_cmFormat; - cl_channel_type image_channel_data_type; - cl_channel_order image_channel_order; + cmSurfFmt raw_cmFormat; + cl_channel_type image_channel_data_type; + cl_channel_order image_channel_order; } cmFormatXlateParams; // relates full range of cm surface formats to those supported by CAL -static const cmFormatXlateParams cmFormatXlateTable[] = { - { CM_SURF_FMT_LUMINANCE8, CL_UNORM_INT8, CL_LUMINANCE }, - { CM_SURF_FMT_LUMINANCE16, CL_UNORM_INT16, CL_LUMINANCE }, - { CM_SURF_FMT_LUMINANCE16F, CL_HALF_FLOAT, CL_LUMINANCE }, - { CM_SURF_FMT_LUMINANCE32F, CL_FLOAT, CL_LUMINANCE }, - { CM_SURF_FMT_INTENSITY8, CL_UNORM_INT8, CL_INTENSITY }, - { CM_SURF_FMT_INTENSITY16, CL_UNORM_INT16, CL_INTENSITY }, - { CM_SURF_FMT_INTENSITY16F, CL_HALF_FLOAT, CL_INTENSITY }, - { CM_SURF_FMT_INTENSITY32F, CL_FLOAT, CL_INTENSITY }, - { CM_SURF_FMT_ALPHA8, CL_UNSIGNED_INT8, CL_A }, - { CM_SURF_FMT_ALPHA16, CL_UNORM_INT16, CL_A }, - { CM_SURF_FMT_ALPHA16F, CL_HALF_FLOAT, CL_A }, - { CM_SURF_FMT_ALPHA32F, CL_FLOAT, CL_A }, - { CM_SURF_FMT_LUMINANCE8_ALPHA8, CL_UNSIGNED_INT8, CL_RG }, - { CM_SURF_FMT_LUMINANCE16_ALPHA16, CL_UNSIGNED_INT16, CL_RG }, - { CM_SURF_FMT_LUMINANCE16F_ALPHA16F, CL_HALF_FLOAT, CL_RG }, - { CM_SURF_FMT_LUMINANCE32F_ALPHA32F, CL_FLOAT, CL_RG }, - { CM_SURF_FMT_B2_G3_R3, 500, CL_R }, - { CM_SURF_FMT_B5_G6_R5, CL_UNSIGNED_INT16, CL_RGB }, - { CM_SURF_FMT_BGRX4, 500, CL_BGRA }, - { CM_SURF_FMT_BGR5_X1, CL_UNSIGNED_INT16, CL_RGB }, - { CM_SURF_FMT_BGRX8, CL_UNORM_INT8, CL_BGRA }, - { CM_SURF_FMT_BGR10_X2, CL_UNORM_INT_101010, CL_RGB }, - { CM_SURF_FMT_BGRX16, CL_UNORM_INT16, CL_BGRA }, - { CM_SURF_FMT_BGRX16F, CL_HALF_FLOAT, CL_BGRA }, - { CM_SURF_FMT_BGRX32F, CL_FLOAT, CL_BGRA }, - { CM_SURF_FMT_RGBX4, 500, CL_RGB }, - { CM_SURF_FMT_RGB5_X1, CL_UNORM_INT16, CL_BGRA }, - { CM_SURF_FMT_RGBX8, CL_UNORM_INT8, CL_RGBA }, - { CM_SURF_FMT_RGB10_X2, CL_UNORM_INT_101010, CL_RGBA }, - { CM_SURF_FMT_RGBX16, CL_UNORM_INT16, CL_RGBA }, - { CM_SURF_FMT_RGBX16F, CL_HALF_FLOAT, CL_RGBA }, - { CM_SURF_FMT_RGBX32F, CL_FLOAT, CL_RGBA }, - { CM_SURF_FMT_BGRA4, 500, CL_BGRA }, - { CM_SURF_FMT_BGR5_A1, CL_UNSIGNED_INT16, CL_BGRA }, - { CM_SURF_FMT_BGRA8, CL_UNORM_INT8, CL_BGRA }, - { CM_SURF_FMT_BGR10_A2, 500, CL_BGRA }, - { CM_SURF_FMT_BGRA16, CL_UNORM_INT16, CL_BGRA }, - { CM_SURF_FMT_BGRA16F, CL_UNORM_INT16, CL_BGRA }, - { CM_SURF_FMT_BGRA32F, CL_FLOAT, CL_BGRA }, - { CM_SURF_FMT_RGBA4, 500, CL_RGBA }, - { CM_SURF_FMT_RGB5_A1, CL_UNSIGNED_INT16, CL_RGBA }, - { CM_SURF_FMT_RGBA8, CL_UNORM_INT8, CL_RGBA }, - { CM_SURF_FMT_RGB10_A2, CL_UNORM_INT_101010, CL_RGB }, - { CM_SURF_FMT_RGBA16, CL_UNORM_INT16, CL_RGBA }, - { CM_SURF_FMT_RGBA16F, CL_HALF_FLOAT, CL_RGBA }, - { CM_SURF_FMT_RGBA32I, CL_UNSIGNED_INT32, CL_RGBA }, - { CM_SURF_FMT_RGBA32F, CL_FLOAT, CL_RGBA }, - { CM_SURF_FMT_DUDV8, CL_UNSIGNED_INT8, CL_RG }, - { CM_SURF_FMT_DXT1, 500, CL_R }, - { CM_SURF_FMT_DXT2_3, 500, CL_R }, - { CM_SURF_FMT_DXT4_5, 500, CL_R }, - { CM_SURF_FMT_ATI1N, 500, CL_R }, - { CM_SURF_FMT_ATI2N, 500, CL_R }, - { CM_SURF_FMT_DEPTH16, CL_UNORM_INT16, CL_DEPTH }, - { CM_SURF_FMT_DEPTH16F, CL_HALF_FLOAT, CL_DEPTH }, - { CM_SURF_FMT_DEPTH24_X8, 500, CL_DEPTH }, - { CM_SURF_FMT_DEPTH24F_X8, 500, CL_DEPTH }, - { CM_SURF_FMT_DEPTH24_STEN8, CL_UNORM_INT24, CL_DEPTH_STENCIL }, - { CM_SURF_FMT_DEPTH24F_STEN8, 500, CL_DEPTH_STENCIL }, - { CM_SURF_FMT_DEPTH32F_X24_STEN8, CL_FLOAT, CL_DEPTH_STENCIL }, - { CM_SURF_FMT_DEPTH32F, CL_FLOAT, CL_DEPTH }, - { CM_SURF_FMT_sR11_sG11_sB10, 500, CL_R }, - { CM_SURF_FMT_sU16, CL_SNORM_INT16, CL_R }, - { CM_SURF_FMT_sUV16, CL_SNORM_INT16, CL_RG }, - { CM_SURF_FMT_sUVWQ16, CL_SNORM_INT16, CL_RGBA }, - { CM_SURF_FMT_RG16, CL_UNORM_INT16, CL_RG }, - { CM_SURF_FMT_RG16F, CL_HALF_FLOAT, CL_RG }, - { CM_SURF_FMT_RG32F, CL_FLOAT, CL_RG }, - { CM_SURF_FMT_ABGR4, 500, CL_ARGB }, - { CM_SURF_FMT_A1_BGR5, CL_UNSIGNED_INT16, CL_ARGB }, - { CM_SURF_FMT_ABGR8, CL_UNORM_INT8, CL_ARGB }, - { CM_SURF_FMT_A2_BGR10, CL_UNORM_INT_101010, CL_RGB }, - { CM_SURF_FMT_ABGR16, CL_UNORM_INT16, CL_ARGB }, - { CM_SURF_FMT_ABGR16F, CL_HALF_FLOAT, CL_ARGB }, - { CM_SURF_FMT_ABGR32F, CL_FLOAT, CL_ARGB }, - { CM_SURF_FMT_DXT1A, 500, CL_R }, - { CM_SURF_FMT_sRGB10_A2, 500, CL_RGBA }, - { CM_SURF_FMT_sR8, CL_SNORM_INT8, CL_R }, - { CM_SURF_FMT_sRG8, CL_SNORM_INT8, CL_RG }, - { CM_SURF_FMT_sR32I, CL_SIGNED_INT32, CL_R }, - { CM_SURF_FMT_sRG32I, CL_SIGNED_INT32, CL_RG }, - { CM_SURF_FMT_sRGBA32I, CL_SIGNED_INT32, CL_RGBA }, - { CM_SURF_FMT_R32I, CL_UNSIGNED_INT32, CL_R }, - { CM_SURF_FMT_RG32I, CL_UNSIGNED_INT32, CL_RG }, - { CM_SURF_FMT_RG8, CL_UNORM_INT8, CL_RG }, - { CM_SURF_FMT_sRGBA8, CL_SNORM_INT8, CL_RGBA }, - { CM_SURF_FMT_R11F_G11F_B10F, 500, CL_RGBA }, - { CM_SURF_FMT_RGB9_E5, CL_UNORM_INT8, CL_ARGB }, - { CM_SURF_FMT_LUMINANCE_LATC1, 500, CL_RGBA }, - { CM_SURF_FMT_SIGNED_LUMINANCE_LATC1,500, CL_RGBA }, - { CM_SURF_FMT_LUMINANCE_ALPHA_LATC2, 500, CL_RGBA }, - { CM_SURF_FMT_SIGNED_LUMINANCE_ALPHA_LATC2, 500, CL_RGBA }, - { CM_SURF_FMT_RED_RGTC1, 500, CL_RGBA }, - { CM_SURF_FMT_SIGNED_RED_RGTC1, 500, CL_RGBA }, - { CM_SURF_FMT_RED_GREEN_RGTC2, 500, CL_RGBA }, - { CM_SURF_FMT_SIGNED_RED_GREEN_RGTC2,500, CL_RGBA }, - { CM_SURF_FMT_R8, CL_UNORM_INT8, CL_R }, - { CM_SURF_FMT_R16, CL_UNORM_INT16, CL_R }, - { CM_SURF_FMT_R16F, CL_HALF_FLOAT, CL_R }, - { CM_SURF_FMT_R32F, CL_FLOAT, CL_R }, - { CM_SURF_FMT_R8I, CL_UNSIGNED_INT8, CL_R }, - { CM_SURF_FMT_sR8I, CL_SIGNED_INT8, CL_R }, - { CM_SURF_FMT_RG8I, CL_UNSIGNED_INT8, CL_RG }, - { CM_SURF_FMT_sRG8I, CL_SIGNED_INT8, CL_RG }, - { CM_SURF_FMT_R16I, CL_UNSIGNED_INT16, CL_R }, - { CM_SURF_FMT_sR16I, CL_SIGNED_INT16, CL_R }, - { CM_SURF_FMT_RG16I, CL_UNSIGNED_INT16, CL_RG }, - { CM_SURF_FMT_sRG16I, CL_SIGNED_INT16, CL_RG }, - { CM_SURF_FMT_RGBA32UI, CL_UNSIGNED_INT32, CL_RGBA }, - { CM_SURF_FMT_RGBX32UI, CL_UNSIGNED_INT32, CL_RGBA }, - { CM_SURF_FMT_ALPHA32UI, CL_UNSIGNED_INT32, CL_R }, - { CM_SURF_FMT_INTENSITY32UI, CL_UNSIGNED_INT32, CL_R }, - { CM_SURF_FMT_LUMINANCE32UI, CL_UNSIGNED_INT32, CL_R }, - { CM_SURF_FMT_LUMINANCE_ALPHA32UI, CL_UNSIGNED_INT32, CL_RG }, - { CM_SURF_FMT_RGBA16UI, CL_UNSIGNED_INT16, CL_RGBA }, - { CM_SURF_FMT_RGBX16UI, CL_UNSIGNED_INT16, CL_RGBA }, - { CM_SURF_FMT_ALPHA16UI, CL_UNSIGNED_INT16, CL_R }, - { CM_SURF_FMT_INTENSITY16UI, CL_UNSIGNED_INT16, CL_R }, - { CM_SURF_FMT_LUMINANCE16UI, CL_UNSIGNED_INT16, CL_R }, - { CM_SURF_FMT_LUMINANCE_ALPHA16UI, CL_UNSIGNED_INT32, CL_RG }, - { CM_SURF_FMT_RGBA8UI, CL_UNSIGNED_INT8, CL_RGBA }, - { CM_SURF_FMT_RGBX8UI, CL_UNORM_INT8, CL_RGBA }, - { CM_SURF_FMT_ALPHA8UI, CL_UNSIGNED_INT8, CL_R }, - { CM_SURF_FMT_INTENSITY8UI, CL_UNSIGNED_INT8, CL_R }, - { CM_SURF_FMT_LUMINANCE8UI, CL_UNSIGNED_INT8, CL_R }, - { CM_SURF_FMT_LUMINANCE_ALPHA8UI, CL_UNSIGNED_INT8, CL_RG }, - { CM_SURF_FMT_sRGBX32I, CL_SIGNED_INT32, CL_RGBA }, - { CM_SURF_FMT_sALPHA32I, CL_SIGNED_INT32, CL_R }, - { CM_SURF_FMT_sINTENSITY32I, CL_SIGNED_INT32, CL_R }, - { CM_SURF_FMT_sLUMINANCE32I, CL_SIGNED_INT32, CL_R }, - { CM_SURF_FMT_sLUMINANCE_ALPHA32I, CL_SIGNED_INT32, CL_RG }, - { CM_SURF_FMT_sRGBA16I, CL_SIGNED_INT16, CL_RGBA }, - { CM_SURF_FMT_sRGBX16I, CL_SIGNED_INT16, CL_RGBA }, - { CM_SURF_FMT_sALPHA16I, CL_SIGNED_INT16, CL_R }, - { CM_SURF_FMT_sINTENSITY16I, CL_SIGNED_INT16, CL_R }, - { CM_SURF_FMT_sLUMINANCE16I, CL_SIGNED_INT16, CL_R }, - { CM_SURF_FMT_sLUMINANCE_ALPHA16I, CL_SIGNED_INT16, CL_RG }, - { CM_SURF_FMT_sRGBA8I, CL_SIGNED_INT8, CL_RGBA }, - { CM_SURF_FMT_sRGBX8I, CL_SIGNED_INT8, CL_RGBA }, - { CM_SURF_FMT_sALPHA8I, CL_SIGNED_INT8, CL_R }, - { CM_SURF_FMT_sINTENSITY8I, CL_SIGNED_INT8, CL_R }, - { CM_SURF_FMT_sLUMINANCE8I, CL_SIGNED_INT8, CL_R }, - { CM_SURF_FMT_sLUMINANCE_ALPHA8I, CM_SURF_FMT_sRG8I, CL_RG }, - { CM_SURF_FMT_sDXT6, 500, CL_R }, - { CM_SURF_FMT_DXT6, 500, CL_R }, - { CM_SURF_FMT_DXT7, 500, CL_R }, - { CM_SURF_FMT_LUMINANCE8_SNORM, CL_SNORM_INT8, CL_R }, - { CM_SURF_FMT_LUMINANCE16_SNORM, CL_SNORM_INT16, CL_R }, - { CM_SURF_FMT_INTENSITY8_SNORM, CL_SNORM_INT8, CL_R }, - { CM_SURF_FMT_INTENSITY16_SNORM, CL_SNORM_INT16, CL_R }, - { CM_SURF_FMT_ALPHA8_SNORM, CL_SNORM_INT8, CL_R }, - { CM_SURF_FMT_ALPHA16_SNORM, CL_SNORM_INT16, CL_R }, - { CM_SURF_FMT_LUMINANCE_ALPHA8_SNORM,CL_SNORM_INT8, CL_RG }, - { CM_SURF_FMT_LUMINANCE_ALPHA16_SNORM,CL_SNORM_INT16, CL_RG }, - { CM_SURF_FMT_R8_SNORM, CL_SNORM_INT8, CL_R }, - { CM_SURF_FMT_R16_SNORM, CL_SNORM_INT16, CL_R }, - { CM_SURF_FMT_RG8_SNORM, CL_SNORM_INT8, CL_RG }, - { CM_SURF_FMT_RG16_SNORM, CL_SNORM_INT16, CL_RG }, - { CM_SURF_FMT_RGBX8_SNORM, CL_SNORM_INT8, CL_RGBA }, - { CM_SURF_FMT_RGBX16_SNORM, CL_SNORM_INT16, CL_RGBA }, - { CM_SURF_FMT_RGBA8_SNORM, CL_SNORM_INT8, CL_RGBA }, - { CM_SURF_FMT_RGBA16_SNORM, CL_SNORM_INT16, CL_RGBA }, - { CM_SURF_FMT_RGB10_A2UI, 500, CL_RGBA }, - { CM_SURF_FMT_RGB32F, 500, CL_RGBA }, - { CM_SURF_FMT_RGB32I, 500, CL_RGBA }, - { CM_SURF_FMT_RGB32UI, 500, CL_RGBA }, - { CM_SURF_FMT_RGBX8_SRGB, 500, CL_RGBA }, - { CM_SURF_FMT_RGBA8_SRGB, 500, CL_RGBA }, - { CM_SURF_FMT_DXT1_SRGB, 500, CL_RGBA }, - { CM_SURF_FMT_DXT1A_SRGB, 500, CL_RGBA }, - { CM_SURF_FMT_DXT2_3_SRGB, 500, CL_RGBA }, - { CM_SURF_FMT_DXT4_5_SRGB, 500, CL_RGBA }, - { CM_SURF_FMT_DXT7_SRGB, 500, CL_RGBA }, - { CM_SURF_FMT_RGB8_ETC2, 500, CL_RGB }, - { CM_SURF_FMT_SRGB8_ETC2, 500, CL_RGB }, - { CM_SURF_FMT_RGB8_PT_ALPHA1_ETC2, 500, CL_RGBA }, - { CM_SURF_FMT_SRGB8_PT_ALPHA1_ETC2, 500, CL_RGBA }, - { CM_SURF_FMT_RGBA8_ETC2_EAC, 500, CL_RGBA }, - { CM_SURF_FMT_SRGB8_ALPHA8_ETC2_EAC, 500, CL_RGBA }, - { CM_SURF_FMT_R11_EAC, 500, CL_R }, - { CM_SURF_FMT_SIGNED_R11_EAC, 500, CL_R }, - { CM_SURF_FMT_RG11_EAC, 500, CL_RG }, - { CM_SURF_FMT_SIGNED_RG11_EAC, 500, CL_RG }, - { CM_SURF_FMT_RGBA8_ASTC_4x4, 500, CL_RGBA }, - { CM_SURF_FMT_RGBA8_ASTC_5x4, 500, CL_RGBA }, - { CM_SURF_FMT_RGBA8_ASTC_5x5, 500, CL_RGBA }, - { CM_SURF_FMT_RGBA8_ASTC_6x5, 500, CL_RGBA }, - { CM_SURF_FMT_RGBA8_ASTC_6x6, 500, CL_RGBA }, - { CM_SURF_FMT_RGBA8_ASTC_8x5, 500, CL_RGBA }, - { CM_SURF_FMT_RGBA8_ASTC_8x6, 500, CL_RGBA }, - { CM_SURF_FMT_RGBA8_ASTC_8x8, 500, CL_RGBA }, - { CM_SURF_FMT_RGBA8_ASTC_10x5, 500, CL_RGBA }, - { CM_SURF_FMT_RGBA8_ASTC_10x6, 500, CL_RGBA }, - { CM_SURF_FMT_RGBA8_ASTC_10x8, 500, CL_RGBA }, - { CM_SURF_FMT_RGBA8_ASTC_10x10, 500, CL_RGBA }, - { CM_SURF_FMT_RGBA8_ASTC_12x10, 500, CL_RGBA }, - { CM_SURF_FMT_RGBA8_ASTC_12x12, 500, CL_RGBA }, - { CM_SURF_FMT_SRGBA8_ASTC_4x4, 500, CL_RGBA }, - { CM_SURF_FMT_SRGBA8_ASTC_5x4, 500, CL_RGBA }, - { CM_SURF_FMT_SRGBA8_ASTC_5x5, 500, CL_RGBA }, - { CM_SURF_FMT_SRGBA8_ASTC_6x5, 500, CL_RGBA }, - { CM_SURF_FMT_SRGBA8_ASTC_6x6, 500, CL_RGBA }, - { CM_SURF_FMT_SRGBA8_ASTC_8x5, 500, CL_RGBA }, - { CM_SURF_FMT_SRGBA8_ASTC_8x6, 500, CL_RGBA }, - { CM_SURF_FMT_SRGBA8_ASTC_8x8, 500, CL_RGBA }, - { CM_SURF_FMT_SRGBA8_ASTC_10x5, 500, CL_RGBA }, - { CM_SURF_FMT_SRGBA8_ASTC_10x6, 500, CL_RGBA }, - { CM_SURF_FMT_SRGBA8_ASTC_10x8, 500, CL_RGBA }, - { CM_SURF_FMT_SRGBA8_ASTC_10x10, 500, CL_RGBA }, - { CM_SURF_FMT_SRGBA8_ASTC_12x10, 500, CL_RGBA }, - { CM_SURF_FMT_SRGBA8_ASTC_12x12, 500, CL_RGBA }, - { CM_SURF_FMT_BGR10_A2UI, 500, CL_BGRA }, - { CM_SURF_FMT_A2_BGR10UI, 500, CL_ARGB }, - { CM_SURF_FMT_A2_RGB10UI, 500, CL_ABGR }, - { CM_SURF_FMT_B5_G6_R5UI, 500, CL_BGRA }, - { CM_SURF_FMT_R5_G6_B5UI, 500, CL_RGBA }, - { CM_SURF_FMT_DEPTH32F_X24_STEN8_UNCLAMPED,CL_UNSIGNED_INT32, CL_R }, - { CM_SURF_FMT_DEPTH32F_UNCLAMPED, CL_FLOAT, CL_R }, - { CM_SURF_FMT_L8_X16_A8_SRGB, 500, CL_RGBA }, - { CM_SURF_FMT_L8_X24_SRGB, 500, CL_RGBA }, - { CM_SURF_FMT_STENCIL8, CL_UNSIGNED_INT8, CL_R }, +static const cmFormatXlateParams cmFormatXlateTable[] = { + {CM_SURF_FMT_LUMINANCE8, CL_UNORM_INT8, CL_LUMINANCE}, + {CM_SURF_FMT_LUMINANCE16, CL_UNORM_INT16, CL_LUMINANCE}, + {CM_SURF_FMT_LUMINANCE16F, CL_HALF_FLOAT, CL_LUMINANCE}, + {CM_SURF_FMT_LUMINANCE32F, CL_FLOAT, CL_LUMINANCE}, + {CM_SURF_FMT_INTENSITY8, CL_UNORM_INT8, CL_INTENSITY}, + {CM_SURF_FMT_INTENSITY16, CL_UNORM_INT16, CL_INTENSITY}, + {CM_SURF_FMT_INTENSITY16F, CL_HALF_FLOAT, CL_INTENSITY}, + {CM_SURF_FMT_INTENSITY32F, CL_FLOAT, CL_INTENSITY}, + {CM_SURF_FMT_ALPHA8, CL_UNSIGNED_INT8, CL_A}, + {CM_SURF_FMT_ALPHA16, CL_UNORM_INT16, CL_A}, + {CM_SURF_FMT_ALPHA16F, CL_HALF_FLOAT, CL_A}, + {CM_SURF_FMT_ALPHA32F, CL_FLOAT, CL_A}, + {CM_SURF_FMT_LUMINANCE8_ALPHA8, CL_UNSIGNED_INT8, CL_RG}, + {CM_SURF_FMT_LUMINANCE16_ALPHA16, CL_UNSIGNED_INT16, CL_RG}, + {CM_SURF_FMT_LUMINANCE16F_ALPHA16F, CL_HALF_FLOAT, CL_RG}, + {CM_SURF_FMT_LUMINANCE32F_ALPHA32F, CL_FLOAT, CL_RG}, + {CM_SURF_FMT_B2_G3_R3, 500, CL_R}, + {CM_SURF_FMT_B5_G6_R5, CL_UNSIGNED_INT16, CL_RGB}, + {CM_SURF_FMT_BGRX4, 500, CL_BGRA}, + {CM_SURF_FMT_BGR5_X1, CL_UNSIGNED_INT16, CL_RGB}, + {CM_SURF_FMT_BGRX8, CL_UNORM_INT8, CL_BGRA}, + {CM_SURF_FMT_BGR10_X2, CL_UNORM_INT_101010, CL_RGB}, + {CM_SURF_FMT_BGRX16, CL_UNORM_INT16, CL_BGRA}, + {CM_SURF_FMT_BGRX16F, CL_HALF_FLOAT, CL_BGRA}, + {CM_SURF_FMT_BGRX32F, CL_FLOAT, CL_BGRA}, + {CM_SURF_FMT_RGBX4, 500, CL_RGB}, + {CM_SURF_FMT_RGB5_X1, CL_UNORM_INT16, CL_BGRA}, + {CM_SURF_FMT_RGBX8, CL_UNORM_INT8, CL_RGBA}, + {CM_SURF_FMT_RGB10_X2, CL_UNORM_INT_101010, CL_RGBA}, + {CM_SURF_FMT_RGBX16, CL_UNORM_INT16, CL_RGBA}, + {CM_SURF_FMT_RGBX16F, CL_HALF_FLOAT, CL_RGBA}, + {CM_SURF_FMT_RGBX32F, CL_FLOAT, CL_RGBA}, + {CM_SURF_FMT_BGRA4, 500, CL_BGRA}, + {CM_SURF_FMT_BGR5_A1, CL_UNSIGNED_INT16, CL_BGRA}, + {CM_SURF_FMT_BGRA8, CL_UNORM_INT8, CL_BGRA}, + {CM_SURF_FMT_BGR10_A2, 500, CL_BGRA}, + {CM_SURF_FMT_BGRA16, CL_UNORM_INT16, CL_BGRA}, + {CM_SURF_FMT_BGRA16F, CL_UNORM_INT16, CL_BGRA}, + {CM_SURF_FMT_BGRA32F, CL_FLOAT, CL_BGRA}, + {CM_SURF_FMT_RGBA4, 500, CL_RGBA}, + {CM_SURF_FMT_RGB5_A1, CL_UNSIGNED_INT16, CL_RGBA}, + {CM_SURF_FMT_RGBA8, CL_UNORM_INT8, CL_RGBA}, + {CM_SURF_FMT_RGB10_A2, CL_UNORM_INT_101010, CL_RGB}, + {CM_SURF_FMT_RGBA16, CL_UNORM_INT16, CL_RGBA}, + {CM_SURF_FMT_RGBA16F, CL_HALF_FLOAT, CL_RGBA}, + {CM_SURF_FMT_RGBA32I, CL_UNSIGNED_INT32, CL_RGBA}, + {CM_SURF_FMT_RGBA32F, CL_FLOAT, CL_RGBA}, + {CM_SURF_FMT_DUDV8, CL_UNSIGNED_INT8, CL_RG}, + {CM_SURF_FMT_DXT1, 500, CL_R}, + {CM_SURF_FMT_DXT2_3, 500, CL_R}, + {CM_SURF_FMT_DXT4_5, 500, CL_R}, + {CM_SURF_FMT_ATI1N, 500, CL_R}, + {CM_SURF_FMT_ATI2N, 500, CL_R}, + {CM_SURF_FMT_DEPTH16, CL_UNORM_INT16, CL_DEPTH}, + {CM_SURF_FMT_DEPTH16F, CL_HALF_FLOAT, CL_DEPTH}, + {CM_SURF_FMT_DEPTH24_X8, 500, CL_DEPTH}, + {CM_SURF_FMT_DEPTH24F_X8, 500, CL_DEPTH}, + {CM_SURF_FMT_DEPTH24_STEN8, CL_UNORM_INT24, CL_DEPTH_STENCIL}, + {CM_SURF_FMT_DEPTH24F_STEN8, 500, CL_DEPTH_STENCIL}, + {CM_SURF_FMT_DEPTH32F_X24_STEN8, CL_FLOAT, CL_DEPTH_STENCIL}, + {CM_SURF_FMT_DEPTH32F, CL_FLOAT, CL_DEPTH}, + {CM_SURF_FMT_sR11_sG11_sB10, 500, CL_R}, + {CM_SURF_FMT_sU16, CL_SNORM_INT16, CL_R}, + {CM_SURF_FMT_sUV16, CL_SNORM_INT16, CL_RG}, + {CM_SURF_FMT_sUVWQ16, CL_SNORM_INT16, CL_RGBA}, + {CM_SURF_FMT_RG16, CL_UNORM_INT16, CL_RG}, + {CM_SURF_FMT_RG16F, CL_HALF_FLOAT, CL_RG}, + {CM_SURF_FMT_RG32F, CL_FLOAT, CL_RG}, + {CM_SURF_FMT_ABGR4, 500, CL_ARGB}, + {CM_SURF_FMT_A1_BGR5, CL_UNSIGNED_INT16, CL_ARGB}, + {CM_SURF_FMT_ABGR8, CL_UNORM_INT8, CL_ARGB}, + {CM_SURF_FMT_A2_BGR10, CL_UNORM_INT_101010, CL_RGB}, + {CM_SURF_FMT_ABGR16, CL_UNORM_INT16, CL_ARGB}, + {CM_SURF_FMT_ABGR16F, CL_HALF_FLOAT, CL_ARGB}, + {CM_SURF_FMT_ABGR32F, CL_FLOAT, CL_ARGB}, + {CM_SURF_FMT_DXT1A, 500, CL_R}, + {CM_SURF_FMT_sRGB10_A2, 500, CL_RGBA}, + {CM_SURF_FMT_sR8, CL_SNORM_INT8, CL_R}, + {CM_SURF_FMT_sRG8, CL_SNORM_INT8, CL_RG}, + {CM_SURF_FMT_sR32I, CL_SIGNED_INT32, CL_R}, + {CM_SURF_FMT_sRG32I, CL_SIGNED_INT32, CL_RG}, + {CM_SURF_FMT_sRGBA32I, CL_SIGNED_INT32, CL_RGBA}, + {CM_SURF_FMT_R32I, CL_UNSIGNED_INT32, CL_R}, + {CM_SURF_FMT_RG32I, CL_UNSIGNED_INT32, CL_RG}, + {CM_SURF_FMT_RG8, CL_UNORM_INT8, CL_RG}, + {CM_SURF_FMT_sRGBA8, CL_SNORM_INT8, CL_RGBA}, + {CM_SURF_FMT_R11F_G11F_B10F, 500, CL_RGBA}, + {CM_SURF_FMT_RGB9_E5, CL_UNORM_INT8, CL_ARGB}, + {CM_SURF_FMT_LUMINANCE_LATC1, 500, CL_RGBA}, + {CM_SURF_FMT_SIGNED_LUMINANCE_LATC1, 500, CL_RGBA}, + {CM_SURF_FMT_LUMINANCE_ALPHA_LATC2, 500, CL_RGBA}, + {CM_SURF_FMT_SIGNED_LUMINANCE_ALPHA_LATC2, 500, CL_RGBA}, + {CM_SURF_FMT_RED_RGTC1, 500, CL_RGBA}, + {CM_SURF_FMT_SIGNED_RED_RGTC1, 500, CL_RGBA}, + {CM_SURF_FMT_RED_GREEN_RGTC2, 500, CL_RGBA}, + {CM_SURF_FMT_SIGNED_RED_GREEN_RGTC2, 500, CL_RGBA}, + {CM_SURF_FMT_R8, CL_UNORM_INT8, CL_R}, + {CM_SURF_FMT_R16, CL_UNORM_INT16, CL_R}, + {CM_SURF_FMT_R16F, CL_HALF_FLOAT, CL_R}, + {CM_SURF_FMT_R32F, CL_FLOAT, CL_R}, + {CM_SURF_FMT_R8I, CL_UNSIGNED_INT8, CL_R}, + {CM_SURF_FMT_sR8I, CL_SIGNED_INT8, CL_R}, + {CM_SURF_FMT_RG8I, CL_UNSIGNED_INT8, CL_RG}, + {CM_SURF_FMT_sRG8I, CL_SIGNED_INT8, CL_RG}, + {CM_SURF_FMT_R16I, CL_UNSIGNED_INT16, CL_R}, + {CM_SURF_FMT_sR16I, CL_SIGNED_INT16, CL_R}, + {CM_SURF_FMT_RG16I, CL_UNSIGNED_INT16, CL_RG}, + {CM_SURF_FMT_sRG16I, CL_SIGNED_INT16, CL_RG}, + {CM_SURF_FMT_RGBA32UI, CL_UNSIGNED_INT32, CL_RGBA}, + {CM_SURF_FMT_RGBX32UI, CL_UNSIGNED_INT32, CL_RGBA}, + {CM_SURF_FMT_ALPHA32UI, CL_UNSIGNED_INT32, CL_R}, + {CM_SURF_FMT_INTENSITY32UI, CL_UNSIGNED_INT32, CL_R}, + {CM_SURF_FMT_LUMINANCE32UI, CL_UNSIGNED_INT32, CL_R}, + {CM_SURF_FMT_LUMINANCE_ALPHA32UI, CL_UNSIGNED_INT32, CL_RG}, + {CM_SURF_FMT_RGBA16UI, CL_UNSIGNED_INT16, CL_RGBA}, + {CM_SURF_FMT_RGBX16UI, CL_UNSIGNED_INT16, CL_RGBA}, + {CM_SURF_FMT_ALPHA16UI, CL_UNSIGNED_INT16, CL_R}, + {CM_SURF_FMT_INTENSITY16UI, CL_UNSIGNED_INT16, CL_R}, + {CM_SURF_FMT_LUMINANCE16UI, CL_UNSIGNED_INT16, CL_R}, + {CM_SURF_FMT_LUMINANCE_ALPHA16UI, CL_UNSIGNED_INT32, CL_RG}, + {CM_SURF_FMT_RGBA8UI, CL_UNSIGNED_INT8, CL_RGBA}, + {CM_SURF_FMT_RGBX8UI, CL_UNORM_INT8, CL_RGBA}, + {CM_SURF_FMT_ALPHA8UI, CL_UNSIGNED_INT8, CL_R}, + {CM_SURF_FMT_INTENSITY8UI, CL_UNSIGNED_INT8, CL_R}, + {CM_SURF_FMT_LUMINANCE8UI, CL_UNSIGNED_INT8, CL_R}, + {CM_SURF_FMT_LUMINANCE_ALPHA8UI, CL_UNSIGNED_INT8, CL_RG}, + {CM_SURF_FMT_sRGBX32I, CL_SIGNED_INT32, CL_RGBA}, + {CM_SURF_FMT_sALPHA32I, CL_SIGNED_INT32, CL_R}, + {CM_SURF_FMT_sINTENSITY32I, CL_SIGNED_INT32, CL_R}, + {CM_SURF_FMT_sLUMINANCE32I, CL_SIGNED_INT32, CL_R}, + {CM_SURF_FMT_sLUMINANCE_ALPHA32I, CL_SIGNED_INT32, CL_RG}, + {CM_SURF_FMT_sRGBA16I, CL_SIGNED_INT16, CL_RGBA}, + {CM_SURF_FMT_sRGBX16I, CL_SIGNED_INT16, CL_RGBA}, + {CM_SURF_FMT_sALPHA16I, CL_SIGNED_INT16, CL_R}, + {CM_SURF_FMT_sINTENSITY16I, CL_SIGNED_INT16, CL_R}, + {CM_SURF_FMT_sLUMINANCE16I, CL_SIGNED_INT16, CL_R}, + {CM_SURF_FMT_sLUMINANCE_ALPHA16I, CL_SIGNED_INT16, CL_RG}, + {CM_SURF_FMT_sRGBA8I, CL_SIGNED_INT8, CL_RGBA}, + {CM_SURF_FMT_sRGBX8I, CL_SIGNED_INT8, CL_RGBA}, + {CM_SURF_FMT_sALPHA8I, CL_SIGNED_INT8, CL_R}, + {CM_SURF_FMT_sINTENSITY8I, CL_SIGNED_INT8, CL_R}, + {CM_SURF_FMT_sLUMINANCE8I, CL_SIGNED_INT8, CL_R}, + {CM_SURF_FMT_sLUMINANCE_ALPHA8I, CM_SURF_FMT_sRG8I, CL_RG}, + {CM_SURF_FMT_sDXT6, 500, CL_R}, + {CM_SURF_FMT_DXT6, 500, CL_R}, + {CM_SURF_FMT_DXT7, 500, CL_R}, + {CM_SURF_FMT_LUMINANCE8_SNORM, CL_SNORM_INT8, CL_R}, + {CM_SURF_FMT_LUMINANCE16_SNORM, CL_SNORM_INT16, CL_R}, + {CM_SURF_FMT_INTENSITY8_SNORM, CL_SNORM_INT8, CL_R}, + {CM_SURF_FMT_INTENSITY16_SNORM, CL_SNORM_INT16, CL_R}, + {CM_SURF_FMT_ALPHA8_SNORM, CL_SNORM_INT8, CL_R}, + {CM_SURF_FMT_ALPHA16_SNORM, CL_SNORM_INT16, CL_R}, + {CM_SURF_FMT_LUMINANCE_ALPHA8_SNORM, CL_SNORM_INT8, CL_RG}, + {CM_SURF_FMT_LUMINANCE_ALPHA16_SNORM, CL_SNORM_INT16, CL_RG}, + {CM_SURF_FMT_R8_SNORM, CL_SNORM_INT8, CL_R}, + {CM_SURF_FMT_R16_SNORM, CL_SNORM_INT16, CL_R}, + {CM_SURF_FMT_RG8_SNORM, CL_SNORM_INT8, CL_RG}, + {CM_SURF_FMT_RG16_SNORM, CL_SNORM_INT16, CL_RG}, + {CM_SURF_FMT_RGBX8_SNORM, CL_SNORM_INT8, CL_RGBA}, + {CM_SURF_FMT_RGBX16_SNORM, CL_SNORM_INT16, CL_RGBA}, + {CM_SURF_FMT_RGBA8_SNORM, CL_SNORM_INT8, CL_RGBA}, + {CM_SURF_FMT_RGBA16_SNORM, CL_SNORM_INT16, CL_RGBA}, + {CM_SURF_FMT_RGB10_A2UI, 500, CL_RGBA}, + {CM_SURF_FMT_RGB32F, 500, CL_RGBA}, + {CM_SURF_FMT_RGB32I, 500, CL_RGBA}, + {CM_SURF_FMT_RGB32UI, 500, CL_RGBA}, + {CM_SURF_FMT_RGBX8_SRGB, 500, CL_RGBA}, + {CM_SURF_FMT_RGBA8_SRGB, 500, CL_RGBA}, + {CM_SURF_FMT_DXT1_SRGB, 500, CL_RGBA}, + {CM_SURF_FMT_DXT1A_SRGB, 500, CL_RGBA}, + {CM_SURF_FMT_DXT2_3_SRGB, 500, CL_RGBA}, + {CM_SURF_FMT_DXT4_5_SRGB, 500, CL_RGBA}, + {CM_SURF_FMT_DXT7_SRGB, 500, CL_RGBA}, + {CM_SURF_FMT_RGB8_ETC2, 500, CL_RGB}, + {CM_SURF_FMT_SRGB8_ETC2, 500, CL_RGB}, + {CM_SURF_FMT_RGB8_PT_ALPHA1_ETC2, 500, CL_RGBA}, + {CM_SURF_FMT_SRGB8_PT_ALPHA1_ETC2, 500, CL_RGBA}, + {CM_SURF_FMT_RGBA8_ETC2_EAC, 500, CL_RGBA}, + {CM_SURF_FMT_SRGB8_ALPHA8_ETC2_EAC, 500, CL_RGBA}, + {CM_SURF_FMT_R11_EAC, 500, CL_R}, + {CM_SURF_FMT_SIGNED_R11_EAC, 500, CL_R}, + {CM_SURF_FMT_RG11_EAC, 500, CL_RG}, + {CM_SURF_FMT_SIGNED_RG11_EAC, 500, CL_RG}, + {CM_SURF_FMT_RGBA8_ASTC_4x4, 500, CL_RGBA}, + {CM_SURF_FMT_RGBA8_ASTC_5x4, 500, CL_RGBA}, + {CM_SURF_FMT_RGBA8_ASTC_5x5, 500, CL_RGBA}, + {CM_SURF_FMT_RGBA8_ASTC_6x5, 500, CL_RGBA}, + {CM_SURF_FMT_RGBA8_ASTC_6x6, 500, CL_RGBA}, + {CM_SURF_FMT_RGBA8_ASTC_8x5, 500, CL_RGBA}, + {CM_SURF_FMT_RGBA8_ASTC_8x6, 500, CL_RGBA}, + {CM_SURF_FMT_RGBA8_ASTC_8x8, 500, CL_RGBA}, + {CM_SURF_FMT_RGBA8_ASTC_10x5, 500, CL_RGBA}, + {CM_SURF_FMT_RGBA8_ASTC_10x6, 500, CL_RGBA}, + {CM_SURF_FMT_RGBA8_ASTC_10x8, 500, CL_RGBA}, + {CM_SURF_FMT_RGBA8_ASTC_10x10, 500, CL_RGBA}, + {CM_SURF_FMT_RGBA8_ASTC_12x10, 500, CL_RGBA}, + {CM_SURF_FMT_RGBA8_ASTC_12x12, 500, CL_RGBA}, + {CM_SURF_FMT_SRGBA8_ASTC_4x4, 500, CL_RGBA}, + {CM_SURF_FMT_SRGBA8_ASTC_5x4, 500, CL_RGBA}, + {CM_SURF_FMT_SRGBA8_ASTC_5x5, 500, CL_RGBA}, + {CM_SURF_FMT_SRGBA8_ASTC_6x5, 500, CL_RGBA}, + {CM_SURF_FMT_SRGBA8_ASTC_6x6, 500, CL_RGBA}, + {CM_SURF_FMT_SRGBA8_ASTC_8x5, 500, CL_RGBA}, + {CM_SURF_FMT_SRGBA8_ASTC_8x6, 500, CL_RGBA}, + {CM_SURF_FMT_SRGBA8_ASTC_8x8, 500, CL_RGBA}, + {CM_SURF_FMT_SRGBA8_ASTC_10x5, 500, CL_RGBA}, + {CM_SURF_FMT_SRGBA8_ASTC_10x6, 500, CL_RGBA}, + {CM_SURF_FMT_SRGBA8_ASTC_10x8, 500, CL_RGBA}, + {CM_SURF_FMT_SRGBA8_ASTC_10x10, 500, CL_RGBA}, + {CM_SURF_FMT_SRGBA8_ASTC_12x10, 500, CL_RGBA}, + {CM_SURF_FMT_SRGBA8_ASTC_12x12, 500, CL_RGBA}, + {CM_SURF_FMT_BGR10_A2UI, 500, CL_BGRA}, + {CM_SURF_FMT_A2_BGR10UI, 500, CL_ARGB}, + {CM_SURF_FMT_A2_RGB10UI, 500, CL_ABGR}, + {CM_SURF_FMT_B5_G6_R5UI, 500, CL_BGRA}, + {CM_SURF_FMT_R5_G6_B5UI, 500, CL_RGBA}, + {CM_SURF_FMT_DEPTH32F_X24_STEN8_UNCLAMPED, CL_UNSIGNED_INT32, CL_R}, + {CM_SURF_FMT_DEPTH32F_UNCLAMPED, CL_FLOAT, CL_R}, + {CM_SURF_FMT_L8_X16_A8_SRGB, 500, CL_RGBA}, + {CM_SURF_FMT_L8_X24_SRGB, 500, CL_RGBA}, + {CM_SURF_FMT_STENCIL8, CL_UNSIGNED_INT8, CL_R}, }; bool Device::initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const { @@ -557,8 +656,8 @@ bool Device::initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceCont return false; } - pfnMesaGLInteropGLXQueryDeviceInfo = (PFNMesaGLInteropGLXQueryDeviceInfo)dlsym( - pModule, "MesaGLInteropGLXQueryDeviceInfo"); + pfnMesaGLInteropGLXQueryDeviceInfo = + (PFNMesaGLInteropGLXQueryDeviceInfo)dlsym(pModule, "MesaGLInteropGLXQueryDeviceInfo"); if (nullptr == pfnMesaGLInteropGLXQueryDeviceInfo) { return false; } @@ -634,17 +733,17 @@ bool Device::glCanInterop(void* GLplatformContext, void* GLdeviceContext) const ((1 << properties().gpuIndex) == glChainBitMask); } #else - GLuint glDeviceId = 0 ; - GLuint glChainMask = 0 ; + GLuint glDeviceId = 0; + GLuint glChainMask = 0; GLXContext ctx = static_cast(GLplatformContext); Display* disp = static_cast(GLdeviceContext); if (glXGetContextMVPUInfoAMD(ctx, &glDeviceId, &glChainMask)) { - mesa_glinterop_device_info info = {}; + mesa_glinterop_device_info info = {}; if (pfnMesaGLInteropGLXQueryDeviceInfo(disp, ctx, &info) == 0) { - // match the adapter - canInteroperate = (properties().pciProperties.busNumber == info.pci_bus) && + // match the adapter + canInteroperate = (properties().pciProperties.busNumber == info.pci_bus) && (properties().pciProperties.deviceNumber == info.pci_device) && (properties().pciProperties.functionNumber == info.pci_function) && (static_cast(1 << properties().gpuIndex) == glChainMask); @@ -749,7 +848,7 @@ bool Device::resGLAssociate(void* GLContext, uint name, uint type, Pal::OsExtern return status; } assert(static_cast(hData.format) == cmFormatXlateTable[index].raw_cmFormat); - cl_channel_type imageDataType; + cl_channel_type imageDataType; imageDataType = cmFormatXlateTable[index].image_channel_data_type; if (imageDataType == 500) { LogError("\nGL surface is not supported by OCL\n"); @@ -819,4 +918,4 @@ bool Device::resGLFree(void* GLplatformContext, void* mbResHandle, uint type) co #endif } -} // pal +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp b/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp index 277d8dec86..ac6ee980be 100644 --- a/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp @@ -32,34 +32,27 @@ #include "protocols/rgpServer.h" #include "protocols/driverControlServer.h" -namespace pal -{ +namespace pal { // ================================================================================================ RgpCaptureMgr::RgpCaptureMgr(Pal::IPlatform* platform, const Device& device) - : - device_(device), - dev_driver_server_(platform->GetDevDriverServer()), - user_event_(nullptr), - num_prep_disp_(0), - max_sqtt_disp_(device_.settings().rgpSqttDispCount_), - trace_gpu_mem_limit_(0), - global_disp_count_(1), // Must start from 1 according to RGP spec - trace_enabled_(false), - inst_tracing_enabled_(false) -{ + : device_(device), + dev_driver_server_(platform->GetDevDriverServer()), + user_event_(nullptr), + num_prep_disp_(0), + max_sqtt_disp_(device_.settings().rgpSqttDispCount_), + trace_gpu_mem_limit_(0), + global_disp_count_(1), // Must start from 1 according to RGP spec + trace_enabled_(false), + inst_tracing_enabled_(false) { memset(&trace_, 0, sizeof(trace_)); } // ================================================================================================ -RgpCaptureMgr::~RgpCaptureMgr() -{ - DestroyRGPTracing(); -} +RgpCaptureMgr::~RgpCaptureMgr() { DestroyRGPTracing(); } // ================================================================================================ // Creates the GPU Open Developer Mode manager class. -RgpCaptureMgr* RgpCaptureMgr::Create(Pal::IPlatform* platform, const Device& device) -{ +RgpCaptureMgr* RgpCaptureMgr::Create(Pal::IPlatform* platform, const Device& device) { RgpCaptureMgr* mgr = new RgpCaptureMgr(platform, device); if (mgr != nullptr && !mgr->Init(platform)) { @@ -71,8 +64,7 @@ RgpCaptureMgr* RgpCaptureMgr::Create(Pal::IPlatform* platform, const Device& dev } // ================================================================================================ -bool RgpCaptureMgr::Init(Pal::IPlatform* platform) -{ +bool RgpCaptureMgr::Init(Pal::IPlatform* platform) { if (dev_driver_server_ == nullptr) { return false; } @@ -105,13 +97,11 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform) const uint32_t api_version = settings.oclVersion_; - trace_.gpa_session_ = new GpuUtil::GpaSession( - platform, - device_.iDev(), - api_version >> 4, // OCL API version major - api_version & 0xf, // OCL API version minor - RgpSqttInstrumentationSpecVersion, - RgpSqttInstrumentationApiVersion); + trace_.gpa_session_ = new GpuUtil::GpaSession(platform, device_.iDev(), + api_version >> 4, // OCL API version major + api_version & 0xf, // OCL API version minor + RgpSqttInstrumentationSpecVersion, + RgpSqttInstrumentationApiVersion); if (trace_.gpa_session_ == nullptr) { result = false; @@ -119,7 +109,7 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform) } // Initialize the GPA session - if (result && (trace_.gpa_session_->Init() != Pal::Result::Success)) { + if (result && (trace_.gpa_session_->Init() != Pal::Result::Success)) { result = false; } @@ -133,9 +123,9 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform) if (!result) { // If we've failed to initialize tracing, permanently disable traces if (rgp_server_ != nullptr) { - rgp_server_->DisableTraces(); + rgp_server_->DisableTraces(); - trace_enabled_ = false; + trace_enabled_ = false; } // Clean up if we failed @@ -150,9 +140,8 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform) // ================================================================================================ // This function finds out all the queues in the device that we have to synchronize for RGP-traced // frames and initializes resources for them. -bool RgpCaptureMgr::RegisterTimedQueue( - uint32_t queue_id, Pal::IQueue* iQueue, bool* debug_vmid) const -{ +bool RgpCaptureMgr::RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue, + bool* debug_vmid) const { bool result = true; // Get the OS context handle for this queue (this is a thing that RGP needs on DX clients; @@ -166,8 +155,8 @@ bool RgpCaptureMgr::RegisterTimedQueue( *debug_vmid = kernelContextInfo.flags.hasDebugVmid; // Register the queue with the GPA session class for timed queue operation support. - if (trace_.gpa_session_->RegisterTimedQueue(iQueue, queue_id, - kernelContextInfo.contextIdentifier) != Pal::Result::Success) { + if (trace_.gpa_session_->RegisterTimedQueue( + iQueue, queue_id, kernelContextInfo.contextIdentifier) != Pal::Result::Success) { result = false; } @@ -175,11 +164,8 @@ bool RgpCaptureMgr::RegisterTimedQueue( } // ================================================================================================ -Pal::Result RgpCaptureMgr::TimedQueueSubmit( - Pal::IQueue* queue, - uint64_t cmdId, - const Pal::SubmitInfo& submitInfo) const -{ +Pal::Result RgpCaptureMgr::TimedQueueSubmit(Pal::IQueue* queue, uint64_t cmdId, + const Pal::SubmitInfo& submitInfo) const { // Fill in extra meta-data information to associate the API command buffer data with // the generated timing information. GpuUtil::TimedSubmitInfo timedSubmitInfo = {}; @@ -205,8 +191,7 @@ Pal::Result RgpCaptureMgr::TimedQueueSubmit( // Called during initial device enumeration prior to calling Pal::IDevice::CommitSettingsAndInit(). // // This finalizes the developer driver manager. -void RgpCaptureMgr::Finalize() -{ +void RgpCaptureMgr::Finalize() { // Figure out if the gfxip supports tracing. We decide tracing if there is at least one // enumerated GPU that can support tracing. Since we don't yet know if that GPU will be // picked as the target of an eventual VkDevice, this check is imperfect. @@ -215,8 +200,8 @@ void RgpCaptureMgr::Finalize() bool hw_support_tracing = false; if ((rgp_server_->EnableTraces() == DevDriver::Result::Success)) { - if (GpuSupportsTracing(device_.properties(), device_.settings())) { - hw_support_tracing = true; + if (GpuSupportsTracing(device_.properties(), device_.settings())) { + hw_support_tracing = true; } } @@ -234,20 +219,18 @@ void RgpCaptureMgr::Finalize() // ================================================================================================ // Waits for the driver to be resumed if it's currently paused. -void RgpCaptureMgr::WaitForDriverResume() -{ - auto* pDriverControlServer = dev_driver_server_->GetDriverControlServer(); +void RgpCaptureMgr::WaitForDriverResume() { + auto* pDriverControlServer = dev_driver_server_->GetDriverControlServer(); - assert(pDriverControlServer != nullptr); + assert(pDriverControlServer != nullptr); - pDriverControlServer->WaitForDriverResume(); + pDriverControlServer->WaitForDriverResume(); } // ================================================================================================ // Called before a swap chain presents. This signals a frame-end boundary and // is used to coordinate RGP trace start/stop. -void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu) -{ +void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu) { if (rgp_server_->TracesEnabled()) { // If there's currently a trace running, submit the trace-end command buffer if (trace_.status_ == TraceStatus::Running) { @@ -257,8 +240,7 @@ void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu) Pal::Result res = EndRGPHardwareTrace(gpu); if (Pal::Result::ErrorIncompatibleQueue == res) { // continue until we find the right queue... - } - else if (Pal::Result::Success == res) { + } else if (Pal::Result::Success == res) { trace_.sqtt_disp_count_ = 0; } else { FinishRGPTrace(gpu, true); @@ -272,43 +254,42 @@ void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu) // Currently nothing in the PresentInfo struct is used for inserting a timed present marker. GpuUtil::TimedQueuePresentInfo timedPresentInfo = {}; - //Pal::Result result = trace_.gpa_session_->TimedQueuePresent(pPalQueue, timedPresentInfo); - //assert(result == Pal::Result::Success); + // Pal::Result result = trace_.gpa_session_->TimedQueuePresent(pPalQueue, timedPresentInfo); + // assert(result == Pal::Result::Success); } } } // ================================================================================================ -Pal::Result RgpCaptureMgr::CheckForTraceResults() -{ +Pal::Result RgpCaptureMgr::CheckForTraceResults() { assert(trace_.status_ == TraceStatus::WaitingForResults); Pal::Result result = Pal::Result::NotReady; // Check if trace results are ready - if (trace_.gpa_session_->IsReady() && // GPA session is ready - (trace_.begin_queue_->isDone(&trace_.end_event_))) // "Trace end" cmdbuf has retired + if (trace_.gpa_session_->IsReady() && // GPA session is ready + (trace_.begin_queue_->isDone(&trace_.end_event_))) // "Trace end" cmdbuf has retired { bool success = false; // Fetch required trace data size from GPA session size_t traceDataSize = 0; - void* pTraceData = nullptr; + void* pTraceData = nullptr; trace_.gpa_session_->GetResults(trace_.gpa_sample_id_, &traceDataSize, nullptr); // Allocate memory for trace data if (traceDataSize > 0) { - pTraceData = amd::AlignedMemory::allocate(traceDataSize, 256); + pTraceData = amd::AlignedMemory::allocate(traceDataSize, 256); } if (pTraceData != nullptr) { // Get trace data from GPA session if (trace_.gpa_session_->GetResults(trace_.gpa_sample_id_, &traceDataSize, pTraceData) == - Pal::Result::Success) { + Pal::Result::Success) { // Transmit trace data to anyone who's listening - auto devResult = rgp_server_->WriteTraceData( - static_cast(pTraceData), traceDataSize); + auto devResult = + rgp_server_->WriteTraceData(static_cast(pTraceData), traceDataSize); success = (devResult == DevDriver::Result::Success); } @@ -317,7 +298,7 @@ Pal::Result RgpCaptureMgr::CheckForTraceResults() } if (success) { - result = Pal::Result::Success; + result = Pal::Result::Success; } } @@ -327,9 +308,8 @@ Pal::Result RgpCaptureMgr::CheckForTraceResults() // ================================================================================================ // Called after a swap chain presents. This signals a (next) frame-begin boundary and is // used to coordinate RGP trace start/stop. -void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, - size_t x, size_t y, size_t z) -{ +void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, size_t x, size_t y, + size_t z) { // Wait for the driver to be resumed in case it's been paused. WaitForDriverResume(); @@ -347,8 +327,7 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, } } } - } - else if (trace_.status_ == TraceStatus::Preparing) { + } else if (trace_.status_ == TraceStatus::Preparing) { // Wait some number of "preparation frames" before starting the trace in order to get enough // timer samples to sync CPU/GPU clock domains. trace_.prepared_disp_count_++; @@ -370,7 +349,7 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, // Check if we're ending a trace waiting for SQTT to turn off. // If SQTT has turned off, end the trace else if (trace_.status_ == TraceStatus::WaitingForSqtt) { - Pal::Result result = Pal::Result::Success; + Pal::Result result = Pal::Result::Success; if (trace_.begin_queue_->isDone(&trace_.end_sqtt_event_)) { result = EndRGPTrace(gpu); @@ -401,14 +380,17 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, RgpSqttMarkerEventType apiEvent = RgpSqttMarkerEventType::CmdNDRangeKernel; if (kernel.prog().isInternal()) { constexpr RgpSqttMarkerEventType ApiEvents[KernelBlitManager::BlitTotal] = { - RgpSqttMarkerEventType::CmdCopyImage, RgpSqttMarkerEventType::CmdCopyImage, - RgpSqttMarkerEventType::CmdCopyImageToBuffer, - RgpSqttMarkerEventType::CmdCopyBufferToImage, - RgpSqttMarkerEventType::CmdCopyBuffer, RgpSqttMarkerEventType::CmdCopyBuffer, - RgpSqttMarkerEventType::CmdCopyBuffer, RgpSqttMarkerEventType::CmdCopyBuffer, - RgpSqttMarkerEventType::CmdFillBuffer, RgpSqttMarkerEventType::CmdFillImage, - RgpSqttMarkerEventType::CmdScheduler - }; + RgpSqttMarkerEventType::CmdCopyImage, + RgpSqttMarkerEventType::CmdCopyImage, + RgpSqttMarkerEventType::CmdCopyImageToBuffer, + RgpSqttMarkerEventType::CmdCopyBufferToImage, + RgpSqttMarkerEventType::CmdCopyBuffer, + RgpSqttMarkerEventType::CmdCopyBuffer, + RgpSqttMarkerEventType::CmdCopyBuffer, + RgpSqttMarkerEventType::CmdCopyBuffer, + RgpSqttMarkerEventType::CmdFillBuffer, + RgpSqttMarkerEventType::CmdFillImage, + RgpSqttMarkerEventType::CmdScheduler}; for (uint i = 0; i < KernelBlitManager::BlitTotal; ++i) { if (kernel.name().compare(BlitName[i]) == 0) { apiEvent = ApiEvents[i]; @@ -418,8 +400,8 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, } WriteUserEventMarker(gpu, RgpSqttMarkerUserEventObjectName, kernel.name()); // Write disaptch marker - WriteEventWithDimsMarker(gpu, apiEvent, - static_cast(x), static_cast(y), static_cast(z)); + WriteEventWithDimsMarker(gpu, apiEvent, static_cast(x), static_cast(y), + static_cast(z)); } } @@ -428,11 +410,11 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, // ================================================================================================ // This function starts preparing for an RGP trace. Preparation involves some N frames of -// lead-up time during which timing samples are accumulated to synchronize CPU and GPU clock domains. +// lead-up time during which timing samples are accumulated to synchronize CPU and GPU clock +// domains. // // This function transitions from the Idle state to the Preparing state. -Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu) -{ +Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu) { assert(trace_.status_ == TraceStatus::Idle); // We can only trace using a single device at a time currently, so recreate RGP trace @@ -441,32 +423,32 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu) const auto traceParameters = rgp_server_->QueryTraceParameters(); - num_prep_disp_ = traceParameters.captureStartIndex; + num_prep_disp_ = traceParameters.captureStartIndex; uint32_t capture_disp = traceParameters.captureStopIndex - traceParameters.captureStartIndex; // Validate if the captured dispatches are in the range if ((capture_disp > 0) && (capture_disp < max_sqtt_disp_)) { max_sqtt_disp_ = capture_disp; } - trace_gpu_mem_limit_ = traceParameters.gpuMemoryLimitInMb * 1024 * 1024; + trace_gpu_mem_limit_ = traceParameters.gpuMemoryLimitInMb * 1024 * 1024; inst_tracing_enabled_ = traceParameters.flags.enableInstructionTokens; // Notify the RGP server that we are starting a trace if (rgp_server_->BeginTrace() != DevDriver::Result::Success) { - result = Pal::Result::ErrorUnknown; + result = Pal::Result::ErrorUnknown; } // Tell the GPA session class we're starting a trace if (result == Pal::Result::Success) { GpuUtil::GpaSessionBeginInfo info = {}; - info.flags.enableQueueTiming = true;// trace_.queueTimingEnabled; + info.flags.enableQueueTiming = true; // trace_.queueTimingEnabled; result = trace_.gpa_session_->Begin(info); } trace_.prepared_disp_count_ = 0; - trace_.sqtt_disp_count_ = 0; + trace_.sqtt_disp_count_ = 0; // Sample the timing clocks prior to starting a trace. if (result == Pal::Result::Success) { @@ -476,7 +458,7 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu) if (result == Pal::Result::Success) { // Remember which queue started the trace trace_.prepare_queue_ = gpu; - trace_.begin_queue_ = nullptr; + trace_.begin_queue_ = nullptr; trace_.status_ = TraceStatus::Preparing; } else { @@ -497,8 +479,7 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu) // the "begin trace" information command buffer. // // This function transitions from the Preparing state to the Running state. -Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu) -{ +Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu) { assert(trace_.status_ == TraceStatus::Preparing); assert(trace_enabled_); @@ -526,8 +507,8 @@ Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu) // Fill GPU commands gpu->eventBegin(MainEngine); - trace_.gpa_sample_id_ = trace_.gpa_session_->BeginSample( - gpu->queue(MainEngine).iCmd(), sampleConfig); + trace_.gpa_sample_id_ = + trace_.gpa_session_->BeginSample(gpu->queue(MainEngine).iCmd(), sampleConfig); gpu->eventEnd(MainEngine, trace_.begin_sqtt_event_); } @@ -540,7 +521,7 @@ Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu) // Make the trace active and remember which queue started it if (result == Pal::Result::Success) { - trace_.status_ = TraceStatus::Running; + trace_.status_ = TraceStatus::Running; trace_.begin_queue_ = gpu; } @@ -551,8 +532,7 @@ Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu) // This function submits the command buffer to stop SQTT tracing. Full tracing still continues. // // This function transitions from the Running state to the WaitingForSqtt state. -Pal::Result RgpCaptureMgr::EndRGPHardwareTrace(VirtualGPU* gpu) -{ +Pal::Result RgpCaptureMgr::EndRGPHardwareTrace(VirtualGPU* gpu) { assert(trace_.status_ == TraceStatus::Running); Pal::Result result = Pal::Result::Success; @@ -593,8 +573,7 @@ Pal::Result RgpCaptureMgr::EndRGPHardwareTrace(VirtualGPU* gpu) // This function ends a running RGP trace. // // This function transitions from the WaitingForSqtt state to WaitingForResults state. -Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu) -{ +Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu) { assert(trace_.status_ == TraceStatus::WaitingForSqtt); Pal::Result result = Pal::Result::Success; @@ -629,8 +608,7 @@ Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu) // ================================================================================================ // This function resets and possibly cancels a currently active (between begin/end) RGP trace. // It frees any dependent resources. -void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted) -{ +void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted) { if (trace_.prepare_queue_ == nullptr) { return; } @@ -654,26 +632,25 @@ void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted) // Reset tracing state to idle trace_.prepared_disp_count_ = 0; - trace_.sqtt_disp_count_ = 0; - trace_.gpa_sample_id_ = 0; - trace_.status_ = TraceStatus::Idle; - trace_.prepare_queue_ = nullptr; - trace_.begin_queue_ = nullptr; + trace_.sqtt_disp_count_ = 0; + trace_.gpa_sample_id_ = 0; + trace_.status_ = TraceStatus::Idle; + trace_.prepare_queue_ = nullptr; + trace_.begin_queue_ = nullptr; } // ================================================================================================ // Destroys device-persistent RGP resources -void RgpCaptureMgr::DestroyRGPTracing() -{ +void RgpCaptureMgr::DestroyRGPTracing() { if (trace_.status_ != TraceStatus::Idle) { - FinishRGPTrace(nullptr, true); + FinishRGPTrace(nullptr, true); } delete user_event_; // Destroy the GPA session if (trace_.gpa_session_ != nullptr) { - //Util::Destructor(trace_.gpa_session_); + // Util::Destructor(trace_.gpa_session_); delete trace_.gpa_session_; trace_.gpa_session_ = nullptr; } @@ -683,18 +660,15 @@ void RgpCaptureMgr::DestroyRGPTracing() // ================================================================================================ // Returns true if the given device properties/settings support tracing. -bool RgpCaptureMgr::GpuSupportsTracing( - const Pal::DeviceProperties& props, - const Settings& settings) -{ +bool RgpCaptureMgr::GpuSupportsTracing(const Pal::DeviceProperties& props, + const Settings& settings) { return props.gfxipProperties.flags.supportRgpTraces && !settings.rgpSqttForceDisable_; } // ================================================================================================ // Called when a new device is created. This will preallocate reusable RGP trace resources // for that device. -void RgpCaptureMgr::PostDeviceCreate() -{ +void RgpCaptureMgr::PostDeviceCreate() { amd::ScopedLock traceLock(&trace_mutex_); auto* pDriverControlServer = dev_driver_server_->GetDriverControlServer(); @@ -714,8 +688,7 @@ void RgpCaptureMgr::PostDeviceCreate() // ================================================================================================ // Called prior to a device's being destroyed. This will free persistent RGP trace resources for // that device. -void RgpCaptureMgr::PreDeviceDestroy() -{ +void RgpCaptureMgr::PreDeviceDestroy() { amd::ScopedLock traceLock(&trace_mutex_); // If we are idle, we can re-initialize trace resources based on the new device. if (trace_.status_ == TraceStatus::Idle) { @@ -725,9 +698,8 @@ void RgpCaptureMgr::PreDeviceDestroy() // ================================================================================================ // Sets up an Event marker's basic data. -RgpSqttMarkerEvent RgpCaptureMgr::BuildEventMarker( - const VirtualGPU* gpu, RgpSqttMarkerEventType api_type) const -{ +RgpSqttMarkerEvent RgpCaptureMgr::BuildEventMarker(const VirtualGPU* gpu, + RgpSqttMarkerEventType api_type) const { RgpSqttMarkerEvent marker = {}; marker.identifier = RgpSqttMarkerIdentifierEvent; @@ -739,24 +711,19 @@ RgpSqttMarkerEvent RgpCaptureMgr::BuildEventMarker( } // ================================================================================================ -void RgpCaptureMgr::WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const -{ +void RgpCaptureMgr::WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const { assert((data_size % sizeof(uint32_t)) == 0); assert((data_size / sizeof(uint32_t)) > 0); - gpu->queue(MainEngine).iCmd()->CmdInsertRgpTraceMarker( - static_cast(data_size / sizeof(uint32_t)), data); + gpu->queue(MainEngine) + .iCmd() + ->CmdInsertRgpTraceMarker(static_cast(data_size / sizeof(uint32_t)), data); } // ================================================================================================ // Inserts an RGP pre-dispatch marker -void RgpCaptureMgr::WriteEventWithDimsMarker( - const VirtualGPU* gpu, - RgpSqttMarkerEventType apiType, - uint32_t x, - uint32_t y, - uint32_t z) const -{ +void RgpCaptureMgr::WriteEventWithDimsMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType apiType, + uint32_t x, uint32_t y, uint32_t z) const { assert(apiType != RgpSqttMarkerEventType::Invalid); RgpSqttMarkerEventWithDims eventWithDims = {}; @@ -771,26 +738,24 @@ void RgpCaptureMgr::WriteEventWithDimsMarker( } // ================================================================================================ -void RgpCaptureMgr::WriteBarrierStartMarker( - const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const -{ +void RgpCaptureMgr::WriteBarrierStartMarker(const VirtualGPU* gpu, + const Pal::Developer::BarrierData& data) const { if (rgp_server_->TracesEnabled() && (trace_.status_ == TraceStatus::Running)) { amd::ScopedLock traceLock(&trace_mutex_); RgpSqttMarkerBarrierStart marker = {}; marker.identifier = RgpSqttMarkerIdentifierBarrierStart; - marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId(); - marker.dword02 = data.reason; - marker.internal = true; + marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId(); + marker.dword02 = data.reason; + marker.internal = true; WriteMarker(gpu, &marker, sizeof(marker)); } } // ================================================================================================ -void RgpCaptureMgr::WriteBarrierEndMarker( - const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const -{ +void RgpCaptureMgr::WriteBarrierEndMarker(const VirtualGPU* gpu, + const Pal::Developer::BarrierData& data) const { if (rgp_server_->TracesEnabled() && (trace_.status_ == TraceStatus::Running)) { amd::ScopedLock traceLock(&trace_mutex_); // Copy the operations part and include the same data from previous markers @@ -799,28 +764,28 @@ void RgpCaptureMgr::WriteBarrierEndMarker( auto operations = data.operations; operations.pipelineStalls.u16All |= 0; - operations.caches.u16All |= 0; + operations.caches.u16All |= 0; RgpSqttMarkerBarrierEnd marker = {}; - marker.identifier = RgpSqttMarkerIdentifierBarrierEnd; - marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId(); + marker.identifier = RgpSqttMarkerIdentifierBarrierEnd; + marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId(); - marker.waitOnEopTs = operations.pipelineStalls.waitOnEopTsBottomOfPipe; - marker.vsPartialFlush = operations.pipelineStalls.vsPartialFlush; - marker.psPartialFlush = operations.pipelineStalls.psPartialFlush; - marker.csPartialFlush = operations.pipelineStalls.csPartialFlush; - marker.pfpSyncMe = operations.pipelineStalls.pfpSyncMe; - marker.syncCpDma = operations.pipelineStalls.syncCpDma; - marker.invalTcp = operations.caches.invalTcp; - marker.invalSqI = operations.caches.invalSqI$; - marker.invalSqK = operations.caches.invalSqK$; - marker.flushTcc = operations.caches.flushTcc; - marker.invalTcc = operations.caches.invalTcc; - marker.flushCb = operations.caches.flushCb; - marker.invalCb = operations.caches.invalCb; - marker.flushDb = operations.caches.flushDb; - marker.invalDb = operations.caches.invalDb; + marker.waitOnEopTs = operations.pipelineStalls.waitOnEopTsBottomOfPipe; + marker.vsPartialFlush = operations.pipelineStalls.vsPartialFlush; + marker.psPartialFlush = operations.pipelineStalls.psPartialFlush; + marker.csPartialFlush = operations.pipelineStalls.csPartialFlush; + marker.pfpSyncMe = operations.pipelineStalls.pfpSyncMe; + marker.syncCpDma = operations.pipelineStalls.syncCpDma; + marker.invalTcp = operations.caches.invalTcp; + marker.invalSqI = operations.caches.invalSqI$; + marker.invalSqK = operations.caches.invalSqK$; + marker.flushTcc = operations.caches.flushTcc; + marker.invalTcc = operations.caches.invalTcc; + marker.flushCb = operations.caches.flushCb; + marker.invalCb = operations.caches.invalCb; + marker.flushDb = operations.caches.flushDb; + marker.invalDb = operations.caches.invalDb; marker.numLayoutTransitions = 0; @@ -830,9 +795,9 @@ void RgpCaptureMgr::WriteBarrierEndMarker( // ================================================================================================ // Inserts a user event string marker -void RgpCaptureMgr::WriteUserEventMarker( - const VirtualGPU* gpu, RgpSqttMarkerUserEventType eventType, const std::string& name) const -{ +void RgpCaptureMgr::WriteUserEventMarker(const VirtualGPU* gpu, + RgpSqttMarkerUserEventType eventType, + const std::string& name) const { memset(user_event_, 0, sizeof(RgpSqttMarkerUserEventWithString)); user_event_->header.identifier = RgpSqttMarkerIdentifierUserEvent; @@ -841,7 +806,8 @@ void RgpCaptureMgr::WriteUserEventMarker( size_t markerSize = sizeof(user_event_->header); if ((eventType != RgpSqttMarkerUserEventPop)) { - size_t strLength = std::min(name.size(), RgpSqttMaxUserEventStringLengthInDwords * sizeof(uint32_t)); + size_t strLength = + std::min(name.size(), RgpSqttMaxUserEventStringLengthInDwords * sizeof(uint32_t)); for (uint32_t charIdx = 0; charIdx < strLength; ++charIdx) { uint32_t c = static_cast(name[charIdx]); user_event_->stringData[charIdx / 4] |= (c << (8 * (charIdx % 4))); @@ -859,4 +825,4 @@ void RgpCaptureMgr::WriteUserEventMarker( } -}; // namespace vk +}; // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp b/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp index 52789a581e..af56f6efd3 100644 --- a/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp @@ -34,42 +34,36 @@ #include "gpuopen.h" // PAL forward declarations -namespace Pal -{ -class ICmdBuffer; -class IFence; -class IQueueSemaphore; +namespace Pal { +class ICmdBuffer; +class IFence; +class IQueueSemaphore; struct PalPublicSettings; -} +} // namespace Pal // GpuUtil forward declarations -namespace GpuUtil -{ +namespace GpuUtil { class GpaSession; }; // GPUOpen forward declarations -namespace DevDriver -{ +namespace DevDriver { class DevDriverServer; class IMsgChannel; struct MessageBuffer; -namespace DriverControlProtocol -{ +namespace DriverControlProtocol { enum struct DeviceClockMode : uint32_t; class HandlerServer; -} +} // namespace DriverControlProtocol -namespace SettingsProtocol -{ +namespace SettingsProtocol { class HandlerServer; } -} +} // namespace DevDriver -namespace pal -{ +namespace pal { class Settings; class Device; class VirtualGPU; @@ -77,8 +71,7 @@ class HSAILKernel; // ================================================================================================ // RgpSqttMarkerIdentifier - Identifiers for RGP SQ thread-tracing markers (Table 1) -enum RgpSqttMarkerIdentifier : uint32_t -{ +enum RgpSqttMarkerIdentifier : uint32_t { RgpSqttMarkerIdentifierEvent = 0x0, RgpSqttMarkerIdentifierCbStart = 0x1, RgpSqttMarkerIdentifierCbEnd = 0x2, @@ -98,8 +91,7 @@ enum RgpSqttMarkerIdentifier : uint32_t }; // ================================================================================================ -enum class RgpSqttMarkerEventType : uint32_t -{ +enum class RgpSqttMarkerEventType : uint32_t { CmdNDRangeKernel = 0, CmdScheduler = 1, CmdCopyBuffer = 2, @@ -114,8 +106,7 @@ enum class RgpSqttMarkerEventType : uint32_t }; // ================================================================================================ -enum class RgpSqqtBarrierReason : uint32_t -{ +enum class RgpSqqtBarrierReason : uint32_t { Invalid = 0, MemDependency = 0xC0000000, ProfilingControl = 0xC0000001, @@ -125,129 +116,116 @@ enum class RgpSqqtBarrierReason : uint32_t }; // ================================================================================================ -// RgpSqttMarkerEvent - "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker. +// RgpSqttMarkerEvent - "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker. // These are generated ahead of draws or dispatches for commands that trigger generation of waves // i.e. draws/dispatches (Table 4). -struct RgpSqttMarkerEvent -{ - union - { - struct - { - uint32_t identifier : 4; // Identifier for this marker - uint32_t extDwords : 3; // Number of extra dwords following this marker - uint32_t apiType : 24; // The API type for this command - uint32_t hasThreadDims : 1; // Whether thread dimensions are included +struct RgpSqttMarkerEvent { + union { + struct { + uint32_t identifier : 4; // Identifier for this marker + uint32_t extDwords : 3; // Number of extra dwords following this marker + uint32_t apiType : 24; // The API type for this command + uint32_t hasThreadDims : 1; // Whether thread dimensions are included }; - uint32_t dword01; // The first dword + uint32_t dword01; // The first dword }; - union - { - // Some information about the vertex/instance/draw register indices. These values are not + union { + // Some information about the vertex/instance/draw register indices. These values are not // always valid because they are not available for one reason or another: // // - If vertex offset index or instance offset index are not (together) valid, they are both // equal to 0 // - If draw index is not valid, it is equal to the vertex offset index - struct - { - uint32_t cbID : 20; // Command buffer ID for this marker + struct { + uint32_t cbID : 20; // Command buffer ID for this marker uint32_t vertexOffsetRegIdx : 4; // SPI userdata register index for the first vertex offset - uint32_t instanceOffsetRegIdx : 4; // SPI userdata register index for the first instance offset - uint32_t drawIndexRegIdx : 4; // SPI userdata register index for the draw index (multi draw indirect) + uint32_t + instanceOffsetRegIdx : 4; // SPI userdata register index for the first instance offset + uint32_t drawIndexRegIdx : 4; // SPI userdata register index for the draw index (multi draw + // indirect) }; - uint32_t dword02; // The second dword + uint32_t dword02; // The second dword }; - union - { - uint32_t cmdID; // Command index within the command buffer - uint32_t dword03; // The third dword + union { + uint32_t cmdID; // Command index within the command buffer + uint32_t dword03; // The third dword }; }; // ================================================================================================ // RgpSqttMarkerEventWithDims - Per-dispatch specific marker where workgroup dims are included -struct RgpSqttMarkerEventWithDims -{ - RgpSqttMarkerEvent event; // Per-draw/dispatch marker. API type should be Dispatch, threadDim = 1 - uint32_t threadX; // Work group count in X - uint32_t threadY; // Work group count in Y - uint32_t threadZ; // Work group count in Z +struct RgpSqttMarkerEventWithDims { + RgpSqttMarkerEvent + event; // Per-draw/dispatch marker. API type should be Dispatch, threadDim = 1 + uint32_t threadX; // Work group count in X + uint32_t threadY; // Work group count in Y + uint32_t threadZ; // Work group count in Z }; // ================================================================================================ // RgpSqttMarkerBarrierStart - "Barrier Start" RGP SQTT instrumentation marker (Table 5) -struct RgpSqttMarkerBarrierStart -{ - union - { - struct - { +struct RgpSqttMarkerBarrierStart { + union { + struct { uint32_t identifier : 4; // Identifier for this marker uint32_t extDwords : 3; // Number of extra dwords following this marker uint32_t cbId : 20; // Command buffer ID within queue uint32_t reserved : 5; // Reserved }; - uint32_t dword01; // The first dword + uint32_t dword01; // The first dword }; - union - { - struct - { + union { + struct { uint32_t driverReason : 31; - uint32_t internal: 1; + uint32_t internal : 1; }; - uint32_t dword02; // The second dword + uint32_t dword02; // The second dword }; }; // ================================================================================================ // RgpSqttMarkerBarrierEnd - "Barrier End" RGP SQTT instrumentation marker (Table 6) -struct RgpSqttMarkerBarrierEnd -{ - union - { - struct - { - uint32_t identifier : 4; // Identifier for this marker - uint32_t extDwords : 3; // Number of extra dwords following this marker - uint32_t cbId : 20; // Command buffer ID within queue - uint32_t waitOnEopTs : 1; // Issued EOP_TS VGT event followed by a WAIT_REG_MEM for that timestamp - // to be written. Quintessential full pipeline stall. +struct RgpSqttMarkerBarrierEnd { + union { + struct { + uint32_t identifier : 4; // Identifier for this marker + uint32_t extDwords : 3; // Number of extra dwords following this marker + uint32_t cbId : 20; // Command buffer ID within queue + uint32_t waitOnEopTs : 1; // Issued EOP_TS VGT event followed by a WAIT_REG_MEM for that + // timestamp to be written. Quintessential full pipeline stall. uint32_t vsPartialFlush : 1; // Stall at ME waiting for all prior VS waves to complete. uint32_t psPartialFlush : 1; // Stall at ME waiting for all prior PS waves to complete. uint32_t csPartialFlush : 1; // Stall at ME waiting for all prior CS waves to complete. - uint32_t pfpSyncMe : 1; // Stall PFP until ME is at same point in command stream. + uint32_t pfpSyncMe : 1; // Stall PFP until ME is at same point in command stream. }; - uint32_t dword01; // The first dword + uint32_t dword01; // The first dword }; - union - { - struct - { - uint32_t syncCpDma : 1; // Issue dummy CP-DMA command to confirm all prior CP-DMAs have completed. + union { + struct { + uint32_t + syncCpDma : 1; // Issue dummy CP-DMA command to confirm all prior CP-DMAs have completed. uint32_t invalTcp : 1; // Invalidate the L1 vector caches. uint32_t invalSqI : 1; // Invalidate the SQ instruction caches uint32_t invalSqK : 1; // Invalidate the SQ constant caches (i.e. L1 scalar caches) uint32_t flushTcc : 1; // Flush L2 uint32_t invalTcc : 1; // Invalidate L2 - uint32_t flushCb : 1; // Flush CB caches (including DCC, cmask, fmask) - uint32_t invalCb : 1; // Invalidate CB caches (including DCC, cmask, fmask) - uint32_t flushDb : 1; // Flush DB caches (including htile) - uint32_t invalDb : 1; // Invalidate DB caches (including htile) - uint32_t numLayoutTransitions : 16; // Number of layout transitions following this packet - uint32_t reserved : 6; // Reserved for future expansion. Always 0 + uint32_t flushCb : 1; // Flush CB caches (including DCC, cmask, fmask) + uint32_t invalCb : 1; // Invalidate CB caches (including DCC, cmask, fmask) + uint32_t flushDb : 1; // Flush DB caches (including htile) + uint32_t invalDb : 1; // Invalidate DB caches (including htile) + uint32_t numLayoutTransitions : 16; // Number of layout transitions following this packet + uint32_t reserved : 6; // Reserved for future expansion. Always 0 }; - uint32_t dword02; // The second dword + uint32_t dword02; // The second dword }; }; @@ -255,33 +233,31 @@ struct RgpSqttMarkerBarrierEnd constexpr uint32_t RgpSqttInstrumentationSpecVersion = 1; // RGP SQTT Instrumentation Specification version for Vulkan-specific tables -constexpr uint32_t RgpSqttInstrumentationApiVersion = 0; +constexpr uint32_t RgpSqttInstrumentationApiVersion = 0; -// RgpSqttMarkeUserEventDataType - Data types used in RGP SQ thread-tracing markers for an user event -enum RgpSqttMarkerUserEventType : uint32_t -{ - RgpSqttMarkerUserEventTrigger = 0x0, - RgpSqttMarkerUserEventPop = 0x1, - RgpSqttMarkerUserEventPush = 0x2, - RgpSqttMarkerUserEventObjectName = 0x3, - RgpSqttMarkerUserEventReserved1 = 0x4, - RgpSqttMarkerUserEventReserved2 = 0x5, - RgpSqttMarkerUserEventReserved3 = 0x6, - RgpSqttMarkerUserEventReserved4 = 0x7, +// RgpSqttMarkeUserEventDataType - Data types used in RGP SQ thread-tracing markers for an user +// event +enum RgpSqttMarkerUserEventType : uint32_t { + RgpSqttMarkerUserEventTrigger = 0x0, + RgpSqttMarkerUserEventPop = 0x1, + RgpSqttMarkerUserEventPush = 0x2, + RgpSqttMarkerUserEventObjectName = 0x3, + RgpSqttMarkerUserEventReserved1 = 0x4, + RgpSqttMarkerUserEventReserved2 = 0x5, + RgpSqttMarkerUserEventReserved3 = 0x6, + RgpSqttMarkerUserEventReserved4 = 0x7, }; // RgpSqttMarkerUserEvent - RGP SQ thread-tracing marker for an user event. -union RgpSqttMarkerUserEvent -{ - struct - { - uint32_t identifier : 4; // Identifier for this marker - uint32_t extDwords : 8; // Number of extra dwords following this marker - uint32_t dataType : 8; // The type for this marker - uint32_t reserved : 12; // reserved - }; +union RgpSqttMarkerUserEvent { + struct { + uint32_t identifier : 4; // Identifier for this marker + uint32_t extDwords : 8; // Number of extra dwords following this marker + uint32_t dataType : 8; // The type for this marker + uint32_t reserved : 12; // reserved + }; - uint32_t dword01; // The first dword + uint32_t dword01; // The first dword }; constexpr uint32_t RgpSqttMarkerUserEventWordCount = 1; @@ -289,21 +265,20 @@ constexpr uint32_t RgpSqttMarkerUserEventWordCount = 1; // The max lengths of frame marker strings static constexpr size_t RgpSqttMaxUserEventStringLengthInDwords = 1024; -// RgpSqttMarkerUserEvent - RGP SQ thread-tracing marker for an user event with a string (push and trigger data types) -struct RgpSqttMarkerUserEventWithString -{ - RgpSqttMarkerUserEvent header; +// RgpSqttMarkerUserEvent - RGP SQ thread-tracing marker for an user event with a string (push and +// trigger data types) +struct RgpSqttMarkerUserEventWithString { + RgpSqttMarkerUserEvent header; - uint32_t stringLength; // Length of the string (in characters) - uint32_t stringData[RgpSqttMaxUserEventStringLengthInDwords]; // String data in UTF-8 format + uint32_t stringLength; // Length of the string (in characters) + uint32_t stringData[RgpSqttMaxUserEventStringLengthInDwords]; // String data in UTF-8 format }; // ================================================================================================ // This class provides functionality to interact with the GPU Open Developer Mode message passing // service and the rest of the driver. -class RgpCaptureMgr -{ -public: +class RgpCaptureMgr { + public: ~RgpCaptureMgr(); static RgpCaptureMgr* Create(Pal::IPlatform* platform, const Device& device); @@ -321,45 +296,42 @@ public: bool IsQueueTimingActive() const; - void WriteBarrierStartMarker( - const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const; - void WriteBarrierEndMarker( - const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const; + void WriteBarrierStartMarker(const VirtualGPU* gpu, + const Pal::Developer::BarrierData& data) const; + void WriteBarrierEndMarker(const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const; bool RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue, bool* debug_vmid) const; - Pal::Result TimedQueueSubmit( - Pal::IQueue* queue, uint64_t cmdId, const Pal::SubmitInfo& submitInfo) const; + Pal::Result TimedQueueSubmit(Pal::IQueue* queue, uint64_t cmdId, + const Pal::SubmitInfo& submitInfo) const; -private: + private: // Steps that an RGP trace goes through - enum class TraceStatus - { - Idle = 0, // No active trace and none requested - Preparing, // A trace has been requested but is not active yet because we are - // currently sampling timing information over some number of lead frames. - Running, // SQTT and queue timing is currently active for all command buffer submits. - WaitingForSqtt, - WaitingForResults // Tracing is no longer active, but all results are not yet ready. + enum class TraceStatus { + Idle = 0, // No active trace and none requested + Preparing, // A trace has been requested but is not active yet because we are + // currently sampling timing information over some number of lead frames. + Running, // SQTT and queue timing is currently active for all command buffer submits. + WaitingForSqtt, + WaitingForResults // Tracing is no longer active, but all results are not yet ready. }; // All per-device state to support RGP tracing - struct TraceState - { - TraceStatus status_; // Current trace status (idle, running, etc.) + struct TraceState { + TraceStatus status_; // Current trace status (idle, running, etc.) - GpuEvent begin_sqtt_event_; // Event that is signaled when a trace-end cmdbuf retires - GpuEvent end_sqtt_event_; // Event that is signaled when a trace-end cmdbuf retires - GpuEvent end_event_; // Event that is signaled when a trace-end cmdbuf retires + GpuEvent begin_sqtt_event_; // Event that is signaled when a trace-end cmdbuf retires + GpuEvent end_sqtt_event_; // Event that is signaled when a trace-end cmdbuf retires + GpuEvent end_event_; // Event that is signaled when a trace-end cmdbuf retires - VirtualGPU* prepare_queue_; // The queue that triggered the full start of a trace - VirtualGPU* begin_queue_; // The queue that triggered starting SQTT + VirtualGPU* prepare_queue_; // The queue that triggered the full start of a trace + VirtualGPU* begin_queue_; // The queue that triggered starting SQTT - GpuUtil::GpaSession* gpa_session_; // GPA session helper object for building RGP data - uint32_t gpa_sample_id_; // Sample ID associated with the current trace - bool queue_timing_; // Queue timing is enabled + GpuUtil::GpaSession* gpa_session_; // GPA session helper object for building RGP data + uint32_t gpa_sample_id_; // Sample ID associated with the current trace + bool queue_timing_; // Queue timing is enabled - uint32_t prepared_disp_count_; // Number of dispatches counted while preparing for a trace - uint32_t sqtt_disp_count_; // Number of dispatches counted while SQTT tracing is active - mutable uint32_t current_event_id_; // Current event ID + uint32_t prepared_disp_count_; // Number of dispatches counted while preparing for a trace + uint32_t sqtt_disp_count_; // Number of dispatches counted while SQTT tracing is active + mutable uint32_t current_event_id_; // Current event ID }; RgpCaptureMgr(Pal::IPlatform* platform, const Device& device); @@ -374,25 +346,25 @@ private: static bool GpuSupportsTracing(const Pal::DeviceProperties& props, const Settings& settings); RgpSqttMarkerEvent BuildEventMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType api_type) const; void WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const; - void WriteEventWithDimsMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType apiType, - uint32_t x, uint32_t y, uint32_t z) const; + void WriteEventWithDimsMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType apiType, uint32_t x, + uint32_t y, uint32_t z) const; void WriteUserEventMarker(const VirtualGPU* gpu, RgpSqttMarkerUserEventType eventType, - const std::string& name) const; + const std::string& name) const; - const Device& device_; + const Device& device_; DevDriver::DevDriverServer* dev_driver_server_; DevDriver::RGPProtocol::RGPServer* rgp_server_; - mutable amd::Monitor trace_mutex_; - TraceState trace_; + mutable amd::Monitor trace_mutex_; + TraceState trace_; RgpSqttMarkerUserEventWithString* user_event_; - uint32_t num_prep_disp_; - uint32_t max_sqtt_disp_; // Maximum number of the dispatches allowed in the trace - uint32_t trace_gpu_mem_limit_; - uint32_t global_disp_count_; + uint32_t num_prep_disp_; + uint32_t max_sqtt_disp_; // Maximum number of the dispatches allowed in the trace + uint32_t trace_gpu_mem_limit_; + uint32_t global_disp_count_; - bool trace_enabled_; // True if tracing is currently enabled (master flag) - bool inst_tracing_enabled_; // Enable instruction-level SQTT tokens + bool trace_enabled_; // True if tracing is currently enabled (master flag) + bool inst_tracing_enabled_; // Enable instruction-level SQTT tokens PAL_DISALLOW_DEFAULT_CTOR(RgpCaptureMgr); PAL_DISALLOW_COPY_AND_ASSIGN(RgpCaptureMgr); @@ -400,11 +372,9 @@ private: // ================================================================================================ // Returns true if queue operations are currently being timed by RGP traces. -inline bool RgpCaptureMgr::IsQueueTimingActive() const -{ +inline bool RgpCaptureMgr::IsQueueTimingActive() const { return (trace_.queue_timing_ && - (trace_.status_ == TraceStatus::Running || - trace_.status_ == TraceStatus::Preparing || + (trace_.status_ == TraceStatus::Running || trace_.status_ == TraceStatus::Preparing || trace_.status_ == TraceStatus::WaitingForSqtt)); } -}; +}; // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp index e23389876b..7a4823ddaa 100644 --- a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp @@ -27,11 +27,9 @@ typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD; namespace pal { void HSAILKernel::setWorkGroupInfo(const uint32_t privateSegmentSize, - const uint32_t groupSegmentSize, - const uint16_t numSGPRs, + const uint32_t groupSegmentSize, const uint16_t numSGPRs, const uint16_t numVGPRs) { - workGroupInfo_.scratchRegs_ = - amd::alignUp(privateSegmentSize, 16) / sizeof(uint); + workGroupInfo_.scratchRegs_ = amd::alignUp(privateSegmentSize, 16) / sizeof(uint); workGroupInfo_.privateMemSize_ = privateSegmentSize; workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = groupSegmentSize; workGroupInfo_.usedSGPRs_ = numSGPRs; @@ -63,13 +61,13 @@ bool HSAILKernel::setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t } // Copy code object of this kernel from the program CPU segment - memcpy(akc, reinterpret_cast(prog().findHostKernelAddress(code_)), sizeof(amd_kernel_code_t)); + memcpy(akc, reinterpret_cast(prog().findHostKernelAddress(code_)), + sizeof(amd_kernel_code_t)); return true; } bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) { - amd_kernel_code_t* akc = &akc_; if (!setKernelCode(sym, akc)) { @@ -77,18 +75,16 @@ bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) { } if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE, - reinterpret_cast(&codeSize_))) { + reinterpret_cast(&codeSize_))) { return false; } - // Setup the the workgroup info - setWorkGroupInfo(akc->workitem_private_segment_byte_size, - akc->workgroup_group_segment_byte_size, - akc->wavefront_sgpr_count, - akc->workitem_vgpr_count); + // Setup the the workgroup info + setWorkGroupInfo(akc->workitem_private_segment_byte_size, akc->workgroup_group_segment_byte_size, + akc->wavefront_sgpr_count, akc->workitem_vgpr_count); workgroupGroupSegmentByteSize_ = workGroupInfo_.usedLDSSize_; - kernargSegmentByteSize_ = akc->kernarg_segment_byte_size; + kernargSegmentByteSize_ = akc->kernarg_segment_byte_size; spillSegmentByteSize_ = amd::alignUp(workGroupInfo_.privateMemSize_, sizeof(uint32_t)); return true; @@ -102,16 +98,14 @@ HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compi codeSize_(0), workgroupGroupSegmentByteSize_(0), kernargSegmentByteSize_(0), - spillSegmentByteSize_(0) - { + spillSegmentByteSize_(0) { flags_.hsa_ = true; } -HSAILKernel::~HSAILKernel() { -} +HSAILKernel::~HSAILKernel() {} bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) { -#if defined(WITH_COMPILER_LIB) +#if defined(WITH_COMPILER_LIB) acl_error error = ACL_SUCCESS; std::string openClKernelName = openclMangledName(name()); flags_.internalKernel_ = @@ -274,12 +268,14 @@ const HSAILProgram& HSAILKernel::prog() const { return reinterpret_cast(prog_); } -hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments( - VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes, - const_address parameters, size_t ldsAddress, uint64_t vmDefQueue, uint64_t* vmParentWrap) const { +hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel, + const amd::NDRangeContainer& sizes, + const_address parameters, + size_t ldsAddress, uint64_t vmDefQueue, + uint64_t* vmParentWrap) const { uint64_t argList; address aqlArgBuf = gpu.managedBuffer().reserve( - argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t), &argList); + argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t), &argList); gpu.addVmMemory(gpu.managedBuffer().activeMemory()); if (dynamicParallelism()) { @@ -307,8 +303,8 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments( break; case amd::KernelParameterDescriptor::HiddenGlobalOffsetY: if (sizes.dimensions() >= 2) { - offset = sizes.offset()[1]; - WriteAqlArgAt(const_cast
(parameters), &offset, it.size_, it.offset_); + offset = sizes.offset()[1]; + WriteAqlArgAt(const_cast
(parameters), &offset, it.size_, it.offset_); } break; case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ: @@ -322,8 +318,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments( // and printf buffer was allocated (gpu.printfDbgHSA().dbgBuffer() != nullptr)) { // and set the fourth argument as the printf_buffer pointer - size_t bufferPtr = static_cast(gpu.printfDbgHSA(). - dbgBuffer()->vmAddress()); + size_t bufferPtr = static_cast(gpu.printfDbgHSA().dbgBuffer()->vmAddress()); gpu.addVmMemory(gpu.printfDbgHSA().dbgBuffer()); WriteAqlArgAt(const_cast
(parameters), &bufferPtr, it.size_, it.offset_); } @@ -346,11 +341,11 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments( // Note: In a case of structs the size won't match, // since HSAIL compiler expects a reference... assert(argsBufferSize() <= signature.paramsSize() && - "A mismatch of sizes of arguments between compiler and runtime!"); + "A mismatch of sizes of arguments between compiler and runtime!"); - //hsa_kernel_dispatch_packet_t disp; - hsa_kernel_dispatch_packet_t* hsaDisp = reinterpret_cast( - gpu.cb(0)->SysMemCopy()); + // hsa_kernel_dispatch_packet_t disp; + hsa_kernel_dispatch_packet_t* hsaDisp = + reinterpret_cast(gpu.cb(0)->SysMemCopy()); amd::NDRange local(sizes.local()); const amd::NDRange& global = sizes.global(); @@ -359,10 +354,10 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments( FindLocalWorkSize(sizes.dimensions(), sizes.global(), local); constexpr uint16_t kDispatchPacketHeader = - (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); hsaDisp->header = kDispatchPacketHeader; hsaDisp->setup = sizes.dimensions(); @@ -387,7 +382,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments( memcpy(aqlArgBuf + argsBufferSize(), hsaDisp, sizeof(hsa_kernel_dispatch_packet_t)); if (AMD_HSA_BITS_GET(akc_.kernel_code_properties, - AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) { + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) { gpu.addVmMemory(gpu.hsaQueueMem()); } @@ -407,7 +402,7 @@ static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const s } return nullptr; } -#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) +#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) #if defined(USE_COMGR_LIBRARY) bool LightningKernel::init() { @@ -419,7 +414,7 @@ bool LightningKernel::init() { return false; } - KernelMD kernelMD; + KernelMD kernelMD; if (!GetAttrCodePropMetadata(*kernelMetaNode, &kernelMD)) { return false; } @@ -427,8 +422,8 @@ bool LightningKernel::init() { symbolName_ = (codeObjectVer() == 2) ? name() : kernelMD.mSymbolName; workgroupGroupSegmentByteSize_ = kernelMD.mCodeProps.mGroupSegmentFixedSize; - spillSegmentByteSize_ = amd::alignUp(kernelMD.mCodeProps.mPrivateSegmentFixedSize, - sizeof(uint32_t)); + spillSegmentByteSize_ = + amd::alignUp(kernelMD.mCodeProps.mPrivateSegmentFixedSize, sizeof(uint32_t)); kernargSegmentByteSize_ = kernelMD.mCodeProps.mKernargSegmentSize; // Copy codeobject of this kernel from the program CPU segment @@ -451,7 +446,7 @@ bool LightningKernel::init() { // Get the runtime handle symbol GPU address rth_symbol = prog().GetSymbol(const_cast(kernelMD.mAttrs.mRuntimeHandle.c_str()), - const_cast(&agent)); + const_cast(&agent)); uint64_t symbol_address; rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address); @@ -461,19 +456,14 @@ bool LightningKernel::init() { uint64_t kernel_object = gpuAqlCode(); VirtualGPU* gpu = codeSegGpu.dev().xferQueue(); - const struct RuntimeHandle runtime_handle = { - gpuAqlCode(), - spillSegSize(), - ldsSize() - }; + const struct RuntimeHandle runtime_handle = {gpuAqlCode(), spillSegSize(), ldsSize()}; codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true); } // Setup the the workgroup info setWorkGroupInfo(kernelMD.mCodeProps.mPrivateSegmentFixedSize, - kernelMD.mCodeProps.mGroupSegmentFixedSize, - kernelMD.mCodeProps.mNumSGPRs, + kernelMD.mCodeProps.mGroupSegmentFixedSize, kernelMD.mCodeProps.mNumSGPRs, kernelMD.mCodeProps.mNumVGPRs); // Copy wavefront size @@ -499,10 +489,10 @@ bool LightningKernel::init() { return true; } -#endif // defined(USE_COMGR_LIBRARY) +#endif // defined(USE_COMGR_LIBRARY) bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) { -#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY) +#if defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY) flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false; @@ -545,7 +535,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) { // Get the runtime handle symbol GPU address rth_symbol = prog().GetSymbol(const_cast(kernelMD->mAttrs.mRuntimeHandle.c_str()), - const_cast(&agent)); + const_cast(&agent)); uint64_t symbol_address; rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address); @@ -554,11 +544,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) { uint64_t offset = symbol_address - codeSegGpu.vmAddress(); VirtualGPU* gpu = codeSegGpu.dev().xferQueue(); - const struct RuntimeHandle runtime_handle = { - gpuAqlCode(), - spillSegSize(), - ldsSize() - }; + const struct RuntimeHandle runtime_handle = {gpuAqlCode(), spillSegSize(), ldsSize()}; codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true); } @@ -584,7 +570,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) { waveLimiter_.enable(); */ -#endif // defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY) +#endif // defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY) return true; } diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp index 5a1abe07d4..926d2deccc 100644 --- a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp @@ -20,14 +20,14 @@ namespace amd { namespace hsa { namespace loader { class Symbol; -} // loader +} // namespace loader namespace code { namespace Kernel { class Metadata; -} // Kernel -} // code -} // hsa -} // amd +} // namespace Kernel +} // namespace code +} // namespace hsa +} // namespace amd //! \namespace pal PAL Device Implementation namespace pal { @@ -43,7 +43,6 @@ class LightningProgram; */ class HSAILKernel : public device::Kernel { public: - HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions); virtual ~HSAILKernel(); @@ -106,21 +105,19 @@ class HSAILKernel : public device::Kernel { bool setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc); //! Set up the workgroup info based on the kernel metadata - void setWorkGroupInfo(const uint32_t privateSegmentSize, - const uint32_t groupSegmentSize, - const uint16_t numSGPRs, - const uint16_t numVGPRs); + void setWorkGroupInfo(const uint32_t privateSegmentSize, const uint32_t groupSegmentSize, + const uint16_t numSGPRs, const uint16_t numVGPRs); - std::string compileOptions_; //!< compile used for finalizing this kernel - amd_kernel_code_t akc_; //!< AQL kernel code on CPU - uint index_; //!< Kernel index in the program + std::string compileOptions_; //!< compile used for finalizing this kernel + amd_kernel_code_t akc_; //!< AQL kernel code on CPU + uint index_; //!< Kernel index in the program - uint64_t code_; //!< GPU memory pointer to the kernel - size_t codeSize_; //!< Size of ISA code + uint64_t code_; //!< GPU memory pointer to the kernel + size_t codeSize_; //!< Size of ISA code - uint32_t workgroupGroupSegmentByteSize_; //!< LDS size used in the kernel - uint32_t kernargSegmentByteSize_; //!< Size of kernel argument buffer - uint32_t spillSegmentByteSize_; //!< Spill reg size per workitem + uint32_t workgroupGroupSegmentByteSize_; //!< LDS size used in the kernel + uint32_t kernargSegmentByteSize_; //!< Size of kernel argument buffer + uint32_t spillSegmentByteSize_; //!< Spill reg size per workitem }; class LightningKernel : public HSAILKernel { @@ -140,4 +137,5 @@ class LightningKernel : public HSAILKernel { #endif }; -/*@}*/} // namespace pal +/*@}*/ // namespace pal +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.cpp b/projects/clr/rocclr/runtime/device/pal/palmemory.cpp index 071f17962a..bad5652845 100644 --- a/projects/clr/rocclr/runtime/device/pal/palmemory.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palmemory.cpp @@ -23,27 +23,21 @@ namespace pal { Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t size) - : device::Memory(owner), Resource(gpuDev, size) - , pinnedMemory_(nullptr) - , parent_(nullptr) { - + : device::Memory(owner), Resource(gpuDev, size), pinnedMemory_(nullptr), parent_(nullptr) { if (owner.parent() != nullptr) { flags_ |= SubMemoryObject; } } Memory::Memory(const Device& gpuDev, size_t size) - : device::Memory(size), Resource(gpuDev, size) - , pinnedMemory_(nullptr) - , parent_(nullptr) { -} + : device::Memory(size), Resource(gpuDev, size), pinnedMemory_(nullptr), parent_(nullptr) {} Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t width, size_t height, size_t depth, cl_image_format format, cl_mem_object_type imageType, uint mipLevels) - : device::Memory(owner), Resource(gpuDev, width, height, depth, format, imageType, mipLevels) - , pinnedMemory_(nullptr) - , parent_(nullptr) { - + : device::Memory(owner), + Resource(gpuDev, width, height, depth, format, imageType, mipLevels), + pinnedMemory_(nullptr), + parent_(nullptr) { if (owner.parent() != nullptr) { flags_ |= SubMemoryObject; } @@ -51,10 +45,10 @@ Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t width, size_t he Memory::Memory(const Device& gpuDev, size_t size, size_t width, size_t height, size_t depth, cl_image_format format, cl_mem_object_type imageType, uint mipLevels) - : device::Memory(size), Resource(gpuDev, width, height, depth, format, imageType, mipLevels) - , pinnedMemory_(nullptr) - , parent_(nullptr) { -} + : device::Memory(size), + Resource(gpuDev, width, height, depth, format, imageType, mipLevels), + pinnedMemory_(nullptr), + parent_(nullptr) {} #ifdef _WIN32 static HANDLE getSharedHandle(IUnknown* pIface) { @@ -130,7 +124,7 @@ bool Memory::create(Resource::MemoryType memType, Resource::CreateParams* params break; case Resource::Remote: case Resource::RemoteUSWC: - if ((!desc().tiled_) && (desc().dimSize_ != 3)) { + if ((!desc().tiled_) && (desc().dimSize_ != 3)) { // Marks memory object for direct GPU access to the host memory flags_ |= HostMemoryDirectAccess; } @@ -402,7 +396,7 @@ Memory::~Memory() { (memoryType() != Resource::ExternalPhysical)) { // Unmap memory if direct access was requested // Note: runtime will perform unmap on the actual resource destruction - //unmap(nullptr); + // unmap(nullptr); } } diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp index d84b23cbe6..2ce3062cce 100644 --- a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp @@ -32,12 +32,12 @@ class Memory : public device::Memory, public Resource { Memory(const Device& gpuDev, //!< GPU device object amd::Memory& owner, //!< Abstraction layer memory object size_t size //!< Memory size for allocation - ); + ); //! Constructor (nonfat version for local scratch mem use without heap block) Memory(const Device& gpuDev, //!< GPU device object size_t size //!< Memory size for allocation - ); + ); //! Constructor memory for images (without global heap allocation) Memory(const Device& gpuDev, //!< GPU device object @@ -48,7 +48,7 @@ class Memory : public device::Memory, public Resource { cl_image_format format, //!< Memory format cl_mem_object_type imageType, //!< CL image type uint mipLevels //!< The number of mip levels - ); + ); //! Constructor memory for images (without global heap allocation) Memory(const Device& gpuDev, //!< GPU device object @@ -59,7 +59,7 @@ class Memory : public device::Memory, public Resource { cl_image_format format, //!< Memory format cl_mem_object_type imageType, //!< CL image type uint mipLevels //!< The number of mip levels - ); + ); //! Default destructor ~Memory(); @@ -70,7 +70,7 @@ class Memory : public device::Memory, public Resource { //! Overloads the resource create method virtual bool create(Resource::MemoryType memType, //!< Memory type Resource::CreateParams* params = NULL //!< Prameters for create - ); + ); //! Allocate memory for API-level maps virtual void* allocMapTarget(const amd::Coord3D& origin, //!< The map location in memory @@ -78,12 +78,12 @@ class Memory : public device::Memory, public Resource { uint mapFlags, //!< Map flags size_t* rowPitch = NULL, //!< Row pitch for the mapped memory size_t* slicePitch = NULL //!< Slice for the mapped memory - ); + ); //! Pins system memory associated with this memory object virtual bool pinSystemMemory(void* hostPtr, //!< System memory address size_t size //!< Size of allocated system memory - ); + ); //! Releases indirect map surface virtual void releaseIndirectMap() { decIndMapCount(); } @@ -96,15 +96,15 @@ class Memory : public device::Memory, public Resource { uint numLayers = 0, //!< End layer for multilayer map size_t* rowPitch = NULL, //!< Row pitch for the device memory size_t* slicePitch = NULL //!< Slice pitch for the device memory - ); + ); //! Unmap the device memory virtual void cpuUnmap(device::VirtualDevice& vDev //!< Virtual device for unmap operaiton - ); + ); //! Updates device memory from the owner's host allocation void syncCacheFromHost(VirtualGPU& gpu, //!< Virtual GPU device object - //! Synchronization flags + //! Synchronization flags device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags()); //! Updates the owner's host allocation from device memory @@ -115,11 +115,13 @@ class Memory : public device::Memory, public Resource { //! Creates a view from current resource virtual Memory* createBufferView( amd::Memory& subBufferOwner //!< The abstraction layer subbuf owner - ); + ); virtual uint64_t virtualAddress() const override { return vmAddress(); } - virtual const address cpuSrd() const { return reinterpret_cast(const_cast(hwState())); } + virtual const address cpuSrd() const { + return reinterpret_cast(const_cast(hwState())); + } //! Allocates host memory for synchronization with MGPU context void mgpuCacheWriteBack(); @@ -161,8 +163,8 @@ class Memory : public device::Memory, public Resource { //! Disable operator= Memory& operator=(const Memory&); - Memory* pinnedMemory_; //!< Memory used as pinned system memory - const Memory* parent_; //!< Parent memory object + Memory* pinnedMemory_; //!< Memory used as pinned system memory + const Memory* parent_; //!< Parent memory object }; class Buffer : public pal::Memory { @@ -219,7 +221,7 @@ class Image : public pal::Memory { uint mapFlags, //!< Map flags size_t* rowPitch = NULL, //!< Row pitch for the mapped memory size_t* slicePitch = NULL //!< Slice for the mapped memory - ); + ); virtual uint64_t virtualAddress() const override { return hwSrd(); } diff --git a/projects/clr/rocclr/runtime/device/pal/palprintf.hpp b/projects/clr/rocclr/runtime/device/pal/palprintf.hpp index edb8077161..69dd871300 100644 --- a/projects/clr/rocclr/runtime/device/pal/palprintf.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palprintf.hpp @@ -11,7 +11,7 @@ #ifndef isinf #ifdef _MSC_VER #define isinf(X) (!_finite(X) && !_isnan(X)) -#else //!_MSC_VER +#else //!_MSC_VER #define isinf(X) (std::isinf(X)) #endif //!_MSC_VER #endif // isinf @@ -19,7 +19,7 @@ #ifndef isnan #ifdef _MSC_VER #define isnan(X) (_isnan(X)) -#else //!_MSC_VER +#else //!_MSC_VER #define isnan(X) (std::isnan(X)) #endif //!_MSC_VER #endif // isnan @@ -55,14 +55,14 @@ class PrintfDbg : public amd::HeapObject { bool init(VirtualGPU& gpu, //!< Virtual GPU object bool printfEnabled, //!< checks for printf const amd::NDRange& size //!< Kernel's workload - ); + ); //! Prints the kernel's debug informaiton from the buffer - bool output(VirtualGPU& gpu, //!< Virtual GPU object - bool printfEnabled, //!< checks for printf - const amd::NDRange& size, //!< Kernel's workload + bool output(VirtualGPU& gpu, //!< Virtual GPU object + bool printfEnabled, //!< checks for printf + const amd::NDRange& size, //!< Kernel's workload const std::vector& printfInfo //!< printf info - ); + ); //! Debug buffer size per workitem size_t wiDbgSize() const { return wiDbgSize_; } @@ -81,7 +81,7 @@ class PrintfDbg : public amd::HeapObject { //! Allocates the debug buffer bool allocate(bool realloc = false //!< If TRUE then reallocate the debug memory - ); + ); //! Returns TRUE if a float value has to be printed bool checkFloat(const std::string& fmt //!< Format string @@ -105,9 +105,9 @@ class PrintfDbg : public amd::HeapObject { ) const; //! Displays the PrintfDbg - void outputDbgBuffer(const device::PrintfInfo& info,//!< printf info - const uint32_t* workitemData, //!< The PrintfDbg dump buffer - size_t& i //!< index to the data in the buffer + void outputDbgBuffer(const device::PrintfInfo& info, //!< printf info + const uint32_t* workitemData, //!< The PrintfDbg dump buffer + size_t& i //!< index to the data in the buffer ) const; private: @@ -127,7 +127,7 @@ class PrintfDbg : public amd::HeapObject { uint32_t* mapWorkitem(VirtualGPU& gpu, //!< Virtual GPU object size_t idx, //!< Workitem global index bool* realloc //!< Returns TRUE if workitem reached the buffer limit - ); + ); //! Unamp the staged buffer void unmapWorkitem(VirtualGPU& gpu, //!< Virtual GPU object @@ -145,13 +145,13 @@ class PrintfDbgHSA : public PrintfDbg { //! Initializes the debug buffer before kernel's execution bool init(VirtualGPU& gpu, //!< Virtual GPU object bool printfEnabled //!< checks for printf - ); + ); //! Prints the kernel's debug informaiton from the buffer - bool output(VirtualGPU& gpu, //!< Virtual GPU object - bool printfEnabled, //!< checks for printf + bool output(VirtualGPU& gpu, //!< Virtual GPU object + bool printfEnabled, //!< checks for printf const std::vector& printfInfo //!< printf info - ); + ); private: //! Disable copy constructor @@ -161,4 +161,5 @@ class PrintfDbgHSA : public PrintfDbg { PrintfDbgHSA& operator=(const PrintfDbgHSA&); }; -/*@}*/} // namespace pal +/*@}*/ // namespace pal +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp index ed788bda56..85d404e897 100644 --- a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp @@ -65,10 +65,10 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t align = amd::alignUp(align, sizeof(uint32_t)); amd::Memory* amd_mem_obj = new (prog.dev().context()) - amd::Buffer(prog.dev().context(), 0, amd::alignUp(size, align), - // HIP requires SVM allocation for segment code due to possible global variable access and - // global variables are a part of code segment with the latest loader - amd::IS_HIP ? reinterpret_cast(1) : nullptr); + amd::Buffer(prog.dev().context(), 0, amd::alignUp(size, align), + // HIP requires SVM allocation for segment code due to possible global variable + // access and global variables are a part of code segment with the latest loader + amd::IS_HIP ? reinterpret_cast(1) : nullptr); if (amd_mem_obj == nullptr) { LogError("[OCL] failed to create a mem object!"); @@ -103,9 +103,9 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t if (zero && !prog.isInternal()) { uint64_t pattern = 0; - size_t patternSize = ((size % sizeof(pattern)) == 0) ? sizeof(pattern) : 1; - prog.dev().xferMgr().fillBuffer(*gpuAccess_, &pattern, patternSize, - amd::Coord3D(0), amd::Coord3D(size)); + size_t patternSize = ((size % sizeof(pattern)) == 0) ? sizeof(pattern) : 1; + prog.dev().xferMgr().fillBuffer(*gpuAccess_, &pattern, patternSize, amd::Coord3D(0), + amd::Coord3D(size)); } switch (segment) { @@ -237,7 +237,7 @@ inline static std::vector splitSpaceSeparatedString(char* str) { } bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_t binSize) { -#if defined(WITH_COMPILER_LIB) +#if defined(WITH_COMPILER_LIB) // ACL_TYPE_CG stage is not performed for offline compilation hsa_agent_t agent; agent.handle = 1; @@ -262,8 +262,8 @@ bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_ } size_t kernelNamesSize = 0; - acl_error errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, - nullptr, nullptr, &kernelNamesSize); + acl_error errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, + nullptr, &kernelNamesSize); if (errorCode != ACL_SUCCESS) { buildLog_ += "Error: Querying of kernel names size from the binary failed.\n"; return false; @@ -274,11 +274,11 @@ bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_ &kernelNamesSize); if (errorCode != ACL_SUCCESS) { buildLog_ += "Error: Querying of kernel names from the binary failed.\n"; - delete [] kernelNames; + delete[] kernelNames; return false; } std::vector vKernels = splitSpaceSeparatedString(kernelNames); - delete [] kernelNames; + delete[] kernelNames; bool dynamicParallelism = false; for (const auto& it : vKernels) { std::string kernelName(it); @@ -338,12 +338,10 @@ bool HSAILProgram::allocKernelTable() { return true; } -void HSAILProgram::fillResListWithKernels(VirtualGPU& gpu) const { - gpu.addVmMemory(&codeSegGpu()); -} +void HSAILProgram::fillResListWithKernels(VirtualGPU& gpu) const { gpu.addVmMemory(&codeSegGpu()); } const aclTargetInfo& HSAILProgram::info(const char* str) { -#if defined(WITH_COMPILER_LIB) +#if defined(WITH_COMPILER_LIB) acl_error err; std::string arch = "hsail"; if (dev().settings().use64BitPtr_) { @@ -359,7 +357,7 @@ const aclTargetInfo& HSAILProgram::info(const char* str) { } bool HSAILProgram::saveBinaryAndSetType(type_t type) { -#if defined(WITH_COMPILER_LIB) +#if defined(WITH_COMPILER_LIB) // Write binary to memory if (rawBinary_ != nullptr) { // Free memory containing rawBinary @@ -378,8 +376,8 @@ bool HSAILProgram::saveBinaryAndSetType(type_t type) { return true; } -bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_pptr, - size_t* bytes, const char* global_name) const { +bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_pptr, size_t* bytes, + const char* global_name) const { uint32_t length = 0; size_t offset = 0; uint32_t flags = 0; @@ -456,7 +454,7 @@ bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_p } /* Retrieve the Offset from global pal::Memory created @ segment::alloc */ - if(!codeSegment_->gpuAddressOffset(reinterpret_cast(*device_pptr), &offset)) { + if (!codeSegment_->gpuAddressOffset(reinterpret_cast(*device_pptr), &offset)) { buildLog_ += "Error: Cannot Retrieve the Address Offset"; buildLog_ += "\n"; return false; @@ -484,13 +482,12 @@ bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_p hsa_isa_t PALHSALoaderContext::IsaFromName(const char* name) { hsa_isa_t isa = {0}; - uint32_t gfxip = 0; + uint32_t gfxip = 0; std::string gfx_target(name); if (gfx_target.find("amdgcn-") == 0) { std::string gfxip_version_str = gfx_target.substr(gfx_target.find("gfx") + 3); gfxip = std::atoi(gfxip_version_str.c_str()); - } - else { + } else { // FIXME: Old way. To be remove. uint32_t shift = 1; size_t last = gfx_target.length(); @@ -508,9 +505,9 @@ hsa_isa_t PALHSALoaderContext::IsaFromName(const char* name) { } bool PALHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) { - uint32_t gfxipVersion = program_->dev().settings().useLightning_ ? - program_->dev().hwInfo()->gfxipVersionLC_ : - program_->dev().hwInfo()->gfxipVersion_; + uint32_t gfxipVersion = program_->dev().settings().useLightning_ + ? program_->dev().hwInfo()->gfxipVersionLC_ + : program_->dev().hwInfo()->gfxipVersion_; uint32_t majorSrc = gfxipVersion / 10; uint32_t minorSrc = gfxipVersion % 10; @@ -519,11 +516,9 @@ bool PALHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) if (majorSrc != majorTrg) { return false; - } - else if (minorTrg == minorSrc) { + } else if (minorTrg == minorSrc) { return true; - } - else if (minorTrg < minorSrc) { + } else if (minorTrg < minorSrc) { LogWarning("ISA downgrade for execution!"); return true; } @@ -708,7 +703,7 @@ static hsa_status_t GetKernelNamesCallback(hsa_executable_t hExec, hsa_executabl return HSA_STATUS_SUCCESS; } -#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) +#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) bool LightningProgram::createBinary(amd::option::Options* options) { #if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) @@ -716,7 +711,7 @@ bool LightningProgram::createBinary(amd::option::Options* options) { LogError("Failed to create ELF binary image!"); return false; } -#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) +#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) return true; } @@ -752,10 +747,10 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s } #if defined(USE_COMGR_LIBRARY) - for (const auto &kernelMeta : kernelMetadataMap_) { + for (const auto& kernelMeta : kernelMetadataMap_) { auto kernelName = kernelMeta.first; - auto kernel = new LightningKernel(kernelName, this, - options->origOptionStr + ProcessOptions(options)); + auto kernel = + new LightningKernel(kernelName, this, options->origOptionStr + ProcessOptions(options)); kernels()[kernelName] = kernel; if (!kernel->init()) { @@ -804,9 +799,9 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s maxScratchRegs_ = std::max(static_cast(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_); } -#endif // defined(USE_COMGR_LIBRARY) +#endif // defined(USE_COMGR_LIBRARY) DestroySegmentCpuAccess(); -#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) +#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) return true; } diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.hpp b/projects/clr/rocclr/runtime/device/pal/palprogram.hpp index 32e98aab6f..ddc41c0c1d 100644 --- a/projects/clr/rocclr/runtime/device/pal/palprogram.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palprogram.hpp @@ -9,15 +9,15 @@ namespace amd { namespace option { class Options; -} // option +} // namespace option namespace hsa { namespace loader { class Loader; class Executable; class Context; -} // loader -} // hsa -} // amd +} // namespace loader +} // namespace hsa +} // namespace amd //! \namespace pal PAL Device Implementation namespace pal { @@ -50,15 +50,16 @@ class Segment : public amd::HeapObject { bool gpuAddressOffset(uint64_t offAddr, size_t* offset); //! Returns address for CPU access in the segment - void* cpuAddress(size_t offset) const - { return ((cpuAccess_ != nullptr) ? cpuAccess_->data() : cpuMem_) + offset; } + void* cpuAddress(size_t offset) const { + return ((cpuAccess_ != nullptr) ? cpuAccess_->data() : cpuMem_) + offset; + } void DestroyCpuAccess(); private: - Memory* gpuAccess_; //!< GPU memory for segment access - Memory* cpuAccess_; //!< CPU memory for segment (backing store) - address cpuMem_; //!< CPU memory for segment without GPU direct access (backing store) + Memory* gpuAccess_; //!< GPU memory for segment access + Memory* cpuAccess_; //!< CPU memory for segment (backing store) + address cpuMem_; //!< CPU memory for segment without GPU direct access (backing store) }; class PALHSALoaderContext final : public Context { @@ -166,7 +167,7 @@ class HSAILProgram : public device::Program { } //! Get symbol by name - amd::hsa::loader::Symbol* GetSymbol(const char* symbol_name, const hsa_agent_t *agent) const { + amd::hsa::loader::Symbol* GetSymbol(const char* symbol_name, const hsa_agent_t* agent) const { return executable_->GetSymbol(symbol_name, agent); } @@ -180,11 +181,14 @@ class HSAILProgram : public device::Program { virtual bool setKernels(amd::option::Options* options, void* binary, size_t binSize) override; //! Destroys CPU allocations in the code segment - void DestroySegmentCpuAccess() const - { if (codeSegment_ != nullptr) { codeSegment_->DestroyCpuAccess(); } } + void DestroySegmentCpuAccess() const { + if (codeSegment_ != nullptr) { + codeSegment_->DestroyCpuAccess(); + } + } - virtual bool createGlobalVarObj(amd::Memory** amd_mem_obj, void** dptr, - size_t* bytes, const char* globalName) const; + virtual bool createGlobalVarObj(amd::Memory** amd_mem_obj, void** dptr, size_t* bytes, + const char* globalName) const; private: //! Disable default copy constructor @@ -201,7 +205,7 @@ class HSAILProgram : public device::Program { std::vector globalStores_; //!< Global memory for the program Memory* kernels_; //!< Table with kernel object pointers Memory* codeSegGpu_; //!< GPU memory with code objects - Segment* codeSegment_; //!< Pointer to the code segment for this program + Segment* codeSegment_; //!< Pointer to the code segment for this program uint maxScratchRegs_; //!< Maximum number of scratch regs used in the program by individual kernel std::list staticSamplers_; //!< List od internal static samplers @@ -214,19 +218,17 @@ class HSAILProgram : public device::Program { //! \class Lightning Compiler Program class LightningProgram : public HSAILProgram { public: - LightningProgram(NullDevice& device) - : HSAILProgram(device) { - isLC_ = true; - xnackEnabled_ = dev().hwInfo()->xnackEnabled_; - machineTarget_ = dev().hwInfo()->machineTargetLC_; - } + LightningProgram(NullDevice& device) : HSAILProgram(device) { + isLC_ = true; + xnackEnabled_ = dev().hwInfo()->xnackEnabled_; + machineTarget_ = dev().hwInfo()->machineTargetLC_; + } - LightningProgram(Device& device) - : HSAILProgram(device) { - isLC_ = true; - xnackEnabled_ = dev().hwInfo()->xnackEnabled_; - machineTarget_ = dev().hwInfo()->machineTargetLC_; - } + LightningProgram(Device& device) : HSAILProgram(device) { + isLC_ = true; + xnackEnabled_ = dev().hwInfo()->xnackEnabled_; + machineTarget_ = dev().hwInfo()->machineTargetLC_; + } virtual ~LightningProgram() {} protected: @@ -235,4 +237,5 @@ class LightningProgram : public HSAILProgram { virtual bool createBinary(amd::option::Options* options) override; }; -/*@}*/} // namespace pal +/*@}*/ // namespace pal +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.cpp b/projects/clr/rocclr/runtime/device/pal/palresource.cpp index e3a719cc38..088978846b 100644 --- a/projects/clr/rocclr/runtime/device/pal/palresource.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palresource.cpp @@ -41,8 +41,8 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev, if (memRef != nullptr) { result = dev.iDev()->CreateGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_); if ((result != Pal::Result::Success) && - // Free cache if PAL failed allocation - dev.resourceCache().free()) { + // Free cache if PAL failed allocation + dev.resourceCache().free()) { // If cache was freed, then try to allocate again result = dev.iDev()->CreateGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_); } @@ -154,8 +154,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev, // ================================================================================================ GpuMemoryReference::GpuMemoryReference(const Device& dev) - : gpuMem_(nullptr), cpuAddress_(nullptr), device_(dev), gpu_(nullptr) -{} + : gpuMem_(nullptr), cpuAddress_(nullptr), device_(dev), gpu_(nullptr) {} // ================================================================================================ GpuMemoryReference::~GpuMemoryReference() { @@ -181,8 +180,7 @@ GpuMemoryReference::~GpuMemoryReference() { iMem()->Unmap(); } if (0 != iMem()) { - if (!(iMem()->Desc().flags.isShared || - iMem()->Desc().flags.isExternal || + if (!(iMem()->Desc().flags.isShared || iMem()->Desc().flags.isExternal || iMem()->Desc().flags.isExternPhys)) { // Update free memory size counters device_.updateAllocedMemory(iMem()->Desc().preferredHeap, iMem()->Desc().size, true); @@ -368,7 +366,7 @@ void Resource::memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo) { case Persistent: createInfo->heapCount = 2; createInfo->heaps[0] = Pal::GpuHeapLocal; - createInfo->heaps[1] = Pal:: GpuHeapGartUswc; + createInfo->heaps[1] = Pal::GpuHeapGartUswc; #ifdef ATI_OS_LINUX // Note: SSG in Linux requires DGMA heap if (dev().properties().gpuMemoryProperties.busAddressableMemSize > 0) { @@ -401,11 +399,10 @@ void Resource::memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo) { } // ================================================================================================ -bool Resource::CreateImage(CreateParams* params) -{ +bool Resource::CreateImage(CreateParams* params) { Pal::Result result; - Pal::SubresId ImgSubresId = { Pal::ImageAspect::Color, 0, 0 }; - Pal::SubresRange ImgSubresRange = { ImgSubresId, 1, 1 }; + Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, 0, 0}; + Pal::SubresRange ImgSubresRange = {ImgSubresId, 1, 1}; Pal::ChannelMapping channels; Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels); @@ -417,8 +414,7 @@ bool Resource::CreateImage(CreateParams* params) memRef_->retain(); desc_.cardMemory_ = viewOwner_->desc().cardMemory_; offset_ += viewOwner_->offset_; - } - else { + } else { Pal::GpuMemoryCreateInfo createInfo = {}; createInfo.size = desc().width_ * elementSize(); createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment); @@ -427,8 +423,8 @@ bool Resource::CreateImage(CreateParams* params) createInfo.priority = Pal::GpuMemPriority::Normal; memTypeToHeap(&createInfo); // createInfo.priority; - memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, - createInfo.alignment, nullptr, &subOffset_); + memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment, + nullptr, &subOffset_); if (nullptr == memRef_) { memRef_ = GpuMemoryReference::Create(dev(), createInfo); if (nullptr == memRef_) { @@ -477,16 +473,16 @@ bool Resource::CreateImage(CreateParams* params) imgCreateInfo.arraySize = 1; switch (desc_.topology_) { - case CL_MEM_OBJECT_IMAGE3D: - imgCreateInfo.imageType = Pal::ImageType::Tex3d; - viewInfo.viewType = Pal::ImageViewType::Tex3d; - break; - case CL_MEM_OBJECT_IMAGE1D: - case CL_MEM_OBJECT_IMAGE1D_ARRAY: - case CL_MEM_OBJECT_IMAGE1D_BUFFER: - imgCreateInfo.imageType = Pal::ImageType::Tex1d; - viewInfo.viewType = Pal::ImageViewType::Tex1d; - break; + case CL_MEM_OBJECT_IMAGE3D: + imgCreateInfo.imageType = Pal::ImageType::Tex3d; + viewInfo.viewType = Pal::ImageViewType::Tex3d; + break; + case CL_MEM_OBJECT_IMAGE1D: + case CL_MEM_OBJECT_IMAGE1D_ARRAY: + case CL_MEM_OBJECT_IMAGE1D_BUFFER: + imgCreateInfo.imageType = Pal::ImageType::Tex1d; + viewInfo.viewType = Pal::ImageViewType::Tex1d; + break; } if (desc_.topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { ImgSubresRange.numSlices = imgCreateInfo.arraySize = desc_.height_; @@ -504,8 +500,7 @@ bool Resource::CreateImage(CreateParams* params) ImgSubresRange.startSubres.arraySlice = imageView->layer_; viewOwner_ = imageView->resource_; image_ = viewOwner_->image_; - } - else if (memoryType() == ImageBuffer) { + } else if (memoryType() == ImageBuffer) { ImageBufferParams* imageBuffer = reinterpret_cast(params); viewOwner_ = imageBuffer->resource_; } @@ -515,11 +510,11 @@ bool Resource::CreateImage(CreateParams* params) ImgSubresRange.numMips = desc().mipLevels_; if ((memoryType() != ImageView) || - //! @todo PAL doesn't allow an SRD view creation with different pixel size - (elementSize() != viewOwner_->elementSize())) { + //! @todo PAL doesn't allow an SRD view creation with different pixel size + (elementSize() != viewOwner_->elementSize())) { imgCreateInfo.usageFlags.shaderRead = true; imgCreateInfo.usageFlags.shaderWrite = - (format == Pal::ChNumFormat::X8Y8Z8W8_Srgb) ? false : true; + (format == Pal::ChNumFormat::X8Y8Z8W8_Srgb) ? false : true; imgCreateInfo.swizzledFormat.format = format; imgCreateInfo.swizzledFormat.swizzle = channels; imgCreateInfo.mipLevels = (desc_.mipLevels_) ? desc_.mipLevels_ : 1; @@ -529,10 +524,9 @@ bool Resource::CreateImage(CreateParams* params) uint32_t rowPitch = 0; if (((memoryType() == Persistent) && dev().settings().linearPersistentImage_) || - (memoryType() == ImageBuffer)) { + (memoryType() == ImageBuffer)) { tiling = Pal::ImageTiling::Linear; - } - else if (memoryType() == ImageView) { + } else if (memoryType() == ImageView) { tiling = viewOwner_->image_->GetImageCreateInfo().tiling; // Find the new pitch in pixels for the new format rowPitch = viewOwner_->desc().pitch_ * viewOwner_->elementSize() / elementSize(); @@ -540,10 +534,9 @@ bool Resource::CreateImage(CreateParams* params) if (memoryType() == ImageBuffer) { if ((params->owner_ != NULL) && params->owner_->asImage() && - (params->owner_->asImage()->getRowPitch() != 0)) { + (params->owner_->asImage()->getRowPitch() != 0)) { rowPitch = params->owner_->asImage()->getRowPitch() / elementSize(); - } - else { + } else { rowPitch = desc().width_; } } @@ -579,8 +572,8 @@ bool Resource::CreateImage(CreateParams* params) createInfo.priority = Pal::GpuMemPriority::Normal; memTypeToHeap(&createInfo); - memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, - createInfo.alignment, nullptr, &subOffset_); + memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment, + nullptr, &subOffset_); if (nullptr == memRef_) { memRef_ = GpuMemoryReference::Create(dev(), createInfo); if (nullptr == memRef_) { @@ -589,8 +582,7 @@ bool Resource::CreateImage(CreateParams* params) } } offset_ += static_cast(subOffset_); - } - else { + } else { memRef_ = viewOwner_->memRef_; memRef_->retain(); desc_.cardMemory_ = viewOwner_->desc().cardMemory_; @@ -627,11 +619,10 @@ bool Resource::CreateImage(CreateParams* params) } // ================================================================================================ -bool Resource::CreateInterop(CreateParams* params) -{ +bool Resource::CreateInterop(CreateParams* params) { Pal::Result result; - Pal::SubresId ImgSubresId = { Pal::ImageAspect::Color, 0, 0 }; - Pal::SubresRange ImgSubresRange = { ImgSubresId, 1, 1 }; + Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, 0, 0}; + Pal::SubresRange ImgSubresRange = {ImgSubresId, 1, 1}; Pal::ChannelMapping channels; Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels); Pal::ExternalGpuMemoryOpenInfo gpuMemOpenInfo = {}; @@ -645,21 +636,21 @@ bool Resource::CreateInterop(CreateParams* params) OGLInteropParams* oglRes = reinterpret_cast(params); assert(oglRes->glPlatformContext_ && "We don't have OGL context!"); switch (oglRes->type_) { - case InteropVertexBuffer: - glType_ = GL_RESOURCE_ATTACH_VERTEXBUFFER_AMD; - break; - case InteropRenderBuffer: - glType_ = GL_RESOURCE_ATTACH_RENDERBUFFER_AMD; - break; - case InteropTexture: - case InteropTextureViewLevel: - case InteropTextureViewCube: - glType_ = GL_RESOURCE_ATTACH_TEXTURE_AMD; - break; - default: - LogError("Unknown OGL interop type!"); - return false; - break; + case InteropVertexBuffer: + glType_ = GL_RESOURCE_ATTACH_VERTEXBUFFER_AMD; + break; + case InteropRenderBuffer: + glType_ = GL_RESOURCE_ATTACH_RENDERBUFFER_AMD; + break; + case InteropTexture: + case InteropTextureViewLevel: + case InteropTextureViewCube: + glType_ = GL_RESOURCE_ATTACH_TEXTURE_AMD; + break; + default: + LogError("Unknown OGL interop type!"); + return false; + break; } glPlatformContext_ = oglRes->glPlatformContext_; layer = oglRes->layer_; @@ -667,17 +658,18 @@ bool Resource::CreateInterop(CreateParams* params) mipLevel = oglRes->mipLevel_; if (!dev().resGLAssociate(oglRes->glPlatformContext_, oglRes->handle_, glType_, - &openInfo.hExternalResource, &glInteropMbRes_, &offset_, desc_.format_ + &openInfo.hExternalResource, &glInteropMbRes_, &offset_, desc_.format_ #ifdef ATI_OS_WIN - , openInfo.doppDesktopInfo + , + openInfo.doppDesktopInfo #endif - )) { + )) { return false; } desc_.isDoppTexture_ = (openInfo.doppDesktopInfo.gpuVirtAddr != 0); format = dev().getPalFormat(desc().format_, &channels); } -#ifdef ATI_OS_WIN +#ifdef ATI_OS_WIN else { D3DInteropParams* d3dRes = reinterpret_cast(params); openInfo.hExternalResource = d3dRes->handle_; @@ -713,8 +705,8 @@ bool Resource::CreateInterop(CreateParams* params) size_t gpuMemSize; if (Pal::Result::Success != - dev().iDev()->GetExternalSharedImageSizes(imgOpenInfo, &imageSize, &gpuMemSize, - &imgCreateInfo)) { + dev().iDev()->GetExternalSharedImageSizes(imgOpenInfo, &imageSize, &gpuMemSize, + &imgCreateInfo)) { return false; } @@ -736,51 +728,51 @@ bool Resource::CreateInterop(CreateParams* params) imgCreateInfo.depthPitch = desc().height_ * imgCreateInfo.rowPitch; switch (misc) { - case 1: // NV12 or P010 formats - switch (layer) { - case -1: - case 0: + case 1: // NV12 or P010 formats + switch (layer) { + case -1: + case 0: + break; + case 1: + // Y - plane size to the offset + // NV12 format. UV is 2 times smaller plane Y + viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_; + imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_; + break; + default: + LogError("Unknown Interop View Type"); + return false; + } break; - case 1: - // Y - plane size to the offset - // NV12 format. UV is 2 times smaller plane Y - viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_; + case 2: // YV12 format + switch (layer) { + case -1: + case 0: + break; + case 1: + // Y - plane size to the offset + // YV12 format. U is 4 times smaller plane than Y + viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_; + imgCreateInfo.rowPitch >>= 1; + break; + case 2: + // Y + U plane sizes to the offest. + // U plane is 4 times smaller than Y and U == V + viewOffset = 5 * imgCreateInfo.rowPitch * desc().height_ / 2; + imgCreateInfo.rowPitch >>= 1; + break; + default: + LogError("Unknown Interop View Type"); + return false; + } + imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_; + break; + case 3: // YUY2 format imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_; break; default: LogError("Unknown Interop View Type"); return false; - } - break; - case 2: // YV12 format - switch (layer) { - case -1: - case 0: - break; - case 1: - // Y - plane size to the offset - // YV12 format. U is 4 times smaller plane than Y - viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_; - imgCreateInfo.rowPitch >>= 1; - break; - case 2: - // Y + U plane sizes to the offest. - // U plane is 4 times smaller than Y and U == V - viewOffset = 5 * imgCreateInfo.rowPitch * desc().height_ / 2; - imgCreateInfo.rowPitch >>= 1; - break; - default: - LogError("Unknown Interop View Type"); - return false; - } - imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_; - break; - case 3: // YUY2 format - imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_; - break; - default: - LogError("Unknown Interop View Type"); - return false; } imageSize = dev().iDev()->GetImageSize(imgCreateInfo, &result); @@ -820,8 +812,7 @@ bool Resource::CreateInterop(CreateParams* params) hwState_[10] = static_cast(desc().width_); hwState_[11] = 0; // one extra reserved field in the argument } - } - else if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) { + } else if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) { memRef_ = GpuMemoryReference::Create(dev(), gpuMemOpenInfo); if (nullptr == memRef_) { return false; @@ -842,8 +833,7 @@ bool Resource::CreateInterop(CreateParams* params) hwState_[9] = GetHSAILImageOrderType(desc().format_); hwState_[10] = static_cast(desc().width_); hwState_[11] = 0; // one extra reserved field in the argument - } - else { + } else { Pal::ExternalImageOpenInfo imgOpenInfo = {}; Pal::ImageCreateInfo imgCreateInfo = {}; imgOpenInfo.resourceInfo = openInfo; @@ -865,14 +855,14 @@ bool Resource::CreateInterop(CreateParams* params) viewInfo.possibleLayouts.usages = Pal::LayoutShaderWrite; viewInfo.viewType = Pal::ImageViewType::Tex2d; switch (imgCreateInfo.imageType) { - case Pal::ImageType::Tex3d: - viewInfo.viewType = Pal::ImageViewType::Tex3d; - break; - case Pal::ImageType::Tex1d: - viewInfo.viewType = Pal::ImageViewType::Tex1d; - break; - default: - break; + case Pal::ImageType::Tex3d: + viewInfo.viewType = Pal::ImageViewType::Tex3d; + break; + case Pal::ImageType::Tex1d: + viewInfo.viewType = Pal::ImageViewType::Tex1d; + break; + default: + break; } viewInfo.pImage = image_; viewInfo.swizzledFormat.format = format; @@ -897,14 +887,13 @@ bool Resource::CreateInterop(CreateParams* params) //! It's a workaround for D24S8 format, since PAL doesn't support this format //! and GSL decompresses 24bit DEPTH into D24S8 for OGL compatibility if ((desc().format_.image_channel_order == CL_DEPTH_STENCIL) && - (desc().format_.image_channel_data_type == CL_UNORM_INT24)) { - if (dev().settings().gfx10Plus_) { - hwState_[1] = (hwState_[1] & ~0x1ff00000) | 0x08d00000; - } - else { - hwState_[1] &= ~0x3c000000; - hwState_[1] = (hwState_[1] & ~0x3f00000) | 0x1400000; - } + (desc().format_.image_channel_data_type == CL_UNORM_INT24)) { + if (dev().settings().gfx10Plus_) { + hwState_[1] = (hwState_[1] & ~0x1ff00000) | 0x08d00000; + } else { + hwState_[1] &= ~0x3c000000; + hwState_[1] = (hwState_[1] & ~0x3f00000) | 0x1400000; + } } hwState_[8] = GetHSAILImageFormatType(desc().format_); hwState_[9] = GetHSAILImageOrderType(desc().format_); @@ -915,8 +904,7 @@ bool Resource::CreateInterop(CreateParams* params) } // ================================================================================================ -bool Resource::CreatePinned(CreateParams* params) -{ +bool Resource::CreatePinned(CreateParams* params) { PinnedParams* pinned = reinterpret_cast(params); size_t allocSize = pinned->size_; const amd::HostMemoryReference* hostMemRef = pinned->hostMemRef_; @@ -926,7 +914,7 @@ bool Resource::CreatePinned(CreateParams* params) if (desc().topology_ == CL_MEM_OBJECT_BUFFER) { // Allign offset to 4K boundary (Vista/Win7 limitation) char* tmpHost = const_cast( - amd::alignDown(reinterpret_cast(address_), PinnedMemoryAlignment)); + amd::alignDown(reinterpret_cast(address_), PinnedMemoryAlignment)); // Find the partial size for unaligned copy hostMemOffset = static_cast(reinterpret_cast(address_) - tmpHost); @@ -940,18 +928,16 @@ bool Resource::CreatePinned(CreateParams* params) } allocSize = amd::alignUp(allocSize, PinnedMemoryAlignment); // hostMemOffset &= ~(0xff); - } - else if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D) { + } else if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D) { //! @todo: Width has to be aligned for 3D. //! Need to be replaced with a compute copy // Width aligned by 8 texels if (((desc().width_ % 0x8) != 0) || - // Pitch aligned by 64 bytes - (((desc().width_ * elementSize()) % 0x40) != 0)) { + // Pitch aligned by 64 bytes + (((desc().width_ * elementSize()) % 0x40) != 0)) { return false; } - } - else { + } else { //! @todo GSL doesn't support pinning with resAlloc_ return false; } @@ -978,8 +964,7 @@ bool Resource::CreatePinned(CreateParams* params) } // ================================================================================================ -bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr) -{ +bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr) { const bool isFineGrain = (memoryType() == RemoteUSWC) || (memoryType() == Remote); size_t allocSize = amd::alignUp(desc().width_ * elementSize_, dev().properties().gpuMemoryProperties.fragmentSize); @@ -991,20 +976,18 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr) if (svmPtr != 0) { createInfo.flags.useReservedGpuVa = true; createInfo.pReservedGpuVaOwner = params->svmBase_->iMem(); - } - else { + } else { createInfo.flags.useReservedGpuVa = false; createInfo.pReservedGpuVaOwner = nullptr; } if (!dev().settings().svmFineGrainSystem_) { - memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, - createInfo.alignment, createInfo.pReservedGpuVaOwner, &subOffset_); + memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment, + createInfo.pReservedGpuVaOwner, &subOffset_); } if (memRef_ == nullptr) { memRef_ = GpuMemoryReference::Create(dev(), createInfo); } - } - else { + } else { Pal::GpuMemoryCreateInfo createInfo = {}; createInfo.size = allocSize; createInfo.alignment = MaxGpuAlignment; @@ -1015,8 +998,8 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr) createInfo.pReservedGpuVaOwner = params->svmBase_->iMem(); } memTypeToHeap(&createInfo); - memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, - createInfo.alignment, createInfo.pReservedGpuVaOwner, &subOffset_); + memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment, + createInfo.pReservedGpuVaOwner, &subOffset_); if (memRef_ == nullptr) { createInfo.alignment = dev().properties().gpuMemoryProperties.fragmentSize; memRef_ = GpuMemoryReference::Create(dev(), createInfo); @@ -1028,9 +1011,9 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr) } desc_.cardMemory_ = false; if ((nullptr != params) && (nullptr != params->owner_) && - (nullptr != params->owner_->getSvmPtr())) { + (nullptr != params->owner_->getSvmPtr())) { params->owner_->setSvmPtr( - reinterpret_cast(memRef_->iMem()->Desc().gpuVirtAddr + subOffset_)); + reinterpret_cast(memRef_->iMem()->Desc().gpuVirtAddr + subOffset_)); offset_ += static_cast(subOffset_); } return true; @@ -1126,18 +1109,18 @@ bool Resource::create(MemoryType memType, CreateParams* params) { Pal::gpusize svmPtr = 0; if ((nullptr != params) && (nullptr != params->owner_) && (nullptr != params->owner_->getSvmPtr())) { - svmPtr = reinterpret_cast(params->owner_->getSvmPtr()); - desc_.SVMRes_ = true; - svmPtr = (svmPtr == 1) ? 0 : svmPtr; + svmPtr = reinterpret_cast(params->owner_->getSvmPtr()); + desc_.SVMRes_ = true; + svmPtr = (svmPtr == 1) ? 0 : svmPtr; } if (desc_.SVMRes_) { - return CreateSvm(params, svmPtr); + return CreateSvm(params, svmPtr); } Pal::GpuMemoryCreateInfo createInfo = {}; createInfo.size = desc().width_ * elementSize_; createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment); - createInfo.alignment = desc().scratch_ ? 64*Ki : MaxGpuAlignment; + createInfo.alignment = desc().scratch_ ? 64 * Ki : MaxGpuAlignment; createInfo.vaRange = Pal::VaRange::Default; createInfo.priority = Pal::GpuMemPriority::Normal; @@ -1152,8 +1135,8 @@ bool Resource::create(MemoryType memType, CreateParams* params) { memTypeToHeap(&createInfo); // createInfo.priority; - memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, - createInfo.alignment, nullptr, &subOffset_); + memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment, + nullptr, &subOffset_); if (nullptr == memRef_) { memRef_ = GpuMemoryReference::Create(dev(), createInfo); if (nullptr == memRef_) { @@ -1172,14 +1155,13 @@ bool Resource::create(MemoryType memType, CreateParams* params) { } // ================================================================================================ -void Resource::free() -{ +void Resource::free() { if (memRef_ == nullptr) { return; } const bool wait = - (memoryType() != ImageView) && (memoryType() != ImageBuffer) && (memoryType() != View); + (memoryType() != ImageView) && (memoryType() != ImageBuffer) && (memoryType() != View); // OCL has to wait, even if resource is placed in the cache, since reallocation can occur // and resource can be reused on another async queue without a wait on a busy operation @@ -1190,8 +1172,7 @@ void Resource::free() for (uint idx = 1; idx < dev().vgpus().size(); ++idx) { dev().vgpus()[idx]->waitForEvent(&events_[idx]); } - } - else { + } else { amd::ScopedLock l(memRef_->gpu_->execution()); memRef_->gpu_->waitForEvent(&events_[memRef_->gpu_->index()]); } @@ -1232,8 +1213,7 @@ void Resource::free() // ================================================================================================ void Resource::writeRawData(VirtualGPU& gpu, size_t offset, size_t size, const void* data, - bool waitForEvent) const -{ + bool waitForEvent) const { GpuEvent event; // Write data size bytes to surface @@ -1242,7 +1222,7 @@ void Resource::writeRawData(VirtualGPU& gpu, size_t offset, size_t size, const v gpu.eventBegin(MainEngine); gpu.queue(MainEngine).addCmdMemRef(memRef()); gpu.iCmd()->CmdUpdateMemory(*iMem(), offset_ + offset, size, - reinterpret_cast(data)); + reinterpret_cast(data)); gpu.eventEnd(MainEngine, event); if (waitForEvent) { @@ -1259,8 +1239,7 @@ void Resource::writeRawData(VirtualGPU& gpu, size_t offset, size_t size, const v } // ================================================================================================ -static const Pal::ChNumFormat ChannelFmt(uint bytesPerElement) -{ +static const Pal::ChNumFormat ChannelFmt(uint bytesPerElement) { if (bytesPerElement == 16) { return Pal::ChNumFormat::X32Y32Z32W32_Uint; } else if (bytesPerElement == 8) { @@ -1292,8 +1271,7 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin, if (desc().buffer_ && !dstResource.desc().buffer_) { imageOffsetx = dstOrigin[0] % dstResource.elementSize(); gpuMemoryOffset = srcOrigin[0] + offset(); - gpuMemoryRowPitch = - (srcOrigin[1]) ? srcOrigin[1] : size[0] * dstResource.elementSize(); + gpuMemoryRowPitch = (srcOrigin[1]) ? srcOrigin[1] : size[0] * dstResource.elementSize(); img1Darray = (dstResource.desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY); img2Darray = (dstResource.desc().topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY); } else if (!desc().buffer_ && dstResource.desc().buffer_) { @@ -1374,7 +1352,8 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin, } copyRegion.gpuMemoryOffset = gpuMemoryOffset; copyRegion.gpuMemoryRowPitch = gpuMemoryRowPitch; - copyRegion.gpuMemoryDepthPitch = (dstOrigin[2]) ? dstOrigin[2] + copyRegion.gpuMemoryDepthPitch = (dstOrigin[2]) + ? dstOrigin[2] : copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height; gpu.iCmd()->CmdCopyImageToMemory(*image_, imgLayout, *dstResource.iMem(), 1, ©Region); } else { @@ -1819,17 +1798,14 @@ void Resource::unmap(VirtualGPU* gpu) { } // ================================================================================================ -void Resource::unmapLayers(VirtualGPU* gpu) { - Unimplemented(); -} +void Resource::unmapLayers(VirtualGPU* gpu) { Unimplemented(); } // ================================================================================================ bool MemorySubAllocator::InitAllocator(GpuMemoryReference* mem_ref) { - MemBuddyAllocator* allocator = new MemBuddyAllocator( - device_, device_->settings().subAllocationChunkSize_, - device_->settings().subAllocationMinSize_); - if (!((allocator != nullptr) && - (allocator->Init() == Pal::Result::Success) && + MemBuddyAllocator* allocator = + new MemBuddyAllocator(device_, device_->settings().subAllocationChunkSize_, + device_->settings().subAllocationMinSize_); + if (!((allocator != nullptr) && (allocator->Init() == Pal::Result::Success) && heaps_.insert({mem_ref, allocator}).second)) { mem_ref->release(); delete allocator; @@ -1890,8 +1866,7 @@ bool FineMemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) { } // ================================================================================================ -MemorySubAllocator::~MemorySubAllocator() -{ +MemorySubAllocator::~MemorySubAllocator() { // Release memory heap for suballocations for (const auto& it : heaps_) { it.first->release(); @@ -1901,8 +1876,8 @@ MemorySubAllocator::~MemorySubAllocator() // ================================================================================================ GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize alignment, - const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset) -{ + const Pal::IGpuMemory* reserved_va, + Pal::gpusize* offset) { GpuMemoryReference* mem_ref = nullptr; MemBuddyAllocator* allocator = nullptr; // Check if the resource size and alignment are allowed for suballocation @@ -1927,7 +1902,7 @@ GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize } // We didn't find a valid chunk, so create a new one if (!CreateChunk(reserved_va)) { - return nullptr; + return nullptr; } i++; } while (i < 2); @@ -1936,8 +1911,7 @@ GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize } // ================================================================================================ -bool MemorySubAllocator::Free(amd::Monitor* monitor, GpuMemoryReference* ref, Pal::gpusize offset) -{ +bool MemorySubAllocator::Free(amd::Monitor* monitor, GpuMemoryReference* ref, Pal::gpusize offset) { bool release_mem = false; { amd::ScopedLock l(monitor); @@ -1966,9 +1940,8 @@ ResourceCache::~ResourceCache() { free(); } // ================================================================================================ //! \note the cache works in FILO mode -bool ResourceCache::addGpuMemory(Resource::Descriptor* desc, - GpuMemoryReference* ref, Pal::gpusize offset) -{ +bool ResourceCache::addGpuMemory(Resource::Descriptor* desc, GpuMemoryReference* ref, + Pal::gpusize offset) { bool result = false; size_t size = ref->iMem()->Desc().size; @@ -2017,7 +1990,9 @@ bool ResourceCache::addGpuMemory(Resource::Descriptor* desc, // ================================================================================================ GpuMemoryReference* ResourceCache::findGpuMemory(Resource::Descriptor* desc, Pal::gpusize size, - Pal::gpusize alignment, const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset) { + Pal::gpusize alignment, + const Pal::IGpuMemory* reserved_va, + Pal::gpusize* offset) { amd::ScopedLock l(&lockCacheOps_); GpuMemoryReference* ref = nullptr; @@ -2051,7 +2026,7 @@ GpuMemoryReference* ResourceCache::findGpuMemory(Resource::Descriptor* desc, Pal ref = it.second; cacheSize_ -= sizeRes; if (entry->type_ == Resource::Local) { - lclCacheSize_ -= sizeRes; + lclCacheSize_ -= sizeRes; } delete it.first; // Remove the found etry from the cache @@ -2078,8 +2053,7 @@ bool ResourceCache::free(size_t minCacheEntries) { } // ================================================================================================ -void ResourceCache::removeLast() -{ +void ResourceCache::removeLast() { std::pair entry; { // Protect access to the global data diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.hpp b/projects/clr/rocclr/runtime/device/pal/palresource.hpp index 9b4c63f24a..c2fb0bcad0 100644 --- a/projects/clr/rocclr/runtime/device/pal/palresource.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palresource.hpp @@ -41,11 +41,11 @@ class GpuMemoryReference : public amd::ReferenceCountedObject { //! Get PAL memory object Pal::IGpuMemory* iMem() const { return gpuMem_; } - Pal::IGpuMemory* gpuMem_; //!< PAL GPU memory object - void* cpuAddress_; //!< CPU address of this memory - const Device& device_; //!< GPU device + Pal::IGpuMemory* gpuMem_; //!< PAL GPU memory object + void* cpuAddress_; //!< CPU address of this memory + const Device& device_; //!< GPU device //! @note: This field is necessary for the thread safe release only - VirtualGPU* gpu_; //!< Resource will be used only on this queue + VirtualGPU* gpu_; //!< Resource will be used only on this queue protected: //! Default destructor @@ -186,7 +186,7 @@ class Resource : public amd::HeapObject { //! Constructor of 1D Resource object Resource(const Device& gpuDev, //!< GPU device object size_t size //!< Resource size - ); + ); //! Constructor of Image Resource object Resource(const Device& gpuDev, //!< GPU device object @@ -196,7 +196,7 @@ class Resource : public amd::HeapObject { cl_image_format format, //!< resource format cl_mem_object_type imageType, //!< CL image type uint mipLevels = 1 //!< Number of mip levels - ); + ); //! Destructor of the resource virtual ~Resource(); @@ -207,7 +207,7 @@ class Resource : public amd::HeapObject { */ virtual bool create(MemoryType memType, //!< memory type CreateParams* params = 0 //!< special parameters for resource allocation - ); + ); /*! \brief Copies a subregion of memory from one resource to another * @@ -253,14 +253,13 @@ class Resource : public amd::HeapObject { Pal::IGpuMemory* iMem() const { return memRef_->iMem(); } //! Returns a pointer to the memory reference - GpuMemoryReference* memRef() const {return memRef_; } + GpuMemoryReference* memRef() const { return memRef_; } //! Returns global memory offset uint64_t vmAddress() const { return iMem()->Desc().gpuVirtAddr + offset_; } //! Returns global memory offset - uint64_t vmSize() const - { return desc_.width_ * desc_.height_ * desc_.depth_ * elementSize(); } + uint64_t vmSize() const { return desc_.width_ * desc_.height_ * desc_.depth_ * elementSize(); } //! Returns global memory offset bool mipMapped() const { return (desc().mipLevels_ > 1) ? true : false; } @@ -279,11 +278,11 @@ class Resource : public amd::HeapObject { // Optimization for multilayer map/unmap uint startLayer = 0, //!< Start layer for multilayer map uint numLayers = 0 //!< End layer for multilayer map - ); + ); //! Unlocks the resource if it was locked void unmap(VirtualGPU* gpu //!< Virtual GPU device object - ); + ); //! Marks the resource as busy void setBusy(VirtualGPU& gpu, //!< Virtual GPU device object @@ -303,7 +302,7 @@ class Resource : public amd::HeapObject { uint flags = 0, //!< Map flags size_t rowPitch = 0, //!< Raw data row pitch size_t slicePitch = 0 //!< Raw data slice pitch - ); + ); //! Performs host read from the resource GPU memory bool hostRead(VirtualGPU* gpu, //!< Virtual GPU device object @@ -312,7 +311,7 @@ class Resource : public amd::HeapObject { const amd::Coord3D& size, //!< The number of bytes to write size_t rowPitch = 0, //!< Raw data row pitch size_t slicePitch = 0 //!< Raw data slice pitch - ); + ); //! Gets the resource element size uint elementSize() const { return elementSize_; } @@ -377,7 +376,7 @@ class Resource : public amd::HeapObject { memRef_ = viewOwner_->memRef_; memRef_->retain(); desc_.width_ = amd::alignUp(size, Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint)) / - Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint); + Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint); setBusy(*memRef()->gpu_, GpuEvent::InvalidID); } } @@ -390,33 +389,32 @@ class Resource : public amd::HeapObject { protected: /*! \brief Creates a PAL iamge object, associated with the resource - * - * \return True if we succesfully created a PAL resource - */ - bool CreateImage(CreateParams* params //!< special parameters for resource allocation - ); + * + * \return True if we succesfully created a PAL resource + */ + bool CreateImage(CreateParams* params //!< special parameters for resource allocation + ); /*! \brief Creates a PAL interop object, associated with the resource - * - * \return True if we succesfully created a PAL interop resource - */ - bool CreateInterop(CreateParams* params //!< special parameters for resource allocation - ); + * + * \return True if we succesfully created a PAL interop resource + */ + bool CreateInterop(CreateParams* params //!< special parameters for resource allocation + ); /*! \brief Creates a PAL pinned object, associated with the resource - * - * \return True if we succesfully created a PAL pinned resource - */ - bool CreatePinned(CreateParams* params //!< special parameters for resource allocation - ); + * + * \return True if we succesfully created a PAL pinned resource + */ + bool CreatePinned(CreateParams* params //!< special parameters for resource allocation + ); /*! \brief Creates a PAL SVM object, associated with the resource - * - * \return True if we succesfully created a PAL SVM resource - */ + * + * \return True if we succesfully created a PAL SVM resource + */ bool CreateSvm(CreateParams* params, //!< special parameters for resource allocation - Pal::gpusize svmPtr - ); + Pal::gpusize svmPtr); uint elementSize_; //!< Size of a single element in bytes @@ -433,11 +431,11 @@ class Resource : public amd::HeapObject { */ void* mapLayers(VirtualGPU* gpu, //!< Virtual GPU device object uint flags = 0 //!< flags for the map operation - ); + ); //! Unlocks the resource with layers if it was locked void unmapLayers(VirtualGPU* gpu //!< Virtual GPU device object - ); + ); //! Calls PAL to map a resource void* gpuMemoryMap(size_t* pitch, //!< Pitch value for the image @@ -454,7 +452,7 @@ class Resource : public amd::HeapObject { //! Converts Resource memory type to the PAL heaps void memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo //!< Memory create info - ); + ); const Device& gpuDevice_; //!< GPU device Descriptor desc_; //!< Descriptor for this resource @@ -462,7 +460,7 @@ class Resource : public amd::HeapObject { void* address_; //!< Physical address of this resource size_t offset_; //!< Resource offset GpuMemoryReference* memRef_; //!< PAL resource reference - Pal::gpusize subOffset_; //!< GPU memory offset in the oririnal resource + Pal::gpusize subOffset_; //!< GPU memory offset in the oririnal resource const Resource* viewOwner_; //!< GPU resource, which owns this view void* glInteropMbRes_; //!< Mb Res handle uint32_t glType_; //!< GL interop type @@ -485,41 +483,35 @@ class Resource : public amd::HeapObject { typedef Util::BuddyAllocator MemBuddyAllocator; class MemorySubAllocator : public amd::HeapObject { -public: + public: MemorySubAllocator(Device* device) : device_(device) {} ~MemorySubAllocator(); //! Create suballocation - GpuMemoryReference* Allocate(Pal::gpusize size, - Pal::gpusize alignment, - const Pal::IGpuMemory* reserved_va, - Pal::gpusize* offset - ); + GpuMemoryReference* Allocate(Pal::gpusize size, Pal::gpusize alignment, + const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset); //! Free suballocation - bool Free(amd::Monitor* monitor, - GpuMemoryReference* mem_ref, - Pal::gpusize offset - ); + bool Free(amd::Monitor* monitor, GpuMemoryReference* mem_ref, Pal::gpusize offset); -protected: + protected: //! Allocate new chunk of memory virtual bool CreateChunk(const Pal::IGpuMemory* reserved_va); bool InitAllocator(GpuMemoryReference* mem_ref); Device* device_; - std::unordered_map heaps_; + std::unordered_map heaps_; }; class CoarseMemorySubAllocator : public MemorySubAllocator { -public: + public: CoarseMemorySubAllocator(Device* device) : MemorySubAllocator(device) {} bool CreateChunk(const Pal::IGpuMemory* reservedVa) override; }; class FineMemorySubAllocator : public MemorySubAllocator { -public: + public: FineMemorySubAllocator(Device* device) : MemorySubAllocator(device) {} bool CreateChunk(const Pal::IGpuMemory* reserved_va) override; @@ -529,29 +521,28 @@ class ResourceCache : public amd::HeapObject { public: //! Default constructor ResourceCache(Device* device, size_t cacheSizeLimit) - : lockCacheOps_("PAL resource cache", true) - , cacheSize_(0) - , lclCacheSize_(0) - , cacheSizeLimit_(cacheSizeLimit) - , mem_sub_alloc_local_(device) - , mem_sub_alloc_coarse_ (device) - , mem_sub_alloc_fine_ (device) {} + : lockCacheOps_("PAL resource cache", true), + cacheSize_(0), + lclCacheSize_(0), + cacheSizeLimit_(cacheSizeLimit), + mem_sub_alloc_local_(device), + mem_sub_alloc_coarse_(device), + mem_sub_alloc_fine_(device) {} //! Default destructor ~ResourceCache(); //! Adds a PAL resource to the cache - bool addGpuMemory(Resource::Descriptor* desc, //!< Resource descriptor - cache key - GpuMemoryReference* ref, //!< Resource reference - Pal::gpusize offset //!< Original resource offset - ); + bool addGpuMemory(Resource::Descriptor* desc, //!< Resource descriptor - cache key + GpuMemoryReference* ref, //!< Resource reference + Pal::gpusize offset //!< Original resource offset + ); //! Finds a PAL resource from the cache GpuMemoryReference* findGpuMemory( Resource::Descriptor* desc, //!< Resource descriptor - cache key - Pal::gpusize size, - Pal::gpusize alignment, - const Pal::IGpuMemory* reserved_va, //!< Reserved VA for SVM suballocations + Pal::gpusize size, Pal::gpusize alignment, + const Pal::IGpuMemory* reserved_va, //!< Reserved VA for SVM suballocations Pal::gpusize* offset); //! Destroys cache @@ -576,16 +567,17 @@ class ResourceCache : public amd::HeapObject { amd::Monitor lockCacheOps_; //!< Lock to serialise cache access - size_t cacheSize_; //!< Current cache size in bytes - size_t lclCacheSize_; //!< Local memory stored in the cache - const size_t cacheSizeLimit_; //!< Cache size limit in bytes + size_t cacheSize_; //!< Current cache size in bytes + size_t lclCacheSize_; //!< Local memory stored in the cache + const size_t cacheSizeLimit_; //!< Cache size limit in bytes //! PAL resource cache std::list > resCache_; - MemorySubAllocator mem_sub_alloc_local_; //!< Allocator for suballocations in Local - CoarseMemorySubAllocator mem_sub_alloc_coarse_; //!< Allocator for suballocations in Coarse SVM - FineMemorySubAllocator mem_sub_alloc_fine_; //!< Allocator for suballocations in Fine SVM + MemorySubAllocator mem_sub_alloc_local_; //!< Allocator for suballocations in Local + CoarseMemorySubAllocator mem_sub_alloc_coarse_; //!< Allocator for suballocations in Coarse SVM + FineMemorySubAllocator mem_sub_alloc_fine_; //!< Allocator for suballocations in Fine SVM }; -/*@}*/} // namespace pal +/*@}*/ // namespace pal +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palsettings.cpp b/projects/clr/rocclr/runtime/device/pal/palsettings.cpp index c663831670..131bb4afed 100644 --- a/projects/clr/rocclr/runtime/device/pal/palsettings.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palsettings.cpp @@ -136,7 +136,7 @@ Settings::Settings() { subAllocationMinSize_ = 4 * Ki; subAllocationChunkSize_ = 64 * Mi; subAllocationMaxSize_ = - std::min(static_cast(GPU_MAX_SUBALLOC_SIZE) * Ki, subAllocationChunkSize_); + std::min(static_cast(GPU_MAX_SUBALLOC_SIZE) * Ki, subAllocationChunkSize_); maxCmdBuffers_ = 12; useLightning_ = GPU_ENABLE_LC; @@ -148,8 +148,7 @@ Settings::Settings() { bool Settings::create(const Pal::DeviceProperties& palProp, const Pal::GpuMemoryHeapProperties* heaps, const Pal::WorkStationCaps& wscaps, - bool reportAsOCL12Device) -{ + bool reportAsOCL12Device) { uint32_t osVer = 0x0; // Disable thread trace by default for all devices @@ -198,8 +197,9 @@ bool Settings::create(const Pal::DeviceProperties& palProp, case Pal::AsicRevision::Navi10Lite: gfx10Plus_ = true; useLightning_ = (!flagIsDefault(GPU_ENABLE_LC)) ? GPU_ENABLE_LC : true; - hsailExplicitXnack_ = static_cast(palProp.gpuMemoryProperties.flags.pageMigrationEnabled - || palProp.gpuMemoryProperties.flags.iommuv2Support); + hsailExplicitXnack_ = + static_cast(palProp.gpuMemoryProperties.flags.pageMigrationEnabled || + palProp.gpuMemoryProperties.flags.iommuv2Support); enableWgpMode_ = GPU_ENABLE_WGP_MODE; if (useLightning_) { enableWave32Mode_ = true; @@ -346,7 +346,7 @@ bool Settings::create(const Pal::DeviceProperties& palProp, if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) { splitSizeForWin7_ = true; // Update flag of DMA flush split size for Win 7 if (modifyMaxWorkload.time > 0) { - maxWorkloadTime_ = modifyMaxWorkload.time; // Update max workload time + maxWorkloadTime_ = modifyMaxWorkload.time; // Update max workload time } } #endif // defined(_WIN32) diff --git a/projects/clr/rocclr/runtime/device/pal/palsettings.hpp b/projects/clr/rocclr/runtime/device/pal/palsettings.hpp index b6e1d95441..6b8ee86768 100644 --- a/projects/clr/rocclr/runtime/device/pal/palsettings.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palsettings.hpp @@ -39,63 +39,63 @@ class Settings : public device::Settings { union { struct { - uint remoteAlloc_ : 1; //!< Allocate remote memory for the heap - uint stagedXferRead_ : 1; //!< Uses a staged buffer read - uint stagedXferWrite_ : 1; //!< Uses a staged buffer write - uint disablePersistent_ : 1; //!< Disables using persistent memory for staging - uint imageSupport_ : 1; //!< Report images support - uint doublePrecision_ : 1; //!< Enables double precision support - uint use64BitPtr_ : 1; //!< Use 64bit pointers on GPU - uint force32BitOcl20_ : 1; //!< Force 32bit apps to take CLANG/HSAIL path on GPU - uint imageDMA_ : 1; //!< Enable direct image DMA transfers - uint viPlus_ : 1; //!< VI and post VI features - uint aiPlus_ : 1; //!< AI and post AI features - uint gfx10Plus_ : 1; //!< gfx10 and post gfx10 features - uint threadTraceEnable_ : 1; //!< Thread trace enable - uint linearPersistentImage_ : 1; //!< Allocates linear images in persistent - uint useSingleScratch_ : 1; //!< Allocates single scratch per device - uint svmAtomics_ : 1; //!< SVM device atomics - uint svmFineGrainSystem_ : 1; //!< SVM fine grain system support - uint useDeviceQueue_ : 1; //!< Submit to separate device queue - uint sdamPageFaultWar_ : 1; //!< SDMA page fault workaround - uint rgpSqttWaitIdle_: 1; //!< Wait for idle after SQTT trace - uint rgpSqttForceDisable_: 1; //!< Disables SQTT - uint splitSizeForWin7_: 1; //!< DMA flush split size for Win 7 + uint remoteAlloc_ : 1; //!< Allocate remote memory for the heap + uint stagedXferRead_ : 1; //!< Uses a staged buffer read + uint stagedXferWrite_ : 1; //!< Uses a staged buffer write + uint disablePersistent_ : 1; //!< Disables using persistent memory for staging + uint imageSupport_ : 1; //!< Report images support + uint doublePrecision_ : 1; //!< Enables double precision support + uint use64BitPtr_ : 1; //!< Use 64bit pointers on GPU + uint force32BitOcl20_ : 1; //!< Force 32bit apps to take CLANG/HSAIL path on GPU + uint imageDMA_ : 1; //!< Enable direct image DMA transfers + uint viPlus_ : 1; //!< VI and post VI features + uint aiPlus_ : 1; //!< AI and post AI features + uint gfx10Plus_ : 1; //!< gfx10 and post gfx10 features + uint threadTraceEnable_ : 1; //!< Thread trace enable + uint linearPersistentImage_ : 1; //!< Allocates linear images in persistent + uint useSingleScratch_ : 1; //!< Allocates single scratch per device + uint svmAtomics_ : 1; //!< SVM device atomics + uint svmFineGrainSystem_ : 1; //!< SVM fine grain system support + uint useDeviceQueue_ : 1; //!< Submit to separate device queue + uint sdamPageFaultWar_ : 1; //!< SDMA page fault workaround + uint rgpSqttWaitIdle_ : 1; //!< Wait for idle after SQTT trace + uint rgpSqttForceDisable_ : 1; //!< Disables SQTT + uint splitSizeForWin7_ : 1; //!< DMA flush split size for Win 7 uint reserved_ : 11; }; uint value_; }; - uint oclVersion_; //!< Reported OpenCL version support - uint debugFlags_; //!< Debug GPU flags - uint hwLDSSize_; //!< HW local data store size - uint maxWorkGroupSize_; //!< Requested workgroup size for this device - uint preferredWorkGroupSize_;//!< Requested preferred workgroup size for this device - uint workloadSplitSize_; //!< Workload split size - uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms - uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms - uint blitEngine_; //!< Blit engine type - uint cacheLineSize_; //!< Cache line size in bytes - uint cacheSize_; //!< L1 cache size in bytes - uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings - uint numDeviceEvents_; //!< The number of device events - uint numWaitEvents_; //!< The number of wait events for device enqueue - uint hostMemDirectAccess_; //!< Enables direct access to the host memory - uint numScratchWavesPerCu_; //!< Maximum number of waves when scratch is enabled - size_t xferBufSize_; //!< Transfer buffer size for image copy optimization - size_t stagedXferSize_; //!< Staged buffer size - size_t pinnedXferSize_; //!< Pinned buffer size for transfer - size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer - size_t resourceCacheSize_; //!< Resource cache size in MB - size_t numMemDependencies_; //!< The array size for memory dependencies tracking - uint64_t maxAllocSize_; //!< Maximum single allocation size - uint rgpSqttDispCount_; //!< The number of dispatches captured in SQTT - uint maxCmdBuffers_; //!< Maximum number of command buffers allocated per queue + uint oclVersion_; //!< Reported OpenCL version support + uint debugFlags_; //!< Debug GPU flags + uint hwLDSSize_; //!< HW local data store size + uint maxWorkGroupSize_; //!< Requested workgroup size for this device + uint preferredWorkGroupSize_; //!< Requested preferred workgroup size for this device + uint workloadSplitSize_; //!< Workload split size + uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms + uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms + uint blitEngine_; //!< Blit engine type + uint cacheLineSize_; //!< Cache line size in bytes + uint cacheSize_; //!< L1 cache size in bytes + uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings + uint numDeviceEvents_; //!< The number of device events + uint numWaitEvents_; //!< The number of wait events for device enqueue + uint hostMemDirectAccess_; //!< Enables direct access to the host memory + uint numScratchWavesPerCu_; //!< Maximum number of waves when scratch is enabled + size_t xferBufSize_; //!< Transfer buffer size for image copy optimization + size_t stagedXferSize_; //!< Staged buffer size + size_t pinnedXferSize_; //!< Pinned buffer size for transfer + size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer + size_t resourceCacheSize_; //!< Resource cache size in MB + size_t numMemDependencies_; //!< The array size for memory dependencies tracking + uint64_t maxAllocSize_; //!< Maximum single allocation size + uint rgpSqttDispCount_; //!< The number of dispatches captured in SQTT + uint maxCmdBuffers_; //!< Maximum number of command buffers allocated per queue + + uint64_t subAllocationMinSize_; //!< Minimum size allowed for suballocations + uint64_t subAllocationMaxSize_; //!< Maximum size allowed with suballocations + uint64_t subAllocationChunkSize_; //!< Chunk size for suballocaitons - uint64_t subAllocationMinSize_; //!< Minimum size allowed for suballocations - uint64_t subAllocationMaxSize_; //!< Maximum size allowed with suballocations - uint64_t subAllocationChunkSize_; //!< Chunk size for suballocaitons - amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler //! Default constructor @@ -106,7 +106,7 @@ class Settings : public device::Settings { const Pal::GpuMemoryHeapProperties* heaps, //!< PAL heap settings const Pal::WorkStationCaps& wscaps, //!< PAL workstation settings bool reportAsOCL12Device = false //!< Report As OpenCL1.2 Device - ); + ); private: //! Disable copy constructor @@ -119,4 +119,5 @@ class Settings : public device::Settings { void override(); }; -/*@}*/} // namespace pal +/*@}*/ // namespace pal +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp b/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp index 97b2d5e5ca..9691fa71a2 100644 --- a/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp +++ b/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp @@ -40,7 +40,7 @@ class TimeStamp : public amd::HeapObject { Pal::IGpuMemory* iMem, //!< Buffer with the timer values uint memOffset, //!< Offset in the buffer for the current TS address cpuAddr //!< CPU pointer for the values in memory - ); + ); //! Default destructor ~TimeStamp(); @@ -114,4 +114,5 @@ class TimeStampCache : public amd::HeapObject { uint tsOffset_; //!< Active offset in the current mem object }; -/*@}*/} // namespace pal +/*@}*/ // namespace pal +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index 82e7372933..ff8bffefae 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -70,8 +70,7 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueTy if (qCreateInfo.engineType == Pal::EngineTypeExclusiveCompute) { if (it != gpu.dev().exclusiveComputeEnginesId().end()) { qCreateInfo.engineIndex = it->second; - } - else { + } else { return nullptr; } } @@ -97,8 +96,8 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueTy } size_t allocSize = qSize + max_command_buffers * (cmdSize + fSize); - VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(gpu, palDev, - residency_limit, max_command_buffers); + VirtualGPU::Queue* queue = + new (allocSize) VirtualGPU::Queue(gpu, palDev, residency_limit, max_command_buffers); if (queue != nullptr) { address addrQ = reinterpret_cast
(&queue[1]); // Create PAL queue object @@ -163,16 +162,16 @@ VirtualGPU::Queue::~Queue() { } } -Pal::Result VirtualGPU::Queue::UpdateAppPowerProfile() -{ - std::wstring wsAppPathAndFileName = Device::appProfile()->wsAppPathAndFileName(); +Pal::Result VirtualGPU::Queue::UpdateAppPowerProfile() { + std::wstring wsAppPathAndFileName = Device::appProfile()->wsAppPathAndFileName(); - const wchar_t* wAppPathAndName = wsAppPathAndFileName.c_str(); - // Find the last occurance of the '\\' character and extract the name of the application as wide char. - const wchar_t* wAppNamePtr = wcsrchr(wAppPathAndName, '\\'); - const wchar_t* wAppName = wAppNamePtr ? wAppNamePtr + 1 : wAppPathAndName; + const wchar_t* wAppPathAndName = wsAppPathAndFileName.c_str(); + // Find the last occurance of the '\\' character and extract the name of the application as wide + // char. + const wchar_t* wAppNamePtr = wcsrchr(wAppPathAndName, '\\'); + const wchar_t* wAppName = wAppNamePtr ? wAppNamePtr + 1 : wAppPathAndName; - return iQueue_->UpdateAppPowerProfile(wAppName, wAppPathAndName); + return iQueue_->UpdateAppPowerProfile(wAppName, wAppPathAndName); } void VirtualGPU::Queue::addCmdMemRef(GpuMemoryReference* mem) { @@ -188,8 +187,7 @@ void VirtualGPU::Queue::addCmdMemRef(GpuMemoryReference* mem) { memRef.pGpuMemory = iMem; palMemRefs_.push_back(memRef); // Check SDI memory object - if (iMem->Desc().flags.isExternPhys && - (sdiReferences_.find(iMem) == sdiReferences_.end())) { + if (iMem->Desc().flags.isExternPhys && (sdiReferences_.find(iMem) == sdiReferences_.end())) { sdiReferences_.insert(iMem); palSdiRefs_.push_back(iMem); } @@ -268,8 +266,7 @@ bool VirtualGPU::Queue::flush() { // Submit command buffer to OS Pal::Result result; if (gpu_.rgpCaptureEna()) { - result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit( - iQueue_, cmdBufIdCurrent_, submitInfo); + result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit(iQueue_, cmdBufIdCurrent_, submitInfo); } else { result = iQueue_->Submit(submitInfo); } @@ -383,28 +380,28 @@ void VirtualGPU::Queue::DumpMemoryReferences() const { if (dump.is_open()) { dump << start << " Queue: "; switch (iQueue_->Type()) { - case Pal::QueueTypeCompute: - dump << "Compute"; - break; - case Pal::QueueTypeDma: - dump << "SDMA"; - break; - default: - dump << "unknown"; - break; + case Pal::QueueTypeCompute: + dump << "Compute"; + break; + case Pal::QueueTypeDma: + dump << "SDMA"; + break; + default: + dump << "unknown"; + break; } dump << "\n" - << "Resident memory resources:\n"; + << "Resident memory resources:\n"; uint idx = 0; for (auto it : memReferences_) { dump << " " << idx << "\t["; dump.setf(std::ios::hex, std::ios::basefield); dump.setf(std::ios::showbase); dump << (it.first)->iMem()->Desc().gpuVirtAddr << ", " - << (it.first)->iMem()->Desc().gpuVirtAddr + (it.first)->iMem()->Desc().size; + << (it.first)->iMem()->Desc().gpuVirtAddr + (it.first)->iMem()->Desc().size; dump.setf(std::ios::dec); - dump << "] CbId:" << it.second << - ", Heap: " << (it.first)->iMem()->Desc().preferredHeap << "\n"; + dump << "] CbId:" << it.second << ", Heap: " << (it.first)->iMem()->Desc().preferredHeap + << "\n"; idx++; } @@ -414,8 +411,7 @@ void VirtualGPU::Queue::DumpMemoryReferences() const { for (size_t i = 0; i < signature.numParameters(); ++i) { const amd::KernelParameterDescriptor& desc = signature.at(i); // Find if the current argument is a memory object - if ((desc.type_ == T_POINTER) && - (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) { + if ((desc.type_ == T_POINTER) && (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) { dump << " " << desc.name_ << ": " << std::endl; } } @@ -519,7 +515,7 @@ void VirtualGPU::MemoryDependency::clear(bool all) { // note: The array growth shouldn't occur under the normal conditions, // but in a case when SVM path sends the amount of SVM ptrs over // the max size of kernel arguments - MemoryState* ptr = new MemoryState[maxMemObjectsInQueue_ << 1]; + MemoryState* ptr = new MemoryState[maxMemObjectsInQueue_ << 1]; if (nullptr == ptr) { numMemObjectsInQueue_ = 0; return; @@ -527,7 +523,7 @@ void VirtualGPU::MemoryDependency::clear(bool all) { maxMemObjectsInQueue_ <<= 1; memcpy(ptr, memObjectsInQueue_, sizeof(MemoryState) * numMemObjectsInQueue_); delete[] memObjectsInQueue_; - memObjectsInQueue_= ptr; + memObjectsInQueue_ = ptr; } // Adjust the number of active objects @@ -748,7 +744,6 @@ VirtualGPU::VirtualGPU(Device& device) maskGroups_(1), hsaQueueMem_(nullptr), cmdAllocator_(nullptr) { - // Note: Virtual GPU device creation must be a thread safe operation index_ = gpuDevice_.numOfVgpus_++; gpuDevice_.vgpus_.resize(gpuDevice_.numOfVgpus()); @@ -780,8 +775,8 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, createInfo.flags.autoMemoryReuse = false; createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap = Pal::GpuHeapGartUswc; createInfo.allocInfo[Pal::CommandDataAlloc].allocSize = - createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize = - VirtualGPU::Queue::MaxCommands * (320 + ((profiling) ? 96 : 0)); + createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize = + VirtualGPU::Queue::MaxCommands * (320 + ((profiling) ? 96 : 0)); createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocHeap = Pal::GpuHeapGartUswc; createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocSize = 64 * Ki; @@ -803,8 +798,9 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, const uint firstQueue = (dev().numComputeEngines() > 2) ? 1 : 0; uint idx = index() % (dev().numComputeEngines() - firstQueue); - uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs ? 0 : - (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2); + uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs + ? 0 + : (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2); uint max_cmd_buffers = dev().settings().maxCmdBuffers_; if (dev().numComputeEngines()) { @@ -815,9 +811,9 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, // hwRing_ should be set 0 if forced to have single scratch buffer hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx; - queues_[MainEngine] = Queue::Create(*this, Pal::QueueTypeCompute, idx + firstQueue, - cmdAllocator_, rtCUs, priority, - residency_limit, max_cmd_buffers); + queues_[MainEngine] = + Queue::Create(*this, Pal::QueueTypeCompute, idx + firstQueue, cmdAllocator_, rtCUs, + priority, residency_limit, max_cmd_buffers); if (nullptr == queues_[MainEngine]) { return false; } @@ -832,20 +828,19 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, sdma = 1; } - queues_[SdmaEngine] = - Queue::Create(*this, Pal::QueueTypeDma, sdma, cmdAllocator_, - amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal, - residency_limit, max_cmd_buffers); + queues_[SdmaEngine] = Queue::Create( + *this, Pal::QueueTypeDma, sdma, cmdAllocator_, amd::CommandQueue::RealTimeDisabled, + amd::CommandQueue::Priority::Normal, residency_limit, max_cmd_buffers); if (nullptr == queues_[SdmaEngine]) { return false; } } else { - queues_[SdmaEngine] = Queue::Create(*this, Pal::QueueTypeCompute, - idx, cmdAllocator_, rtCUs, amd::CommandQueue::Priority::Normal, - residency_limit, max_cmd_buffers); - if (nullptr == queues_[SdmaEngine]) { - return false; - } + queues_[SdmaEngine] = + Queue::Create(*this, Pal::QueueTypeCompute, idx, cmdAllocator_, rtCUs, + amd::CommandQueue::Priority::Normal, residency_limit, max_cmd_buffers); + if (nullptr == queues_[SdmaEngine]) { + return false; + } } } else { Unimplemented(); @@ -921,7 +916,8 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, bool dbg_vmid = false; state_.rgpCaptureEnabled_ = true; dev().rgpCaptureMgr()->RegisterTimedQueue(2 * index(), queue(MainEngine).iQueue_, &dbg_vmid); - dev().rgpCaptureMgr()->RegisterTimedQueue(2 * index() + 1, queue(SdmaEngine).iQueue_, &dbg_vmid); + dev().rgpCaptureMgr()->RegisterTimedQueue(2 * index() + 1, queue(SdmaEngine).iQueue_, + &dbg_vmid); } return true; @@ -1511,99 +1507,99 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd) { void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) { bool unmapMip = false; amd::Image* amdImage; -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); + { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); - pal::Memory* memory = dev().getGpuMemory(&vcmd.memory()); - amd::Memory* owner = memory->owner(); - const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.mapPtr()); - if (nullptr == writeMapInfo) { - LogError("Unmap without map call"); - return; - } - profilingBegin(vcmd, true); - - // Check if image is a mipmap and assign a saved view - amdImage = owner->asImage(); - if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) && - (writeMapInfo->baseMip_ != nullptr)) { - // Assign mip level view - amdImage = writeMapInfo->baseMip_; - // Clear unmap flags from the parent image - memory->clearUnmapInfo(vcmd.mapPtr()); - memory = dev().getGpuMemory(amdImage); - unmapMip = true; - writeMapInfo = memory->writeMapInfo(vcmd.mapPtr()); - } - - // We used host memory - if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) { - if (writeMapInfo->isUnmapWrite()) { - // Target is the backing store, so sync - owner->signalWrite(nullptr); - memory->syncCacheFromHost(*this); + pal::Memory* memory = dev().getGpuMemory(&vcmd.memory()); + amd::Memory* owner = memory->owner(); + const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.mapPtr()); + if (nullptr == writeMapInfo) { + LogError("Unmap without map call"); + return; } - // Remove memory from VA cache - dev().removeVACache(memory); - } - // data check was added for persistent memory that failed to get aperture - // and therefore are treated like a remote resource - else if (memory->isPersistentDirectMap() && (memory->data() != nullptr)) { - memory->unmap(this); - } else if (memory->mapMemory() != nullptr) { - if (writeMapInfo->isUnmapWrite()) { - amd::Coord3D srcOrigin(0, 0, 0); - // Target is a remote resource, so copy - assert(memory->mapMemory() != nullptr); - if (memory->desc().buffer_) { - if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_, - writeMapInfo->origin_, writeMapInfo->region_, - writeMapInfo->isEntire())) { - LogError("submitUnmapMemory() - copy failed"); - vcmd.setStatus(CL_OUT_OF_RESOURCES); - } - } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - Memory* memoryBuf = memory; - amd::Coord3D origin(writeMapInfo->origin_[0]); - amd::Coord3D size(writeMapInfo->region_[0]); - size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; + profilingBegin(vcmd, true); - amd::Memory* bufferFromImage = createBufferFromImage(vcmd.memory()); - if (nullptr == bufferFromImage) { - LogError("We should not fail buffer creation from image_buffer!"); + // Check if image is a mipmap and assign a saved view + amdImage = owner->asImage(); + if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) && + (writeMapInfo->baseMip_ != nullptr)) { + // Assign mip level view + amdImage = writeMapInfo->baseMip_; + // Clear unmap flags from the parent image + memory->clearUnmapInfo(vcmd.mapPtr()); + memory = dev().getGpuMemory(amdImage); + unmapMip = true; + writeMapInfo = memory->writeMapInfo(vcmd.mapPtr()); + } + + // We used host memory + if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) { + if (writeMapInfo->isUnmapWrite()) { + // Target is the backing store, so sync + owner->signalWrite(nullptr); + memory->syncCacheFromHost(*this); + } + // Remove memory from VA cache + dev().removeVACache(memory); + } + // data check was added for persistent memory that failed to get aperture + // and therefore are treated like a remote resource + else if (memory->isPersistentDirectMap() && (memory->data() != nullptr)) { + memory->unmap(this); + } else if (memory->mapMemory() != nullptr) { + if (writeMapInfo->isUnmapWrite()) { + amd::Coord3D srcOrigin(0, 0, 0); + // Target is a remote resource, so copy + assert(memory->mapMemory() != nullptr); + if (memory->desc().buffer_) { + if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_, + writeMapInfo->origin_, writeMapInfo->region_, + writeMapInfo->isEntire())) { + LogError("submitUnmapMemory() - copy failed"); + vcmd.setStatus(CL_OUT_OF_RESOURCES); + } + } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + Memory* memoryBuf = memory; + amd::Coord3D origin(writeMapInfo->origin_[0]); + amd::Coord3D size(writeMapInfo->region_[0]); + size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize(); + origin.c[0] *= elemSize; + size.c[0] *= elemSize; + + amd::Memory* bufferFromImage = createBufferFromImage(vcmd.memory()); + if (nullptr == bufferFromImage) { + LogError("We should not fail buffer creation from image_buffer!"); + } else { + memoryBuf = dev().getGpuMemory(bufferFromImage); + } + if (!blitMgr().copyBuffer(*memory->mapMemory(), *memoryBuf, srcOrigin, origin, size, + writeMapInfo->isEntire())) { + LogError("submitUnmapMemory() - copy failed"); + vcmd.setStatus(CL_OUT_OF_RESOURCES); + } + if (nullptr != bufferFromImage) { + bufferFromImage->release(); + } } else { - memoryBuf = dev().getGpuMemory(bufferFromImage); - } - if (!blitMgr().copyBuffer(*memory->mapMemory(), *memoryBuf, srcOrigin, origin, size, - writeMapInfo->isEntire())) { - LogError("submitUnmapMemory() - copy failed"); - vcmd.setStatus(CL_OUT_OF_RESOURCES); - } - if (nullptr != bufferFromImage) { - bufferFromImage->release(); - } - } else { - if (!blitMgr().copyBufferToImage(*memory->mapMemory(), *memory, srcOrigin, - writeMapInfo->origin_, writeMapInfo->region_, - writeMapInfo->isEntire())) { - LogError("submitUnmapMemory() - copy failed"); - vcmd.setStatus(CL_OUT_OF_RESOURCES); + if (!blitMgr().copyBufferToImage(*memory->mapMemory(), *memory, srcOrigin, + writeMapInfo->origin_, writeMapInfo->region_, + writeMapInfo->isEntire())) { + LogError("submitUnmapMemory() - copy failed"); + vcmd.setStatus(CL_OUT_OF_RESOURCES); + } } } + } else { + LogError("Unhandled unmap!"); + vcmd.setStatus(CL_INVALID_VALUE); } - } else { - LogError("Unhandled unmap!"); - vcmd.setStatus(CL_INVALID_VALUE); + + // Clear unmap flags + memory->clearUnmapInfo(vcmd.mapPtr()); + + profilingEnd(vcmd); } - - // Clear unmap flags - memory->clearUnmapInfo(vcmd.mapPtr()); - - profilingEnd(vcmd); -} // Release a view for a mipmap map if (unmapMip) { // Memory release should be outside of the execution lock, @@ -1700,9 +1696,9 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) { profilingBegin(cmd); Memory* srcDevMem = static_cast( - cmd.source().getDeviceMemory(*cmd.source().getContext().devices()[0])); + cmd.source().getDeviceMemory(*cmd.source().getContext().devices()[0])); Memory* dstDevMem = static_cast( - cmd.destination().getDeviceMemory(*cmd.destination().getContext().devices()[0])); + cmd.destination().getDeviceMemory(*cmd.destination().getContext().devices()[0])); bool p2pAllowed = false; #if 0 @@ -1728,16 +1724,15 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) { amd::Coord3D dstOrigin(cmd.dstOrigin()[0]); if (p2pAllowed) { - result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, - size, cmd.isEntireMemory()); - } - else { + result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size, + cmd.isEntireMemory()); + } else { amd::ScopedLock lock(dev().P2PStageOps()); Memory* dstStgMem = static_cast( - dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0])); + dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0])); Memory* srcStgMem = static_cast( - dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0])); - + dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0])); + size_t copy_size = Device::kP2PStagingSize; size_t left_size = size[0]; amd::Coord3D stageOffset(0); @@ -1750,11 +1745,11 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) { amd::Coord3D cpSize(copy_size); // Perform 2 step transfer with staging buffer - result &= dev().xferMgr().copyBuffer( - *srcDevMem, *dstStgMem, srcOrigin, stageOffset, cpSize); + result &= + dev().xferMgr().copyBuffer(*srcDevMem, *dstStgMem, srcOrigin, stageOffset, cpSize); srcOrigin.c[0] += copy_size; - result &= dstDevMem->dev().xferMgr().copyBuffer( - *srcStgMem, *dstDevMem, stageOffset, dstOrigin, cpSize); + result &= dstDevMem->dev().xferMgr().copyBuffer(*srcStgMem, *dstDevMem, stageOffset, + dstOrigin, cpSize); dstOrigin.c[0] += copy_size; } while (left_size > 0); } @@ -1940,10 +1935,8 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) { } // ================================================================================================ -void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQueue) -{ - AmdAqlWrap* wraps = - (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]); +void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQueue) { + AmdAqlWrap* wraps = (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]); uint p = 0; for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) { if (wraps[i].state != 0) { @@ -1963,11 +1956,9 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ print << "\twait_list: " << wraps[i].wait_list << "\n"; print << "\twait_num: " << wraps[i].wait_num << "\n"; uint offsEvents = wraps[i].wait_list - gpuDefQueue->virtualQueue_->vmAddress(); - size_t* events = - reinterpret_cast(gpuDefQueue->virtualQueue_->data() + offsEvents); + size_t* events = reinterpret_cast(gpuDefQueue->virtualQueue_->data() + offsEvents); for (j = 0; j < wraps[i].wait_num; ++j) { - uint offs = - static_cast(events[j]) - gpuDefQueue->virtualQueue_->vmAddress(); + uint offs = static_cast(events[j]) - gpuDefQueue->virtualQueue_->vmAddress(); AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs); print << "Wait Event#: " << j << "\n"; print << "\tState: " << eventD->state << "; Counter: " << eventD->counter << "\n"; @@ -1980,8 +1971,8 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ print << wraps[i].aql.grid_size_z << "]\n"; HSAILKernel* child = nullptr; - for (auto it = hsaKernel.prog().kernels().begin(); - it != hsaKernel.prog().kernels().end(); ++it) { + for (auto it = hsaKernel.prog().kernels().begin(); it != hsaKernel.prog().kernels().end(); + ++it) { if (wraps[i].aql.kernel_object == static_cast(it->second)->gpuAqlCode()) { child = static_cast(it->second); } @@ -1995,7 +1986,7 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ uint offsArg = kernarg_address - gpuDefQueue->virtualQueue_->vmAddress(); address argum = gpuDefQueue->virtualQueue_->data() + offsArg; print << "Kernel: " << child->name() << "\n"; - const amd::KernelSignature& signature = child->signature(); + const amd::KernelSignature& signature = child->signature(); // Check if runtime has to setup hidden arguments for (const auto it : signature.parameters()) { @@ -2033,7 +2024,7 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ continue; } print << "\t" << it.name_ << ": "; - for (int s = it.size_- 1; s >= 0; --s) { + for (int s = it.size_ - 1; s >= 0; --s) { print.width(2); print.fill('0'); print << static_cast(argum[s]); @@ -2047,26 +2038,20 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ } // ================================================================================================ -bool VirtualGPU::PreDeviceEnqueue( - const amd::Kernel& kernel, - const HSAILKernel& hsaKernel, - VirtualGPU** gpuDefQueue, - uint64_t* vmDefQueue) -{ +bool VirtualGPU::PreDeviceEnqueue(const amd::Kernel& kernel, const HSAILKernel& hsaKernel, + VirtualGPU** gpuDefQueue, uint64_t* vmDefQueue) { amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev()); if (nullptr == defQueue) { LogError("Default device queue wasn't allocated"); return false; - } - else { + } else { if (dev().settings().useDeviceQueue_) { *gpuDefQueue = static_cast(defQueue->vDev()); if ((*gpuDefQueue)->hwRing() == hwRing()) { LogError("Can't submit the child kernels to the same HW ring as the host queue!"); return false; } - } - else { + } else { createVirtualQueue(defQueue->size()); *gpuDefQueue = this; } @@ -2086,15 +2071,10 @@ bool VirtualGPU::PreDeviceEnqueue( } // ================================================================================================ -void VirtualGPU::PostDeviceEnqueue( - const amd::Kernel& kernel, - const HSAILKernel& hsaKernel, - VirtualGPU* gpuDefQueue, - uint64_t vmDefQueue, - uint64_t vmParentWrap, - GpuEvent* gpuEvent) -{ - uint32_t id = gpuEvent->id_; +void VirtualGPU::PostDeviceEnqueue(const amd::Kernel& kernel, const HSAILKernel& hsaKernel, + VirtualGPU* gpuDefQueue, uint64_t vmDefQueue, + uint64_t vmParentWrap, GpuEvent* gpuEvent) { + uint32_t id = gpuEvent->id_; amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev()); // Make sure exculsive access to the device queue @@ -2110,16 +2090,16 @@ void VirtualGPU::PostDeviceEnqueue( // Add the termination handshake to the host queue eventBegin(MainEngine); iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, - vmParentWrap + offsetof(AmdAqlWrap, child_counter), 0, - dev().settings().useDeviceQueue_); + vmParentWrap + offsetof(AmdAqlWrap, child_counter), 0, + dev().settings().useDeviceQueue_); eventEnd(MainEngine, *gpuEvent); } // Get the global loop start before the scheduler Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart(); static_cast(gpuDefQueue->blitMgr()) - .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0, - gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); + .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0, + gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); const static bool FlushL2 = true; gpuDefQueue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue, FlushL2); @@ -2127,8 +2107,7 @@ void VirtualGPU::PostDeviceEnqueue( //! @note DMA flush must not occur between patch and the scheduler Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart(); // Program parameters for the scheduler - SchedulerParam* param = reinterpret_cast( - gpuDefQueue->schedParams_->data()); + SchedulerParam* param = reinterpret_cast(gpuDefQueue->schedParams_->data()); param->signal = 1; // Scale clock to 1024 to avoid 64 bit div in the scheduler param->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_; @@ -2147,8 +2126,7 @@ void VirtualGPU::PostDeviceEnqueue( param->numMaxWaves = 32 * dev().info().maxComputeUnits_; param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_; addVmMemory(scratchBuf); - } - else { + } else { param->numMaxWaves = 0; param->scratchSize = 0; param->scratch = 0; @@ -2162,8 +2140,8 @@ void VirtualGPU::PostDeviceEnqueue( Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress(); gpuDefQueue->eventBegin(MainEngine); gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd( - signalAddr, loopStart, - gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); + signalAddr, loopStart, + gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); // Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call. // Thus TS command for profiling has to follow in the next CB. constexpr bool ForceSubmitFirst = true; @@ -2173,10 +2151,10 @@ void VirtualGPU::PostDeviceEnqueue( // Add the termination handshake to the host queue eventBegin(MainEngine); iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, - vmParentWrap + offsetof(AmdAqlWrap, child_counter), - signalAddr, dev().settings().useDeviceQueue_); + vmParentWrap + offsetof(AmdAqlWrap, child_counter), signalAddr, + dev().settings().useDeviceQueue_); if (id != gpuEvent->id_) { - LogError("Something is wrong. ID mismatch!\n"); + LogError("Something is wrong. ID mismatch!\n"); } eventEnd(MainEngine, *gpuEvent); } @@ -2193,7 +2171,8 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { profilingBegin(vcmd); // Submit kernel to HW - if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(), vcmd.sharedMemBytes())) { + if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(), + vcmd.sharedMemBytes())) { vcmd.setStatus(CL_INVALID_OPERATION); } @@ -2203,10 +2182,9 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { // ================================================================================================ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, const_address parameters, bool nativeMem, - amd::Event* enqueueEvent, uint32_t sharedMemBytes) -{ - size_t newOffset[3] = { 0, 0, 0 }; - size_t newGlobalSize[3] = { 0, 0, 0 }; + amd::Event* enqueueEvent, uint32_t sharedMemBytes) { + size_t newOffset[3] = {0, 0, 0}; + size_t newGlobalSize[3] = {0, 0, 0}; int dim = -1; int iteration = 1; @@ -2221,17 +2199,17 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const // If RGP capturing is enabled, then start SQTT trace if (rgpCaptureEna()) { - size_t newLocalSize[3] = { 1, 1, 1 }; + size_t newLocalSize[3] = {1, 1, 1}; for (uint i = 0; i < sizes.dimensions(); i++) { if (sizes.local()[i] != 0) { newLocalSize[i] = sizes.local()[i]; } } - dev().rgpCaptureMgr()->PreDispatch(this, hsaKernel, - // Report global size in workgroups, since that's the RGP trace semantics - newGlobalSize[0] / newLocalSize[0], - newGlobalSize[1] / newLocalSize[1], - newGlobalSize[2] / newLocalSize[2]); + dev().rgpCaptureMgr()->PreDispatch( + this, hsaKernel, + // Report global size in workgroups, since that's the RGP trace semantics + newGlobalSize[0] / newLocalSize[0], newGlobalSize[1] / newLocalSize[1], + newGlobalSize[2] / newLocalSize[2]); } bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? true : false; @@ -2257,8 +2235,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const // Check memory dependency and SVM objects if (!processMemObjectsHSA(kernel, parameters, nativeMem, ldsSize)) { - LogError("Wrong memory objects!"); - return false; + LogError("Wrong memory objects!"); + return false; } bool needFlush = false; // Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd @@ -2305,15 +2283,14 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const // an extra loop is required. const amd::KernelParameters& kernelParams = kernel.parameters(); amd::Memory* const* memories = - reinterpret_cast(parameters + kernelParams.memoryObjOffset()); + reinterpret_cast(parameters + kernelParams.memoryObjOffset()); for (uint32_t i = 0; i < kernel.signature().numMemories(); ++i) { if (nativeMem) { Memory* gpuMem = reinterpret_cast(memories)[i]; if (gpuMem != nullptr) { gpuMem->setBusy(*this, gpuEvent); } - } - else { + } else { amd::Memory* mem = memories[i]; if (mem != nullptr) { dev().getGpuMemory(mem)->setBusy(*this, gpuEvent); @@ -2325,7 +2302,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const uint64_t vmParentWrap = 0; // Program the kernel arguments for the GPU execution hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments( - *this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap); + *this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap); if (nullptr == aqlPkt) { LogError("Couldn't load kernel arguments"); return false; @@ -2348,8 +2325,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const } dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode(); dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress(); - dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? - enqueueEvent->profilingInfo().waves_ : 0; + dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0; dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false; dispatchParam.workitemPrivateSegmentSize = hsaKernel.spillSegSize(); dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize(); @@ -2660,7 +2636,6 @@ void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) { eventEnd(MainEngine, gpuEvent); } else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) { - EngineType activeEngineID = engineID_; engineID_ = static_cast(pGpuMemory->getGpuEvent(*this)->engineId_); @@ -2669,8 +2644,8 @@ void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) { addBarrier(RgpSqqtBarrierReason::SignalSubmit, FlushL2); // Workarounds: We had systems where an extra delay was necessary. { - // Flush CB associated with the DGMA buffer - isDone(pGpuMemory->getGpuEvent(*this)); + // Flush CB associated with the DGMA buffer + isDone(pGpuMemory->getGpuEvent(*this)); } eventBegin(engineID_); @@ -2711,10 +2686,11 @@ void VirtualGPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd pGpuMems[i] = pGpuMemory->iMem(); } - dev().iDev()->AddGpuMemoryReferences(numObjects, pGpuMemRef, queues_[MainEngine]->iQueue_, Pal::GpuMemoryRefCantTrim); + dev().iDev()->AddGpuMemoryReferences(numObjects, pGpuMemRef, queues_[MainEngine]->iQueue_, + Pal::GpuMemoryRefCantTrim); dev().iDev()->InitBusAddressableGpuMemory(queues_[MainEngine]->iQueue_, numObjects, pGpuMems); if (numObjects != 0) { - dev().iDev()->RemoveGpuMemoryReferences(numObjects, &pGpuMems[0], queues_[MainEngine]->iQueue_); + dev().iDev()->RemoveGpuMemoryReferences(numObjects, &pGpuMems[0], queues_[MainEngine]->iQueue_); } for (uint i = 0; i < numObjects; i++) { @@ -3104,8 +3080,8 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p break; } // get svm non arugment information - void* const* svmPtrArray = reinterpret_cast( - params + kernelParams.getExecInfoOffset()); + void* const* svmPtrArray = + reinterpret_cast(params + kernelParams.getExecInfoOffset()); for (size_t i = 0; i < count; i++) { amd::Memory* memory = amd::MemObjMap::FindMemObj(svmPtrArray[i]); if (nullptr == memory) { @@ -3149,8 +3125,7 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p bool srdResource = false; amd::Memory* const* memories = reinterpret_cast(params + kernelParams.memoryObjOffset()); - const HSAILKernel& hsaKernel = - static_cast(*(kernel.getDeviceKernel(dev()))); + const HSAILKernel& hsaKernel = static_cast(*(kernel.getDeviceKernel(dev()))); const amd::KernelSignature& signature = kernel.signature(); ldsAddress = hsaKernel.ldsSize(); @@ -3225,10 +3200,10 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p addVmMemory(gpuMem); const void* globalAddress = *reinterpret_cast(params + desc.offset_); LogPrintfInfo("!\targ%d: %s %s = ptr:%p obj:[%p-%p] threadId : %zx\n", index, - desc.typeName_.c_str(), desc.name_.c_str(), - globalAddress, reinterpret_cast(gpuMem->vmAddress()), - reinterpret_cast(gpuMem->vmAddress() + gpuMem->size()), - std::this_thread::get_id()); + desc.typeName_.c_str(), desc.name_.c_str(), globalAddress, + reinterpret_cast(gpuMem->vmAddress()), + reinterpret_cast(gpuMem->vmAddress() + gpuMem->size()), + std::this_thread::get_id()); //! Check if compiler expects read/write. //! Note: SVM with subbuffers has an issue with tracking. @@ -3255,30 +3230,28 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p } if (gpuMem->desc().isDoppTexture_) { addDoppRef(gpuMem, kernel.parameters().getExecNewVcop(), - kernel.parameters().getExecPfpaVcop()); + kernel.parameters().getExecPfpaVcop()); } } } } - } - else if (desc.type_ == T_VOID) { + } else if (desc.type_ == T_VOID) { if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) { // Copy the current structure into CB1 - size_t gpuPtr = static_cast(cb(1)->UploadDataToHw( - params + desc.offset_, desc.size_)); + size_t gpuPtr = + static_cast(cb(1)->UploadDataToHw(params + desc.offset_, desc.size_)); // Then use a pointer in aqlArgBuffer to CB1 const auto it = hsaKernel.patch().find(desc.offset_); // Patch the GPU VA address in the original arguments WriteAqlArgAt(const_cast
(params), &gpuPtr, sizeof(size_t), it->second); addVmMemory(cb(1)->ActiveMemory()); } - } - else if (desc.type_ == T_SAMPLER) { + } else if (desc.type_ == T_SAMPLER) { srdResource = true; } else if (desc.type_ == T_QUEUE) { uint32_t index = desc.info_.arrayIndex_; - const amd::DeviceQueue* queue = reinterpret_cast( - params + kernelParams.queueObjOffset())[index]; + const amd::DeviceQueue* queue = + reinterpret_cast(params + kernelParams.queueObjOffset())[index]; VirtualGPU* gpuQueue = static_cast(queue->vDev()); uint64_t vmQueue; if (dev().settings().useDeviceQueue_) { diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp index 13c83b3796..9e557e1f03 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp @@ -51,17 +51,18 @@ class VirtualGPU : public device::VirtualDevice { Queue(const Queue&) = delete; Queue& operator=(const Queue&) = delete; - static Queue* Create(const VirtualGPU& gpu, //!< OCL virtual GPU object - Pal::QueueType queueType, //!< PAL queue type - uint engineIdx, //!< Select particular engine index - Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator - uint rtCU, //!< The number of reserved CUs - amd::CommandQueue::Priority priority, //!< Queue priority - uint64_t residency_limit, //!< Enables residency limit - uint max_command_buffers //!< Number of allocated command buffers - ); + static Queue* Create(const VirtualGPU& gpu, //!< OCL virtual GPU object + Pal::QueueType queueType, //!< PAL queue type + uint engineIdx, //!< Select particular engine index + Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator + uint rtCU, //!< The number of reserved CUs + amd::CommandQueue::Priority priority, //!< Queue priority + uint64_t residency_limit, //!< Enables residency limit + uint max_command_buffers //!< Number of allocated command buffers + ); - Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, uint max_command_buffers) + Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, + uint max_command_buffers) : iQueue_(nullptr), iCmdBuffs_(max_command_buffers, nullptr), iCmdFences_(max_command_buffers, nullptr), @@ -75,8 +76,7 @@ class VirtualGPU : public device::VirtualDevice { vlAlloc_(64 * Ki), residency_size_(0), residency_limit_(residency_limit), - max_command_buffers_(max_command_buffers) - { + max_command_buffers_(max_command_buffers) { vlAlloc_.Init(); } @@ -100,8 +100,7 @@ class VirtualGPU : public device::VirtualDevice { Pal::Result UpdateAppPowerProfile(); // ibReuse forces event wait without polling, to make sure event occured - template - bool waifForFence(uint cbId) const { + template bool waifForFence(uint cbId) const { Pal::Result result = Pal::Result::Success; uint64_t start; uint64_t end; @@ -138,8 +137,7 @@ class VirtualGPU : public device::VirtualDevice { //! Flushes the current command buffer to HW //! Returns ID associated with the submission - template - uint submit(bool forceFlush); + template uint submit(bool forceFlush); bool flush(); @@ -151,28 +149,28 @@ class VirtualGPU : public device::VirtualDevice { uint cmdBufId() const { return cmdBufIdCurrent_; } - Pal::IQueue* iQueue_; //!< PAL queue object - std::vector iCmdBuffs_; //!< PAL command buffers - std::vector iCmdFences_; //!< PAL fences, associated with CMD - const amd::Kernel* last_kernel_; //!< Last submitted kernel + Pal::IQueue* iQueue_; //!< PAL queue object + std::vector iCmdBuffs_; //!< PAL command buffers + std::vector iCmdFences_; //!< PAL fences, associated with CMD + const amd::Kernel* last_kernel_; //!< Last submitted kernel - private: + private: void DumpMemoryReferences() const; - const VirtualGPU& gpu_; //!< OCL virtual GPU object - Pal::IDevice* iDev_; //!< PAL device - uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions - uint cmdBufIdCurrent_; //!< Current global command buffer ID - uint cmbBufIdRetired_; //!< The last retired command buffer ID - uint cmdCnt_; //!< Counter of commands + const VirtualGPU& gpu_; //!< OCL virtual GPU object + Pal::IDevice* iDev_; //!< PAL device + uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions + uint cmdBufIdCurrent_; //!< Current global command buffer ID + uint cmbBufIdRetired_; //!< The last retired command buffer ID + uint cmdCnt_; //!< Counter of commands std::unordered_map memReferences_; - Util::VirtualLinearAllocator vlAlloc_; - std::vector palMemRefs_; - std::vector palMems_; - std::vector palDoppRefs_; - std::set sdiReferences_; - std::vector palSdiRefs_; - uint64_t residency_size_; //!< Resource residency size - uint64_t residency_limit_; //!< Enables residency limit + Util::VirtualLinearAllocator vlAlloc_; + std::vector palMemRefs_; + std::vector palMems_; + std::vector palDoppRefs_; + std::set sdiReferences_; + std::vector palSdiRefs_; + uint64_t residency_size_; //!< Resource residency size + uint64_t residency_limit_; //!< Enables residency limit uint max_command_buffers_; }; @@ -185,14 +183,14 @@ class VirtualGPU : public device::VirtualDevice { CommandBatch(amd::Command* head, //!< Command batch head const GpuEvent* events, //!< HW events on all engines TimeStamp* lastTS //!< Last TS in command batch - ) { + ) { init(head, events, lastTS); } void init(amd::Command* head, //!< Command batch head const GpuEvent* events, //!< HW events on all engines TimeStamp* lastTS //!< Last TS in command batch - ) { + ) { head_ = head; lastTS_ = lastTS; memcpy(&events_, events, AllEngines * sizeof(GpuEvent)); @@ -202,11 +200,11 @@ class VirtualGPU : public device::VirtualDevice { //! The virtual GPU states union State { struct { - uint profiling_ : 1; //!< Profiling is enabled - uint forceWait_ : 1; //!< Forces wait in flush() - uint profileEnabled_ : 1; //!< Profiling is enabled for WaveLimiter - uint perfCounterEnabled_ : 1; //!< PerfCounter is enabled - uint rgpCaptureEnabled_ : 1; //!< RGP capture is enabled in the runtime + uint profiling_ : 1; //!< Profiling is enabled + uint forceWait_ : 1; //!< Forces wait in flush() + uint profileEnabled_ : 1; //!< Profiling is enabled for WaveLimiter + uint perfCounterEnabled_ : 1; //!< PerfCounter is enabled + uint rgpCaptureEnabled_ : 1; //!< RGP capture is enabled in the runtime }; uint value_; State() : value_(0) {} @@ -259,13 +257,13 @@ class VirtualGPU : public device::VirtualDevice { void findSplitSize(const Device& dev, //!< GPU device object uint64_t threads, //!< Total number of execution threads uint instructions //!< Number of ALU instructions - ); + ); // Returns TRUE if DMA command buffer is ready for a flush bool isCbReady(VirtualGPU& gpu, //!< Virtual GPU object uint64_t threads, //!< Total number of execution threads uint instructions //!< Number of ALU instructions - ); + ); // Returns dispatch split size uint dispatchSplitSize() const { return dispatchSplitSize_; } @@ -301,7 +299,7 @@ class VirtualGPU : public device::VirtualDevice { bool nativeMem = true, //!< Native memory objects amd::Event* enqueueEvent = nullptr, //!< Event provided in the enqueue kernel command uint32_t sharedMemBytes = 0 //!< Shared memory size - ); + ); void submitNativeFn(amd::NativeFnCommand& vcmd); void submitFillMemory(amd::FillMemoryCommand& vcmd); void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd); @@ -331,20 +329,20 @@ class VirtualGPU : public device::VirtualDevice { //! Set the last known GPU event void setGpuEvent(GpuEvent gpuEvent, //!< GPU event for tracking bool flush = false //!< TRUE if flush is required - ); + ); //! Flush DMA buffer on the specified engine void flushDMA(uint engineID //!< Engine ID for DMA flush - ); + ); //! Wait for all engines on this Virtual GPU //! Returns TRUE if CPU didn't wait for GPU bool waitAllEngines(CommandBatch* cb = nullptr //!< Command batch - ); + ); //! Waits for the latest GPU event with a lock to prevent multiple entries void waitEventLock(CommandBatch* cb //!< Command batch - ); + ); //! Returns a resource associated with the constant buffer const ConstantBuffer* cb(uint idx) const { return constBufs_[idx]; } @@ -355,7 +353,7 @@ class VirtualGPU : public device::VirtualDevice { //! Start the command profiling void profilingBegin(amd::Command& command, //!< Command queue object bool drmProfiling = false //!< Measure DRM time - ); + ); //! End the command profiling void profilingEnd(amd::Command& command); @@ -363,11 +361,11 @@ class VirtualGPU : public device::VirtualDevice { //! Collect the profiling results bool profilingCollectResults(CommandBatch* cb, //!< Command batch const amd::Event* waitingEvent //!< Waiting event - ); + ); //! Adds a memory handle into the GSL memory array for Virtual Heap inline void addVmMemory(const Memory* memory //!< GPU memory object - ); + ); //! Adds the last submitted kernel to the queue for tracking a possible hang inline void AddKernel(const amd::Kernel& kernel //!< AMD kernel object @@ -377,7 +375,7 @@ class VirtualGPU : public device::VirtualDevice { void addDoppRef(const Memory* memory, //!< GPU memory object bool lastDoopCmd, //!< is the last submission for the pre-present primary bool pfpaDoppCmd //!< is a submission for the pre-present primary - ); + ); //! Return xfer buffer for staging operations XferBuffer& xferWrite() { return writeBuffer_; } @@ -429,7 +427,7 @@ class VirtualGPU : public device::VirtualDevice { //! Returns TRUE if virtual queue was successfully allocatted bool createVirtualQueue(uint deviceQueueSize //!< Device queue size - ); + ); EngineType engineID_; //!< Engine ID for this VirtualGPU @@ -447,7 +445,8 @@ class VirtualGPU : public device::VirtualDevice { //! Returns queue, associated with VirtualGPU Queue& queue(EngineType id) const { return *queues_[id]; } - void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::Unknown, bool flushL2 = false) const { + void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::Unknown, + bool flushL2 = false) const { Pal::BarrierInfo barrier = {}; barrier.pipePointWaitCount = 1; Pal::HwPipePoint point = Pal::HwPipePostCs; @@ -508,7 +507,7 @@ class VirtualGPU : public device::VirtualDevice { //! Returns TRUE if SDMA requires overlap synchronizaiton bool validateSdmaOverlap(const Resource& src, //!< Source resource for SDMA transfer const Resource& dst //!< Destination resource for SDMA transfer - ); + ); //! Checks if RGP capture is enabled bool rgpCaptureEna() const { return state_.rgpCaptureEnabled_; } @@ -519,7 +518,7 @@ class VirtualGPU : public device::VirtualDevice { //! Creates buffer object from image amd::Memory* createBufferFromImage( amd::Memory& amdImage //! The parent image object(untiled images only) - ); + ); private: struct MemoryRange { @@ -537,14 +536,14 @@ class VirtualGPU : public device::VirtualDevice { //! Awaits a command batch with a waiting event bool awaitCompletion(CommandBatch* cb, //!< Command batch for to wait const amd::Event* waitingEvent = nullptr //!< A waiting event - ); + ); //! Detects memory dependency for HSAIL kernels and flushes caches bool processMemObjectsHSA(const amd::Kernel& kernel, //!< AMD kernel object for execution const_address params, //!< Pointer to the param's store bool nativeMem, //!< Native memory objects - size_t& ldsAddess //!< Returns LDS size, used in the kernel - ); + size_t& ldsAddess //!< Returns LDS size, used in the kernel + ); //! Common function for fill memory used by both svm Fill and non-svm fill bool fillMemory(cl_command_type type, //!< the command type @@ -553,7 +552,7 @@ class VirtualGPU : public device::VirtualDevice { size_t patternSize, //!< pattern size const amd::Coord3D& origin, //!< memory origin const amd::Coord3D& size //!< memory size for filling - ); + ); bool copyMemory(cl_command_type type, //!< the command type amd::Memory& srcMem, //!< source memory object @@ -564,35 +563,36 @@ class VirtualGPU : public device::VirtualDevice { const amd::Coord3D& size, //!< copy size const amd::BufferRect& srcRect, //!< region of source for copy const amd::BufferRect& dstRect //!< region of destination for copy - ); + ); void buildKernelInfo(const HSAILKernel& hsaKernel, //!< hsa kernel hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch HwDbgKernelInfo& kernelInfo, //!< kernel info for the dispatch amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command - ); + ); void assignDebugTrapHandler(const DebugToolInfo& dbgSetting, //!< debug settings HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch - ); + ); void PrintChildren(const HSAILKernel& hsaKernel, //!< The parent HSAIL kernel VirtualGPU* gpuDefQueue //!< Device queue for children execution - ); + ); - bool PreDeviceEnqueue(const amd::Kernel& kernel, //!< Parent amd kernel object - const HSAILKernel& hsaKernel, //!< Parent HSAIL object - VirtualGPU** gpuDefQueue, //!< [Return] GPU default queue - uint64_t* vmDefQueue //!< [Return] VM handle to the virtual queue - ); + bool PreDeviceEnqueue(const amd::Kernel& kernel, //!< Parent amd kernel object + const HSAILKernel& hsaKernel, //!< Parent HSAIL object + VirtualGPU** gpuDefQueue, //!< [Return] GPU default queue + uint64_t* vmDefQueue //!< [Return] VM handle to the virtual queue + ); - void PostDeviceEnqueue(const amd::Kernel& kernel, //!< Parent amd kernel object - const HSAILKernel& hsaKernel, //!< Parent HSAIL object - VirtualGPU* gpuDefQueue, //!< GPU default queue - uint64_t vmDefQueue, //!< VM handle to the virtual queue - uint64_t vmParentWrap, //!< VM handle to the wrapped AQL packet location - GpuEvent* gpuEvent //!< [Return] GPU event associated with the device enqueue - ); + void PostDeviceEnqueue( + const amd::Kernel& kernel, //!< Parent amd kernel object + const HSAILKernel& hsaKernel, //!< Parent HSAIL object + VirtualGPU* gpuDefQueue, //!< GPU default queue + uint64_t vmDefQueue, //!< VM handle to the virtual queue + uint64_t vmParentWrap, //!< VM handle to the wrapped AQL packet location + GpuEvent* gpuEvent //!< [Return] GPU event associated with the device enqueue + ); Device& gpuDevice_; //!< physical GPU device amd::Monitor execution_; //!< Lock to serialise access to all device objects @@ -605,11 +605,11 @@ class VirtualGPU : public device::VirtualDevice { DmaFlushMgmt dmaFlushMgmt_; //!< DMA flush management - std::vector pinnedMems_; //!< Pinned memory list + std::vector pinnedMems_; //!< Pinned memory list - ManagedBuffer managedBuffer_; //!< Managed write buffer - constbufs_t constBufs_; //!< constant buffers - XferBuffer writeBuffer_; //!< Transfer/staging buffer for uploads + ManagedBuffer managedBuffer_; //!< Managed write buffer + constbufs_t constBufs_; //!< constant buffers + XferBuffer writeBuffer_; //!< Transfer/staging buffer for uploads typedef std::queue CommandBatchQueue; CommandBatchQueue cbQueue_; //!< Queue of command batches @@ -617,12 +617,12 @@ class VirtualGPU : public device::VirtualDevice { uint hwRing_; //!< HW ring used on this virtual device - State state_; //!< virtual GPU current state + State state_; //!< virtual GPU current state GpuEvent events_[AllEngines]; //!< Last known GPU events - uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps - TimeStamp* lastTS_; //!< Last timestamp executed on Virtual GPU - TimeStamp* profileTs_; //!< current profiling timestamp for command + uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps + TimeStamp* lastTS_; //!< Last timestamp executed on Virtual GPU + TimeStamp* profileTs_; //!< current profiling timestamp for command AmdVQueueHeader* vqHeader_; //!< Sysmem copy for virtual queue header Memory* virtualQueue_; //!< Virtual device queue @@ -645,8 +645,7 @@ inline void VirtualGPU::AddKernel(const amd::Kernel& kernel) const { queues_[MainEngine]->last_kernel_ = &kernel; } -template -uint VirtualGPU::Queue::submit(bool forceFlush) { +template uint VirtualGPU::Queue::submit(bool forceFlush) { cmdCnt_++; uint id = cmdBufIdCurrent_; bool flushCmd = ((cmdCnt_ > MaxCommands) || forceFlush) && !avoidBarrierSubmit; @@ -659,32 +658,30 @@ uint VirtualGPU::Queue::submit(bool forceFlush) { } template -inline void WriteAqlArgAt( - unsigned char* dst, //!< The write pointer to the buffer - const T* src, //!< The source pointer - uint size, //!< The size in bytes to copy - size_t offset //!< The alignment to follow while writing to the buffer +inline void WriteAqlArgAt(unsigned char* dst, //!< The write pointer to the buffer + const T* src, //!< The source pointer + uint size, //!< The size in bytes to copy + size_t offset //!< The alignment to follow while writing to the buffer ) { memcpy(dst + offset, src, size); } template <> -inline void WriteAqlArgAt( - unsigned char* dst, //!< The write pointer to the buffer - const uint32_t* src, //!< The source pointer - uint size, //!< The size in bytes to copy - size_t offset //!< The alignment to follow while writing to the buffer +inline void WriteAqlArgAt(unsigned char* dst, //!< The write pointer to the buffer + const uint32_t* src, //!< The source pointer + uint size, //!< The size in bytes to copy + size_t offset //!< The alignment to follow while writing to the buffer ) { *(reinterpret_cast(dst + offset)) = *src; } template <> -inline void WriteAqlArgAt( - unsigned char* dst, //!< The write pointer to the buffer - const uint64_t* src, //!< The source pointer - uint size, //!< The size in bytes to copy - size_t offset //!< The alignment to follow while writing to the buffer +inline void WriteAqlArgAt(unsigned char* dst, //!< The write pointer to the buffer + const uint64_t* src, //!< The source pointer + uint size, //!< The size in bytes to copy + size_t offset //!< The alignment to follow while writing to the buffer ) { *(reinterpret_cast(dst + offset)) = *src; } -/*@}*/} // namespace pal +/*@}*/ // namespace pal +} // namespace pal