From 5ea54a902aa22e513dc6697352ef3761ee5b9b2d Mon Sep 17 00:00:00 2001
From: foreman
Date: Wed, 8 May 2019 19:22:02 -0400
Subject: [PATCH] P4 to Git Change 1780358 by gandryey@gera-win10 on 2019/05/08
18:46:22
SWDEV-79445 - OCL generic changes and code clean-up
- Run Google autoformat over the PAL backend. This will allow autoformat to be enabled in VS for future changes.
- No functional changes
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palappprofile.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palappprofile.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#29 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.hpp#8 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.cpp#12 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.hpp#10 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcounters.cpp#20 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcounters.hpp#10 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugger.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugmanager.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldefs.hpp#52 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#133 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#37 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d10.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d11.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d9.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevicegl.cpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#78 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#28 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.hpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprintf.hpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#93 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#38 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#73 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#79 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#22 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltimestamp.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#132 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#60 edit
[ROCm/clr commit: 699a12bfa29aaecd187031e974b015c5176a356b]
---
.../runtime/device/pal/palappprofile.cpp | 5 +-
.../runtime/device/pal/palappprofile.hpp | 2 +-
.../clr/rocclr/runtime/device/pal/palblit.cpp | 106 +-
.../clr/rocclr/runtime/device/pal/palblit.hpp | 9 +-
.../rocclr/runtime/device/pal/palconstbuf.cpp | 41 +-
.../rocclr/runtime/device/pal/palconstbuf.hpp | 81 +-
.../rocclr/runtime/device/pal/palcounters.cpp | 12 +-
.../rocclr/runtime/device/pal/palcounters.hpp | 3 +-
.../rocclr/runtime/device/pal/paldebugger.hpp | 8 +-
.../runtime/device/pal/paldebugmanager.cpp | 2 +-
.../clr/rocclr/runtime/device/pal/paldefs.hpp | 174 +--
.../rocclr/runtime/device/pal/paldevice.cpp | 410 ++++---
.../rocclr/runtime/device/pal/paldevice.hpp | 98 +-
.../runtime/device/pal/paldeviced3d10.cpp | 18 +-
.../runtime/device/pal/paldeviced3d11.cpp | 18 +-
.../runtime/device/pal/paldeviced3d9.cpp | 18 +-
.../rocclr/runtime/device/pal/paldevicegl.cpp | 1037 +++++++++--------
.../rocclr/runtime/device/pal/palgpuopen.cpp | 296 +++--
.../rocclr/runtime/device/pal/palgpuopen.hpp | 310 +++--
.../rocclr/runtime/device/pal/palkernel.cpp | 98 +-
.../rocclr/runtime/device/pal/palkernel.hpp | 36 +-
.../rocclr/runtime/device/pal/palmemory.cpp | 30 +-
.../rocclr/runtime/device/pal/palmemory.hpp | 32 +-
.../rocclr/runtime/device/pal/palprintf.hpp | 35 +-
.../rocclr/runtime/device/pal/palprogram.cpp | 69 +-
.../rocclr/runtime/device/pal/palprogram.hpp | 59 +-
.../rocclr/runtime/device/pal/palresource.cpp | 348 +++---
.../rocclr/runtime/device/pal/palresource.hpp | 140 ++-
.../rocclr/runtime/device/pal/palsettings.cpp | 12 +-
.../rocclr/runtime/device/pal/palsettings.hpp | 107 +-
.../runtime/device/pal/paltimestamp.hpp | 5 +-
.../rocclr/runtime/device/pal/palvirtual.cpp | 443 ++++---
.../rocclr/runtime/device/pal/palvirtual.hpp | 203 ++--
33 files changed, 2119 insertions(+), 2146 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/pal/palappprofile.cpp b/projects/clr/rocclr/runtime/device/pal/palappprofile.cpp
index e703204719..8f804911a7 100644
--- a/projects/clr/rocclr/runtime/device/pal/palappprofile.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palappprofile.cpp
@@ -11,8 +11,9 @@ namespace pal {
AppProfile::AppProfile()
: amd::AppProfile(), enableHighPerformanceState_(true), reportAsOCL12Device_(false) {
- propertyDataMap_.insert({"HighPerfState", PropertyData(DataType_Boolean, &enableHighPerformanceState_)});
+ propertyDataMap_.insert(
+ {"HighPerfState", PropertyData(DataType_Boolean, &enableHighPerformanceState_)});
propertyDataMap_.insert({"OCL12Device", PropertyData(DataType_Boolean, &reportAsOCL12Device_)});
}
-}
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palappprofile.hpp b/projects/clr/rocclr/runtime/device/pal/palappprofile.hpp
index a337517cd6..3b7f3e441d 100644
--- a/projects/clr/rocclr/runtime/device/pal/palappprofile.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palappprofile.hpp
@@ -20,4 +20,4 @@ class AppProfile : public amd::AppProfile {
bool enableHighPerformanceState_;
bool reportAsOCL12Device_;
};
-}
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.cpp b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
index 524979ee97..4370f46317 100644
--- a/projects/clr/rocclr/runtime/device/pal/palblit.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
@@ -280,8 +280,8 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
amd::Coord3D copySize(tmpSize, 0, 0);
// Copy data into the temporary buffer, using CPU
- if (!xferBuf.hostWrite(&gpu(), reinterpret_cast(srcHost) + offset,
- src, copySize, flags)) {
+ if (!xferBuf.hostWrite(&gpu(), reinterpret_cast(srcHost) + offset, src, copySize,
+ flags)) {
return false;
}
@@ -296,7 +296,7 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
srcOffset += tmpSize;
if ((srcOffset + tmpSize) > gpu().xferWrite().MaxSize()) {
srcOffset = 0;
- flags = 0;
+ flags = 0;
} else {
flags = Resource::NoWait;
}
@@ -310,7 +310,7 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
// Use host copy if memory has direct access or it's persistent
if (setup_.disableWriteBuffer_ ||
(gpuMem(dstMemory).isHostMemDirectAccess() &&
- (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
+ (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
gpuMem(dstMemory).isPersistentDirectMap()) {
return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
} else {
@@ -335,7 +335,7 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
// Copy memory, using pinning
while (dstSize > 0) {
size_t tmpSize;
- // If it's the first iterarion, then readjust the copy size
+ // If it's the first iterarion, then readjust the copy size
// to include alignment
if (first) {
pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment);
@@ -398,7 +398,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
// Use host copy if memory has direct access or it's persistent
if (setup_.disableWriteBufferRect_ ||
(dstMemory.isHostMemDirectAccess() &&
- (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
+ (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
gpuMem(dstMemory).isPersistentDirectMap()) {
return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire);
} else {
@@ -586,8 +586,8 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
entire, rowPitch, slicePitch);
} else {
// Use PAL path for a transfer
- result = gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin,
- size, gpuMem(dstMemory));
+ result =
+ gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory));
// Check if a HostBlit transfer is required
if (completeOperation_ && !result) {
@@ -947,8 +947,8 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
void* param = kernel->parameters().values() + desc.offset_;
assert((desc.type_ == T_POINTER || value != NULL ||
- (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL)) &&
- "not a valid local mem arg");
+ (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL)) &&
+ "not a valid local mem arg");
uint32_t uint32_value = 0;
uint64_t uint64_value = 0;
@@ -957,14 +957,15 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
if (desc.type_ == T_POINTER && (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) {
if ((value == NULL) || (static_cast(value) == NULL)) {
reinterpret_cast(kernel->parameters().values() +
- kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr;
+ kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] =
+ nullptr;
} else {
// convert cl_mem to amd::Memory*, return false if invalid.
- LP64_SWITCH(uint32_value, uint64_value) = static_cast((
- *static_cast(value))->virtualAddress());
+ LP64_SWITCH(uint32_value, uint64_value) =
+ static_cast((*static_cast(value))->virtualAddress());
reinterpret_cast(kernel->parameters().values() +
- kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] =
- *static_cast(value);
+ kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] =
+ *static_cast(value);
// Note: Special case for image SRD, which is 64 bit always
if (LP64_SWITCH(true, false) &&
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject)) {
@@ -1018,8 +1019,8 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory,
bool releaseView = false;
bool result = false;
amd::Image::Format newFormat(gpuMem(dstMemory).desc().format_);
- bool swapLayer = (dstView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
- dev().settings().gfx10Plus_;
+ bool swapLayer =
+ (dstView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && dev().settings().gfx10Plus_;
// Find unsupported formats
for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
@@ -1078,10 +1079,10 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory,
// Swap the Y and Z components, apparently gfx10 HW expects
// layer in Z
if (swapLayer) {
- globalWorkSize[2] = globalWorkSize[1];
- globalWorkSize[1] = 1;
- localWorkSize[2] = localWorkSize[1];
- localWorkSize[1] = 1;
+ globalWorkSize[2] = globalWorkSize[1];
+ globalWorkSize[1] = 1;
+ localWorkSize[2] = localWorkSize[1];
+ localWorkSize[1] = 1;
}
} else {
globalWorkSize[0] = amd::alignUp(size[0], 8);
@@ -1114,10 +1115,10 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory,
cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0};
if (swapLayer) {
- dstOrg[2] = dstOrg[1];
- dstOrg[1] = 0;
- copySize[2] = copySize[1];
- copySize[1] = 1;
+ dstOrg[2] = dstOrg[1];
+ dstOrg[1] = 0;
+ copySize[2] = copySize[1];
+ copySize[1] = 1;
}
setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg);
@@ -1338,8 +1339,8 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory,
bool releaseView = false;
bool result = false;
amd::Image::Format newFormat(gpuMem(srcMemory).desc().format_);
- bool swapLayer = (srcView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
- dev().settings().gfx10Plus_;
+ bool swapLayer =
+ (srcView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && dev().settings().gfx10Plus_;
// Find unsupported formats
for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
@@ -1398,10 +1399,10 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory,
// Swap the Y and Z components, apparently gfx10 HW expects
// layer in Z
if (swapLayer) {
- globalWorkSize[2] = globalWorkSize[1];
- globalWorkSize[1] = 1;
- localWorkSize[2] = localWorkSize[1];
- localWorkSize[1] = 1;
+ globalWorkSize[2] = globalWorkSize[1];
+ globalWorkSize[1] = 1;
+ localWorkSize[2] = localWorkSize[1];
+ localWorkSize[1] = 1;
}
} else {
globalWorkSize[0] = amd::alignUp(size[0], 8);
@@ -1426,10 +1427,10 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory,
cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1], (cl_int)srcOrigin[2], 0};
cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0};
if (swapLayer) {
- srcOrg[2] = srcOrg[1];
- srcOrg[1] = 0;
- copySize[2] = copySize[1];
- copySize[1] = 1;
+ srcOrg[2] = srcOrg[1];
+ srcOrg[1] = 0;
+ copySize[2] = copySize[1];
+ copySize[1] = 1;
}
setArgument(kernels_[blitType], 4, sizeof(srcOrg), srcOrg);
uint32_t memFmtSize = gpuMem(srcMemory).elementSize();
@@ -1570,7 +1571,7 @@ bool KernelBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dst
// Program source origin
cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1], (cl_int)srcOrigin[2], 0};
if ((gpuMem(srcMemory).desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
- dev().settings().gfx10Plus_) {
+ dev().settings().gfx10Plus_) {
srcOrg[3] = 1;
}
setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg);
@@ -1578,7 +1579,7 @@ bool KernelBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dst
// Program destinaiton origin
cl_int dstOrg[4] = {(cl_int)dstOrigin[0], (cl_int)dstOrigin[1], (cl_int)dstOrigin[2], 0};
if ((gpuMem(dstMemory).desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
- dev().settings().gfx10Plus_) {
+ dev().settings().gfx10Plus_) {
dstOrg[3] = 1;
}
setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg);
@@ -1700,16 +1701,15 @@ bool KernelBlitManager::writeImage(const void* srcHost, device::Memory& dstMemor
amdMemory = pinHostMemory(srcHost, pinSize, partial);
if (amdMemory == nullptr) {
// Force SW copy
- result = HostBlitManager::writeImage(srcHost, dstMemory,
- origin, size, rowPitch, slicePitch, entire);
+ result = HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch,
+ entire);
synchronize();
return result;
}
// Get device memory for this virtual device
srcMemory = dev().getGpuMemory(amdMemory);
pinned = true;
- }
- else {
+ } else {
srcMemory = &gpu().xferWrite().Acquire(pinSize);
srcMemory->hostWrite(&gpu(), srcHost, 0, pinSize, Resource::NoWait);
pinned = false;
@@ -1951,7 +1951,7 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
// Use host copy if memory has direct access or it's persistent
if (setup_.disableWriteBuffer_ ||
(gpuMem(dstMemory).isHostMemDirectAccess() &&
- (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
+ (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
(gpuMem(dstMemory).memoryType() == Resource::Persistent)) {
result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
synchronize();
@@ -2002,7 +2002,7 @@ bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dst
// Use host copy if memory has direct access or it's persistent
if (setup_.disableWriteBufferRect_ ||
(gpuMem(dstMemory).isHostMemDirectAccess() &&
- (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
+ (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
gpuMem(dstMemory).isPersistentDirectMap()) {
result = HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire);
synchronize();
@@ -2206,8 +2206,8 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
size_t localWorkSize[3];
Memory* memView = &gpuMem(memory);
amd::Image::Format newFormat(gpuMem(memory).owner()->asImage()->getImageFormat());
- bool swapLayer = (memView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
- dev().settings().gfx10Plus_;
+ bool swapLayer =
+ (memView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && dev().settings().gfx10Plus_;
// Program the kernels workload depending on the fill dimensions
fillType = FillImage;
@@ -2274,10 +2274,10 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
// Swap the Y and Z components, apparently gfx10 HW expects
// layer in Z
if (swapLayer) {
- globalWorkSize[2] = globalWorkSize[1];
- globalWorkSize[1] = 1;
- localWorkSize[2] = localWorkSize[1];
- localWorkSize[1] = 1;
+ globalWorkSize[2] = globalWorkSize[1];
+ globalWorkSize[1] = 1;
+ localWorkSize[2] = localWorkSize[1];
+ localWorkSize[1] = 1;
}
} else {
globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 8);
@@ -2297,10 +2297,10 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
cl_int fillOrigin[4] = {(cl_int)origin[0], (cl_int)origin[1], (cl_int)origin[2], 0};
cl_int fillSize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0};
if (swapLayer) {
- fillOrigin[2] = fillOrigin[1];
- fillOrigin[1] = 0;
- fillSize[2] = fillSize[1];
- fillSize[1] = 1;
+ fillOrigin[2] = fillOrigin[1];
+ fillOrigin[1] = 0;
+ fillSize[2] = fillSize[1];
+ fillSize[1] = 1;
}
setArgument(kernels_[fillType], 4, sizeof(fillOrigin), fillOrigin);
setArgument(kernels_[fillType], 5, sizeof(fillSize), fillSize);
diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.hpp b/projects/clr/rocclr/runtime/device/pal/palblit.hpp
index fe52ac2a59..4c9769d678 100644
--- a/projects/clr/rocclr/runtime/device/pal/palblit.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palblit.hpp
@@ -27,7 +27,7 @@ class DmaBlitManager : public device::HostBlitManager {
//! Constructor
DmaBlitManager(VirtualGPU& gpu, //!< Virtual GPU to be used for blits
Setup setup = Setup() //!< Specifies HW accelerated blits
- );
+ );
//! Destructor
virtual ~DmaBlitManager() {}
@@ -211,7 +211,7 @@ class KernelBlitManager : public DmaBlitManager {
//! Constructor
KernelBlitManager(VirtualGPU& gpu, //!< Virtual GPU to be used for blits
Setup setup = Setup() //!< Specifies HW accelerated blits
- );
+ );
//! Destructor
virtual ~KernelBlitManager();
@@ -382,7 +382,7 @@ class KernelBlitManager : public DmaBlitManager {
//! Creates a program for all blit operations
bool createProgram(Device& device //!< Device object
- );
+ );
//! Creates a view memory object
Memory* createView(const Memory& parent, //!< Parent memory object
@@ -409,4 +409,5 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = {
"fillImage", "scheduler",
};
-/*@}*/} // namespace pal
+/*@}*/ // namespace pal
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp
index ccd6dfb583..3bf5be1fd0 100644
--- a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp
@@ -11,12 +11,12 @@ namespace pal {
// ================================================================================================
ManagedBuffer::ManagedBuffer(VirtualGPU& gpu, uint32_t size)
- : gpu_(gpu)
- , pool_(MaxNumberOfBuffers)
- , activeBuffer_(0)
- , size_(size)
- , wrtOffset_(0)
- , wrtAddress_(nullptr) {}
+ : gpu_(gpu),
+ pool_(MaxNumberOfBuffers),
+ activeBuffer_(0),
+ size_(size),
+ wrtOffset_(0),
+ wrtAddress_(nullptr) {}
// ================================================================================================
void ManagedBuffer::release() {
@@ -40,8 +40,8 @@ bool ManagedBuffer::create(Resource::MemoryType type) {
pool_[i].buf->memRef()->gpu_ = &gpu_;
void* wrtAddress = pool_[i].buf->map(&gpu_);
if (wrtAddress == nullptr) {
- LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_);
- return false;
+ LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_);
+ return false;
}
// Make sure OCL touches every buffer in the queue to avoid delays on the first submit
uint dummy = 0;
@@ -94,15 +94,10 @@ void ManagedBuffer::pinGpuEvent() {
// ================================================================================================
ConstantBuffer::ConstantBuffer(ManagedBuffer& mbuf, uint32_t size)
- : mbuf_(mbuf)
- , sys_mem_copy_(nullptr)
- , size_(size)
-{}
+ : mbuf_(mbuf), sys_mem_copy_(nullptr), size_(size) {}
// ================================================================================================
-ConstantBuffer::~ConstantBuffer() {
- amd::AlignedMemory::deallocate(sys_mem_copy_);
-}
+ConstantBuffer::~ConstantBuffer() { amd::AlignedMemory::deallocate(sys_mem_copy_); }
// ================================================================================================
bool ConstantBuffer::Create() {
@@ -118,8 +113,8 @@ bool ConstantBuffer::Create() {
// ================================================================================================
uint64_t ConstantBuffer::UploadDataToHw(uint32_t size) const {
- uint64_t vm_address;
- address cpu_address = mbuf_.reserve(size, &vm_address);
+ uint64_t vm_address;
+ address cpu_address = mbuf_.reserve(size, &vm_address);
// Update memory with new CB data
memcpy(cpu_address, sys_mem_copy_, size);
return vm_address;
@@ -127,8 +122,8 @@ uint64_t ConstantBuffer::UploadDataToHw(uint32_t size) const {
// ================================================================================================
uint64_t ConstantBuffer::UploadDataToHw(const void* sysmem, uint32_t size) const {
- uint64_t vm_address;
- address cpu_address = mbuf_.reserve(size, &vm_address);
+ uint64_t vm_address;
+ address cpu_address = mbuf_.reserve(size, &vm_address);
// Update memory with new CB data
memcpy(cpu_address, sysmem, size);
return vm_address;
@@ -136,9 +131,7 @@ uint64_t ConstantBuffer::UploadDataToHw(const void* sysmem, uint32_t size) const
// ================================================================================================
XferBuffer::XferBuffer(const Device& device, ManagedBuffer& mbuf, uint32_t size)
- : buffer_view_(device, size)
- , mbuf_(mbuf)
- , size_(size) {
+ : buffer_view_(device, size), mbuf_(mbuf), size_(size) {
// Create a view for access
Resource::ViewParams params = {};
params.gpu_ = &mbuf_.gpu();
@@ -151,9 +144,9 @@ XferBuffer::XferBuffer(const Device& device, ManagedBuffer& mbuf, uint32_t size)
// ================================================================================================
Memory& XferBuffer::Acquire(uint32_t size) {
- uint64_t vm_address;
+ uint64_t vm_address;
// Reserve space in the managed buffer
- address cpu_address = mbuf_.reserve(size, &vm_address);
+ address cpu_address = mbuf_.reserve(size, &vm_address);
// Update a view for access
buffer_view_.updateView(mbuf_.activeMemory(), vm_address - mbuf_.vmAddress(), size);
return buffer_view_;
diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp
index c1853b0537..5a2279eec5 100644
--- a/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp
@@ -12,9 +12,9 @@ namespace pal {
class ManagedBuffer : public amd::EmbeddedObject {
public:
//! Constructor for the ConstBuffer class
- ManagedBuffer(VirtualGPU& gpu, //!< Virtual GPU device object
- uint32_t size //!< size of the managed buffers in bytes
- );
+ ManagedBuffer(VirtualGPU& gpu, //!< Virtual GPU device object
+ uint32_t size //!< size of the managed buffers in bytes
+ );
~ManagedBuffer() {}
//! Creates the managed buffers
@@ -50,8 +50,8 @@ class ManagedBuffer : public amd::EmbeddedObject {
private:
struct TimeStampedBuffer {
- Memory* buf;
- GpuEvent events[AllEngines];
+ Memory* buf;
+ GpuEvent events[AllEngines];
};
//! The maximum number of the managed buffers
@@ -63,21 +63,21 @@ class ManagedBuffer : public amd::EmbeddedObject {
//! Disable operator=
ManagedBuffer& operator=(const ManagedBuffer&) = delete;
- VirtualGPU& gpu_; //!< Virtual GPU object
- std::vector pool_; //!< Buffers for management
- uint32_t activeBuffer_; //!< Current active buffer
- uint32_t size_; //!< Constant buffer size
- uint32_t wrtOffset_; //!< Current write offset
- address wrtAddress_; //!< Write address in CB
+ VirtualGPU& gpu_; //!< Virtual GPU object
+ std::vector pool_; //!< Buffers for management
+ uint32_t activeBuffer_; //!< Current active buffer
+ uint32_t size_; //!< Constant buffer size
+ uint32_t wrtOffset_; //!< Current write offset
+ address wrtAddress_; //!< Write address in CB
};
//! Constant buffer
class ConstantBuffer : public amd::HeapObject {
-public:
+ public:
//! Constructor for the ConstBuffer class
ConstantBuffer(ManagedBuffer& mbuf, //!< Managed buffer
- uint32_t size //!< Max size of the constant buffer
- );
+ uint32_t size //!< Max size of the constant buffer
+ );
//! Destructor for the ConstBuffer class
~ConstantBuffer();
@@ -86,18 +86,18 @@ public:
bool Create();
/*! \brief Uploads current constant buffer data from sysMemCopy_ to HW
- *
- * \return GPU address for the uploaded data
- */
+ *
+ * \return GPU address for the uploaded data
+ */
uint64_t UploadDataToHw(uint32_t size //!< real data size for upload
) const;
/*! \brief Uploads current constant buffer data from sysMemCopy_ to HW
- *
- * \return GPU address for the uploaded data
- */
+ *
+ * \return GPU address for the uploaded data
+ */
uint64_t UploadDataToHw(const void* sysmem, //!< Pointer to the data for upload
- uint32_t size //!< Real data size for upload
+ uint32_t size //!< Real data size for upload
) const;
//! Returns a pointer to the system memory copy for CB
@@ -106,52 +106,55 @@ public:
//! Returns active GPU buffer
Memory* ActiveMemory() const { return mbuf_.activeMemory(); }
-private:
+ private:
//! Disable copy constructor
ConstantBuffer(const ConstantBuffer&) = delete;
//! Disable operator=
ConstantBuffer& operator=(const ConstantBuffer&) = delete;
- ManagedBuffer& mbuf_; //!< Managed buffer on GPU
- address sys_mem_copy_; //!< System memory copy
- uint32_t size_; //!< Constant buffer size
+ ManagedBuffer& mbuf_; //!< Managed buffer on GPU
+ address sys_mem_copy_; //!< System memory copy
+ uint32_t size_; //!< Constant buffer size
};
//! Staging buffer
class XferBuffer : public amd::EmbeddedObject {
-public:
+ public:
//! Constructor for the ConstBuffer class
- XferBuffer(const Device& device, //!< Active GPU device
+ XferBuffer(const Device& device, //!< Active GPU device
ManagedBuffer& mbuf, //!< Managed buffer
- uint32_t size //!< Maximum size of the transfer buffer
+ uint32_t size //!< Maximum size of the transfer buffer
);
//! Destructor for the ConstBuffer class
~XferBuffer() {}
/*! \brief Acquires free memory from the managed buffer
- *
- * \return GPU memory object associated with free memory
- */
- Memory& Acquire(uint32_t size //!< data size for transfers
- );
+ *
+ * \return GPU memory object associated with free memory
+ */
+ Memory& Acquire(uint32_t size //!< data size for transfers
+ );
//! Releases memory object used in the staging transfer
void Release(Memory& mem //!< Memory object for release
- ) { buffer_view_.updateView(nullptr, 0, 0); }
+ ) {
+ buffer_view_.updateView(nullptr, 0, 0);
+ }
size_t MaxSize() const { return static_cast(size_); }
-private:
+ private:
//! Disable copy constructor
XferBuffer(const XferBuffer&) = delete;
//! Disable operator=
XferBuffer& operator=(const XferBuffer&) = delete;
- Memory buffer_view_; //!< Buffer view returned in the acquire
- ManagedBuffer& mbuf_; //!< Managed buffer on GPU
- uint32_t size_; //!< Mx staging buffer size
+ Memory buffer_view_; //!< Buffer view returned in the acquire
+ ManagedBuffer& mbuf_; //!< Managed buffer on GPU
+ uint32_t size_; //!< Mx staging buffer size
};
-/*@}*/} // namespace pal
+/*@}*/ // namespace pal
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palcounters.cpp b/projects/clr/rocclr/runtime/device/pal/palcounters.cpp
index 2be9c3d50e..3af5ca0cf2 100644
--- a/projects/clr/rocclr/runtime/device/pal/palcounters.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palcounters.cpp
@@ -676,12 +676,12 @@ void PerfCounter::convertInfo() {
break;
case Pal::GfxIpLevel::GfxIp10:
case Pal::GfxIpLevel::GfxIp10_1:
- if (info_.blockIndex_ < gfx10BlockIdPal.size()) {
- auto p = gfx10BlockIdPal[info_.blockIndex_];
- info_.blockIndex_ = std::get<0>(p);
- info_.counterIndex_ = std::get<1>(p);
- }
- break;
+ if (info_.blockIndex_ < gfx10BlockIdPal.size()) {
+ auto p = gfx10BlockIdPal[info_.blockIndex_];
+ info_.blockIndex_ = std::get<0>(p);
+ info_.counterIndex_ = std::get<1>(p);
+ }
+ break;
default:
Unimplemented();
break;
diff --git a/projects/clr/rocclr/runtime/device/pal/palcounters.hpp b/projects/clr/rocclr/runtime/device/pal/palcounters.hpp
index ea55cc1600..4632c8b277 100644
--- a/projects/clr/rocclr/runtime/device/pal/palcounters.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palcounters.hpp
@@ -84,8 +84,7 @@ class PerfCounter : public device::PerfCounter {
cl_uint blockIndex, //!< HW block index
cl_uint counterIndex, //!< Counter index within the block
cl_uint eventIndex) //!< Event index for profiling
- : gpuDevice_(device),
- palRef_(palRef) {
+ : gpuDevice_(device), palRef_(palRef) {
info_.blockIndex_ = blockIndex;
info_.counterIndex_ = counterIndex;
info_.eventIndex_ = eventIndex;
diff --git a/projects/clr/rocclr/runtime/device/pal/paldebugger.hpp b/projects/clr/rocclr/runtime/device/pal/paldebugger.hpp
index cb1d4dd981..70812b4028 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldebugger.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldebugger.hpp
@@ -98,10 +98,10 @@ struct HwDebugWaveAddr {
};
/*! \brief Kernel code information
-*
-* This structure contains the pointer of mapped kernel code for host access
-* and its size (in bytes)
-*/
+ *
+ * This structure contains the pointer of mapped kernel code for host access
+ * and its size (in bytes)
+ */
struct AqlCodeInfo {
amd_kernel_code_t* aqlCode_; //! pointer of AQL code to allow host access
uint32_t aqlCodeSize_; //! size of AQL code
diff --git a/projects/clr/rocclr/runtime/device/pal/paldebugmanager.cpp b/projects/clr/rocclr/runtime/device/pal/paldebugmanager.cpp
index 124de40991..f8fdac9d0e 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldebugmanager.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldebugmanager.cpp
@@ -143,7 +143,7 @@ void GpuDebugManager::unregisterDebugger() {
void GpuDebugManager::flushCache(uint32_t mask) {
HwDbgGpuCacheMask cacheMask(mask);
- //device()->xferQueue()->flushCuCaches(cacheMask);
+ // device()->xferQueue()->flushCuCaches(cacheMask);
}
diff --git a/projects/clr/rocclr/runtime/device/pal/paldefs.hpp b/projects/clr/rocclr/runtime/device/pal/paldefs.hpp
index 989efc51d9..fdd8213cee 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldefs.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldefs.hpp
@@ -47,9 +47,9 @@ struct GpuEvent {
static constexpr uint32_t InvalidID = ((1 << 30) - 1);
struct {
- uint32_t id_ : 30; ///< Actual event id
- uint32_t modified_ : 1; ///< Resource associated with the event was modified
- uint32_t engineId_ : 1; ///< Type of the id
+ uint32_t id_ : 30; ///< Actual event id
+ uint32_t modified_ : 1; ///< Resource associated with the event was modified
+ uint32_t engineId_ : 1; ///< Type of the id
};
//! GPU event default constructor
GpuEvent() : id_(InvalidID), modified_(false), engineId_(MainEngine) {}
@@ -63,8 +63,11 @@ struct GpuEvent {
void invalidate() { id_ = InvalidID; }
// Overwrite default assign operator to preserve modified_ field
- GpuEvent& operator=(const GpuEvent& evt)
- { id_ = evt.id_; engineId_ = evt.engineId_; return *this; }
+ GpuEvent& operator=(const GpuEvent& evt) {
+ id_ = evt.id_;
+ engineId_ = evt.engineId_;
+ return *this;
+ }
};
/*! \addtogroup PAL
@@ -113,87 +116,110 @@ const static uint HsaSamplerObjectAlignment = 16;
const static uint DeviceQueueMaskSize = 32;
struct AMDDeviceInfo {
- const char* targetName_; //!< Target name
- const char* machineTarget_; //!< Machine target
- const char* machineTargetLC_;//!< Machine target for LC
- uint simdPerCU_; //!< Number of SIMDs per CU
- uint simdWidth_; //!< Number of workitems processed per SIMD
- uint simdInstructionWidth_; //!< Number of instructions processed per SIMD
- uint memChannelBankWidth_; //!< Memory channel bank width
- uint localMemSizePerCU_; //!< Local memory size per CU
- uint localMemBanks_; //!< Number of banks of local memory
- uint gfxipVersionLC_; //!< The core engine GFXIP version for LC
- uint gfxipVersion_; //!< The core engine GFXIP version
- bool xnackEnabled_; //!< Enable XNACK feature
+ const char* targetName_; //!< Target name
+ const char* machineTarget_; //!< Machine target
+ const char* machineTargetLC_; //!< Machine target for LC
+ uint simdPerCU_; //!< Number of SIMDs per CU
+ uint simdWidth_; //!< Number of workitems processed per SIMD
+ uint simdInstructionWidth_; //!< Number of instructions processed per SIMD
+ uint memChannelBankWidth_; //!< Memory channel bank width
+ uint localMemSizePerCU_; //!< Local memory size per CU
+ uint localMemBanks_; //!< Number of banks of local memory
+ uint gfxipVersionLC_; //!< The core engine GFXIP version for LC
+ uint gfxipVersion_; //!< The core engine GFXIP version
+ bool xnackEnabled_; //!< Enable XNACK feature
};
static const AMDDeviceInfo DeviceInfo[] = {
- /* Unknown */ {"", "unknown", "", 4, 16, 1, 256, 64 * Ki, 32, 0, 0, false},
- /* Tahiti */ {"", "tahiti", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
- /* Pitcairn */ {"", "pitcairn", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
- /* Capeverde */ {"", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false},
- /* Oland */ {"", "oland", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
- /* Hainan */ {"", "hainan", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
+ /* Unknown */ {"", "unknown", "", 4, 16, 1, 256, 64 * Ki, 32, 0, 0, false},
+ /* Tahiti */ {"", "tahiti", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
+ /* Pitcairn */ {"", "pitcairn", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
+ /* Capeverde */ {"", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false},
+ /* Oland */ {"", "oland", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
+ /* Hainan */ {"", "hainan", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
- /* Bonaire */ {"Bonaire", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false},
- /* Hawaii */ {"Hawaii", "hawaii", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
- /* Hawaii */ {"", "grenada", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
- /* Hawaii */ {"", "maui", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
+ /* Bonaire */ {"Bonaire", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false},
+ /* Hawaii */ {"Hawaii", "hawaii", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
+ /* Hawaii */ {"", "grenada", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
+ /* Hawaii */ {"", "maui", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
- /* Kalindi */ {"Kalindi", "kalindi", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false},
- /* Godavari */ {"Mullins", "mullins", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false},
- /* Spectre */ {"Spectre", "spectre", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
- /* Spooky */ {"Spooky", "spooky", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
+ /* Kalindi */ {"Kalindi", "kalindi", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false},
+ /* Godavari */ {"Mullins", "mullins", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false},
+ /* Spectre */ {"Spectre", "spectre", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
+ /* Spooky */ {"Spooky", "spooky", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
- /* Carrizo */ {"Carrizo", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801,false},
- /* Bristol */ {"Bristol Ridge", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801, false},
- /* Stoney */ {"Stoney", "stoney", "", 4, 16, 1, 256, 64 * Ki, 32, 810, 810, false},
+ /* Carrizo */ {"Carrizo", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801, false},
+ /* Bristol */ {"Bristol Ridge", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801, false},
+ /* Stoney */ {"Stoney", "stoney", "", 4, 16, 1, 256, 64 * Ki, 32, 810, 810, false},
- /* Iceland */ {"Iceland", "iceland", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false},
- /* Tonga */ {"Tonga", "tonga", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false},
- /* Fiji */ {"Fiji", "fiji", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
- /* Ellesmere */ {"Ellesmere", "ellesmere", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
- /* Baffin */ {"Baffin", "baffin", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
- /* Lexa */ {"gfx804", "gfx804", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
+ /* Iceland */ {"Iceland", "iceland", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false},
+ /* Tonga */ {"Tonga", "tonga", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false},
+ /* Fiji */ {"Fiji", "fiji", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
+ /* Ellesmere */
+ {"Ellesmere", "ellesmere", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
+ /* Baffin */ {"Baffin", "baffin", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
+ /* Lexa */ {"gfx804", "gfx804", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
};
// Ordering as per AsicRevision# in //depot/stg/pal/inc/core/palDevice.h and
// http://confluence.amd.com/pages/viewpage.action?spaceKey=ASLC&title=AMDGPU+Target+Names
static const AMDDeviceInfo Gfx9PlusSubDeviceInfo[] = {
- /* Vega10 */{"gfx900", "gfx900", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 900, 900, false},
- /* Vega10 XNACK */{ LIGHTNING_SWITCH("gfx900","gfx901"), "gfx901", "gfx900",
- 4, 16, 1, 256, 64 * Ki, 32, 900, 901, true},
- /* Vega12 */{"gfx904", "gfx904", "gfx904", 4, 16, 1, 256, 64 * Ki, 32, 904, 904, false},
- /* Vega12 XNACK */{ LIGHTNING_SWITCH("gfx904","gfx905"), "gfx905", "gfx904",
- 4, 16, 1, 256, 64 * Ki, 32, 904, 905, true},
- /* Vega20 */{"gfx906", "gfx906", "gfx906", 4, 16, 1, 256, 64 * Ki, 32, 906, 906, false},
- /* Vega20 XNACK */{ LIGHTNING_SWITCH("gfx906","gfx907"), "gfx907", "gfx906",
- 4, 16, 1, 256, 64 * Ki, 32, 906, 907, true},
- /* Raven */{"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
- /* Raven XNACK */{ LIGHTNING_SWITCH("gfx902","gfx903"), "gfx903", "gfx902",
- 4, 16, 1, 256, 64 * Ki, 32, 902, 903, true},
- /* Raven2 */{"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
- /* Raven2 XNACK */{ LIGHTNING_SWITCH("gfx902","gfx903"), "gfx903", "gfx902",
- 4, 16, 1, 256, 64 * Ki, 32, 902, 903, true},
- /* Renoir */{"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
- /* Renoir XNACK */{ LIGHTNING_SWITCH("gfx902","gfx903"), "gfx903", "gfx902",
- 4, 16, 1, 256, 64 * Ki, 32, 902, 903, true},
- /* Navi10_A0 */{ "gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, false },
- /* Navi10_A0 XNACK */{ "gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, true },
- /* Navi10 */{"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, false},
- /* Navi10 XNACK */{"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, true},
- /* Navi10Lite */{"gfx1000", "gfx1000","gfx1000", 2, 32, 1, 256, 64 * Ki, 32, 1000, 1000, false},
- /* Navi10Lite XNACK */{"gfx1000", "gfx1000", "gfx1000", 2, 32, 1, 256, 64 * Ki, 32, 1000, 1000, true},
- /* Navi12 */{ "gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, false },
- /* Navi12 XNACK */{ "gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, true },
- /* Navi12Lite */{ "gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, false },
- /* Navi12Lite XNACK */{ "gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, true },
- /* Navi14 */{ "gfx1012", "gfx1012", "gfx1012", 2, 32, 1, 256, 64 * Ki, 32, 1012, 1012, false },
- /* Navi14 XNACK */{ "gfx1012", "gfx1012", "gfx1012", 2, 32, 1, 256, 64 * Ki, 32, 1012, 1012, true },
- /* UnknownDevice3 */{ "gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, false },
- /* UnknownDevice3 XNACK */{ "gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, true },
- /* UnknownDevice2 */{ "gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, false },
- /* UnknownDevice2 XNACK */{ "gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, true },
+ /* Vega10 */ {"gfx900", "gfx900", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 900, 900, false},
+ /* Vega10 XNACK */
+ {LIGHTNING_SWITCH("gfx900", "gfx901"), "gfx901", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 900, 901,
+ true},
+ /* Vega12 */ {"gfx904", "gfx904", "gfx904", 4, 16, 1, 256, 64 * Ki, 32, 904, 904, false},
+ /* Vega12 XNACK */
+ {LIGHTNING_SWITCH("gfx904", "gfx905"), "gfx905", "gfx904", 4, 16, 1, 256, 64 * Ki, 32, 904, 905,
+ true},
+ /* Vega20 */ {"gfx906", "gfx906", "gfx906", 4, 16, 1, 256, 64 * Ki, 32, 906, 906, false},
+ /* Vega20 XNACK */
+ {LIGHTNING_SWITCH("gfx906", "gfx907"), "gfx907", "gfx906", 4, 16, 1, 256, 64 * Ki, 32, 906, 907,
+ true},
+ /* Raven */ {"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
+ /* Raven XNACK */
+ {LIGHTNING_SWITCH("gfx902", "gfx903"), "gfx903", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 903,
+ true},
+ /* Raven2 */ {"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
+ /* Raven2 XNACK */
+ {LIGHTNING_SWITCH("gfx902", "gfx903"), "gfx903", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 903,
+ true},
+ /* Renoir */ {"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
+ /* Renoir XNACK */
+ {LIGHTNING_SWITCH("gfx902", "gfx903"), "gfx903", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 903,
+ true},
+ /* Navi10_A0 */
+ {"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, false},
+ /* Navi10_A0 XNACK */
+ {"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, true},
+ /* Navi10 */
+ {"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, false},
+ /* Navi10 XNACK */
+ {"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, true},
+ /* Navi10Lite */
+ {"gfx1000", "gfx1000", "gfx1000", 2, 32, 1, 256, 64 * Ki, 32, 1000, 1000, false},
+ /* Navi10Lite XNACK */
+ {"gfx1000", "gfx1000", "gfx1000", 2, 32, 1, 256, 64 * Ki, 32, 1000, 1000, true},
+ /* Navi12 */
+ {"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, false},
+ /* Navi12 XNACK */
+ {"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, true},
+ /* Navi12Lite */
+ {"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, false},
+ /* Navi12Lite XNACK */
+ {"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, true},
+ /* Navi14 */
+ {"gfx1012", "gfx1012", "gfx1012", 2, 32, 1, 256, 64 * Ki, 32, 1012, 1012, false},
+ /* Navi14 XNACK */
+ {"gfx1012", "gfx1012", "gfx1012", 2, 32, 1, 256, 64 * Ki, 32, 1012, 1012, true},
+ /* UnknownDevice3 */
+ {"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, false},
+ /* UnknownDevice3 XNACK */
+ {"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, true},
+ /* UnknownDevice2 */
+ {"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, false},
+ /* UnknownDevice2 XNACK */
+ {"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, true},
};
diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
index c34f2ab003..4132b19f78 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
@@ -53,15 +53,14 @@ void PalDeviceUnload() { pal::Device::tearDown(); }
namespace pal {
-Util::GenericAllocator NullDevice::allocator_;
+Util::GenericAllocator NullDevice::allocator_;
char* Device::platformObj_;
-Pal::IPlatform* Device::platform_;
+Pal::IPlatform* Device::platform_;
NullDevice::Compiler* NullDevice::compiler_;
AppProfile Device::appProfile_;
-NullDevice::NullDevice()
- : amd::Device(), ipLevel_(Pal::GfxIpLevel::None), hwInfo_(nullptr) {}
+NullDevice::NullDevice() : amd::Device(), ipLevel_(Pal::GfxIpLevel::None), hwInfo_(nullptr) {}
bool NullDevice::init() {
std::vector devices;
@@ -89,8 +88,8 @@ bool NullDevice::init() {
driverVersion = static_cast(devices[i])->info().driverVersion_;
if (driverVersion.find("PAL") != std::string::npos) {
if (static_cast(devices[i])->asicRevision() == revision) {
- foundActive = true;
- break;
+ foundActive = true;
+ break;
}
}
}
@@ -109,132 +108,130 @@ bool NullDevice::init() {
}
}
}
-#endif // defined(WITH_COMPILER_LIB)
+#endif // defined(WITH_COMPILER_LIB)
// Loop through all supported devices and create each of them
- for (uint id = 0;
- id < sizeof(Gfx9PlusSubDeviceInfo)/sizeof(AMDDeviceInfo); ++id) {
- bool foundActive = false;
- bool foundDuplicate = false;
- uint gfxipVersion = IS_LIGHTNING ? pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_ :
- pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_;
+ for (uint id = 0; id < sizeof(Gfx9PlusSubDeviceInfo) / sizeof(AMDDeviceInfo); ++id) {
+ bool foundActive = false;
+ bool foundDuplicate = false;
+ uint gfxipVersion = IS_LIGHTNING ? pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_
+ : pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_;
- if (pal::Gfx9PlusSubDeviceInfo[id].targetName_[0] == '\0') {
- continue;
- }
+ if (pal::Gfx9PlusSubDeviceInfo[id].targetName_[0] == '\0') {
+ continue;
+ }
- // Loop through all active PAL devices and see if we match one
- for (uint i = 0; i < devices.size(); ++i) {
- driverVersion = static_cast(devices[i])->info().driverVersion_;
- if (driverVersion.find("PAL") != std::string::npos) {
- gfxipVersion = devices[i]->settings().useLightning_ ?
- pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_ :
- pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_;
- uint gfxIpCurrent = devices[i]->settings().useLightning_ ?
- static_cast(devices[i])->hwInfo()->gfxipVersionLC_ :
- static_cast(devices[i])->hwInfo()->gfxipVersion_;
- if (gfxIpCurrent == gfxipVersion) {
- foundActive = true;
- break;
- }
+ // Loop through all active PAL devices and see if we match one
+ for (uint i = 0; i < devices.size(); ++i) {
+ driverVersion = static_cast(devices[i])->info().driverVersion_;
+ if (driverVersion.find("PAL") != std::string::npos) {
+ gfxipVersion = devices[i]->settings().useLightning_
+ ? pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_
+ : pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_;
+ uint gfxIpCurrent = devices[i]->settings().useLightning_
+ ? static_cast(devices[i])->hwInfo()->gfxipVersionLC_
+ : static_cast(devices[i])->hwInfo()->gfxipVersion_;
+ if (gfxIpCurrent == gfxipVersion) {
+ foundActive = true;
+ break;
}
}
+ }
- // Don't report an offline device if it's active
- if (foundActive) {
- continue;
+ // Don't report an offline device if it's active
+ if (foundActive) {
+ continue;
+ }
+
+ // Loop through all previous devices in the Gfx9PlusSubDeviceInfo list
+ // and compare them with the current entry to see if the current entry
+ // was listed previously in the Gfx9PlusSubDeviceInfo, if so, then it
+ // means the current entry already has been added in the offline device list
+ for (uint j = 0; j < id; ++j) {
+ if (pal::Gfx9PlusSubDeviceInfo[j].targetName_[0] == '\0') {
+ continue;
}
-
- // Loop through all previous devices in the Gfx9PlusSubDeviceInfo list
- // and compare them with the current entry to see if the current entry
- // was listed previously in the Gfx9PlusSubDeviceInfo, if so, then it
- // means the current entry already has been added in the offline device list
- for (uint j = 0; j < id; ++j) {
- if (pal::Gfx9PlusSubDeviceInfo[j].targetName_[0] == '\0') {
- continue;
- }
- if (strcmp(pal::Gfx9PlusSubDeviceInfo[j].targetName_,
- pal::Gfx9PlusSubDeviceInfo[id].targetName_) == 0) {
- foundDuplicate = true;
- break;
- }
+ if (strcmp(pal::Gfx9PlusSubDeviceInfo[j].targetName_,
+ pal::Gfx9PlusSubDeviceInfo[id].targetName_) == 0) {
+ foundDuplicate = true;
+ break;
}
+ }
- // Don't report an offline device twice
- if (foundDuplicate) {
- continue;
- }
+ // Don't report an offline device twice
+ if (foundDuplicate) {
+ continue;
+ }
- Pal::GfxIpLevel ipLevel = Pal::GfxIpLevel::_None;
- uint ipLevelMajor = round(gfxipVersion / 100);
- uint ipLevelMinor = round(gfxipVersion / 10 % 10);
- switch (ipLevelMajor) {
+ Pal::GfxIpLevel ipLevel = Pal::GfxIpLevel::_None;
+ uint ipLevelMajor = round(gfxipVersion / 100);
+ uint ipLevelMinor = round(gfxipVersion / 10 % 10);
+ switch (ipLevelMajor) {
case 9:
- ipLevel = Pal::GfxIpLevel::GfxIp9;
- break;
+ ipLevel = Pal::GfxIpLevel::GfxIp9;
+ break;
case 10:
switch (ipLevelMinor) {
- case 0:
- ipLevel = Pal::GfxIpLevel::GfxIp10;
- break;
- case 1:
- ipLevel = Pal::GfxIpLevel::GfxIp10_1;
- break;
- case 2:
- ipLevel = Pal::GfxIpLevel::GfxIp10_2;
- break;
- case 3:
- ipLevel = Pal::GfxIpLevel::GfxIp10_3;
- break;
+ case 0:
+ ipLevel = Pal::GfxIpLevel::GfxIp10;
+ break;
+ case 1:
+ ipLevel = Pal::GfxIpLevel::GfxIp10_1;
+ break;
+ case 2:
+ ipLevel = Pal::GfxIpLevel::GfxIp10_2;
+ break;
+ case 3:
+ ipLevel = Pal::GfxIpLevel::GfxIp10_3;
+ break;
}
- }
+ }
- Pal::AsicRevision revision = Pal::AsicRevision::Unknown;
- uint xNACKSupported = pal::Gfx9PlusSubDeviceInfo[id].xnackEnabled_ ? 1 : 0;
+ Pal::AsicRevision revision = Pal::AsicRevision::Unknown;
+ uint xNACKSupported = pal::Gfx9PlusSubDeviceInfo[id].xnackEnabled_ ? 1 : 0;
- switch (gfxipVersion) {
+ switch (gfxipVersion) {
case 901:
case 900:
- revision = Pal::AsicRevision::Vega10;
- break;
+ revision = Pal::AsicRevision::Vega10;
+ break;
case 903:
case 902:
- revision = Pal::AsicRevision::Raven;
- break;
+ revision = Pal::AsicRevision::Raven;
+ break;
case 905:
case 904:
- revision = Pal::AsicRevision::Vega12;
- break;
+ revision = Pal::AsicRevision::Vega12;
+ break;
case 907:
case 906:
- revision = Pal::AsicRevision::Vega20;
- break;
+ revision = Pal::AsicRevision::Vega20;
+ break;
case 1000:
- revision = Pal::AsicRevision::Navi10Lite;
- break;
+ revision = Pal::AsicRevision::Navi10Lite;
+ break;
case 1010:
- revision = Pal::AsicRevision::Navi10;
- break;
+ revision = Pal::AsicRevision::Navi10;
+ break;
case 1011:
- revision = Pal::AsicRevision::Navi12;
- break;
+ revision = Pal::AsicRevision::Navi12;
+ break;
case 1012:
- revision = Pal::AsicRevision::Navi14;
- break;
+ revision = Pal::AsicRevision::Navi14;
+ break;
case 1030:
- ShouldNotReachHere();
- break;
- }
+ ShouldNotReachHere();
+ break;
+ }
- NullDevice* dev = new NullDevice();
- if (nullptr != dev) {
- if (!dev->create(revision, ipLevel, xNACKSupported)) {
- delete dev;
- }
- else {
- dev->registerDevice();
- }
+ NullDevice* dev = new NullDevice();
+ if (nullptr != dev) {
+ if (!dev->create(revision, ipLevel, xNACKSupported)) {
+ delete dev;
+ } else {
+ dev->registerDevice();
}
+ }
}
return true;
@@ -257,10 +254,10 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel,
if ((GPU_ENABLE_PAL == 1) && (ipLevel == Pal::GfxIpLevel::_None)) {
hwInfo_ = &DeviceInfo[static_cast(asicRevision)];
} else if (ipLevel >= Pal::GfxIpLevel::GfxIp9) {
- subtarget = (static_cast(asicRevision_) %
- static_cast(Pal::AsicRevision::Vega10))
- << 1 | xNACKSupported;
- hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget];
+ subtarget = (static_cast(asicRevision_) % static_cast(Pal::AsicRevision::Vega10))
+ << 1 |
+ xNACKSupported;
+ hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget];
} else {
return false;
@@ -271,8 +268,7 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel,
// Report 512MB for all offline devices
Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount];
- heaps[Pal::GpuHeapLocal].heapSize =
- heaps[Pal::GpuHeapLocal].physicalHeapSize = 512 * Mi;
+ heaps[Pal::GpuHeapLocal].heapSize = heaps[Pal::GpuHeapLocal].physicalHeapSize = 512 * Mi;
Pal::WorkStationCaps wscaps = {};
@@ -295,7 +291,7 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel,
info_.wavefrontWidth_ = settings().enableWave32Mode_ ? 32 : 64;
if (settings().useLightning_) {
-#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
+#if defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY)
// create compilation object with cache support
int gfxipMajor = hwInfo_->gfxipVersionLC_ / 100;
int gfxipMinor = hwInfo_->gfxipVersionLC_ / 10 % 10;
@@ -323,16 +319,16 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel,
cacheCompilation_.reset(compObj);
#endif
} else {
-#if defined(WITH_COMPILER_LIB)
+#if defined(WITH_COMPILER_LIB)
const char* library = getenv("HSA_COMPILER_LIBRARY");
- aclCompilerOptions opts = { sizeof(aclCompilerOptions_0_8),
- library,
- nullptr,
- nullptr,
- nullptr,
- nullptr,
- nullptr,
- AMD_OCL_SC_LIB };
+ aclCompilerOptions opts = {sizeof(aclCompilerOptions_0_8),
+ library,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ AMD_OCL_SC_LIB};
// Initialize the compiler handle
acl_error error;
compiler_ = aclCompilerInit(&opts, &error);
@@ -370,9 +366,9 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
info_.maxWorkItemDimensions_ = 3;
- info_.maxComputeUnits_ = settings().enableWgpMode_ ?
- palProp.gfxipProperties.shaderCore.numAvailableCus / 2 :
- palProp.gfxipProperties.shaderCore.numAvailableCus;
+ info_.maxComputeUnits_ = settings().enableWgpMode_
+ ? palProp.gfxipProperties.shaderCore.numAvailableCus / 2
+ : palProp.gfxipProperties.shaderCore.numAvailableCus;
info_.numberOfShaderEngines = palProp.gfxipProperties.shaderCore.numShaderEngines;
@@ -427,7 +423,8 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
if (GPU_ADD_HBCC_SIZE) {
localRAM = heaps[Pal::GpuHeapLocal].heapSize + heaps[Pal::GpuHeapInvisible].heapSize;
} else {
- localRAM = heaps[Pal::GpuHeapLocal].physicalHeapSize + heaps[Pal::GpuHeapInvisible].physicalHeapSize;
+ localRAM =
+ heaps[Pal::GpuHeapLocal].physicalHeapSize + heaps[Pal::GpuHeapInvisible].physicalHeapSize;
}
info_.globalMemSize_ = (static_cast(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
@@ -445,10 +442,10 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
// Find the largest heap form FB memory
if (GPU_ADD_HBCC_SIZE) {
info_.maxMemAllocSize_ = std::max(cl_ulong(heaps[Pal::GpuHeapLocal].heapSize),
- cl_ulong(heaps[Pal::GpuHeapInvisible].heapSize));
+ cl_ulong(heaps[Pal::GpuHeapInvisible].heapSize));
} else {
info_.maxMemAllocSize_ = std::max(cl_ulong(heaps[Pal::GpuHeapLocal].physicalHeapSize),
- cl_ulong(heaps[Pal::GpuHeapInvisible].physicalHeapSize));
+ cl_ulong(heaps[Pal::GpuHeapInvisible].physicalHeapSize));
}
#if defined(ATI_OS_WIN)
@@ -561,7 +558,7 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
::strcpy(info_.vendor_, "Advanced Micro Devices, Inc.");
::snprintf(info_.driverVersion_, sizeof(info_.driverVersion_) - 1, AMD_BUILD_STRING " (PAL%s)",
- settings().useLightning_ ? ",LC" : ",HSAIL");
+ settings().useLightning_ ? ",LC" : ",HSAIL");
info_.profile_ = "FULL_PROFILE";
if (settings().oclVersion_ >= OpenCL20) {
@@ -640,15 +637,16 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
info_.cuPerShaderArray_ = palProp.gfxipProperties.shaderCore.numCusPerShaderArray;
info_.simdWidth_ = hwInfo()->simdWidth_;
info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_;
- info_.wavefrontWidth_ = settings().enableWave32Mode_ ? 32:
- palProp.gfxipProperties.shaderCore.nativeWavefrontSize;
+ info_.wavefrontWidth_ =
+ settings().enableWave32Mode_ ? 32 : palProp.gfxipProperties.shaderCore.nativeWavefrontSize;
info_.availableSGPRs_ = palProp.gfxipProperties.shaderCore.numAvailableSgprs;
info_.globalMemChannelBanks_ = 4;
info_.globalMemChannelBankWidth_ = hwInfo()->memChannelBankWidth_;
info_.localMemSizePerCU_ = hwInfo()->localMemSizePerCU_;
info_.localMemBanks_ = hwInfo()->localMemBanks_;
- info_.gfxipVersion_ = settings().useLightning_ ? hwInfo()->gfxipVersionLC_ : hwInfo()->gfxipVersion_;
+ info_.gfxipVersion_ =
+ settings().useLightning_ ? hwInfo()->gfxipVersionLC_ : hwInfo()->gfxipVersion_;
info_.timeStampFrequency_ = 1000000;
info_.numAsyncQueues_ = numComputeRings;
@@ -661,7 +659,7 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
info_.pcieDeviceId_ = palProp.deviceId;
info_.pcieRevisionId_ = palProp.revisionId;
info_.maxThreadsPerCU_ = info_.wavefrontWidth_ * hwInfo()->simdPerCU_ *
- palProp.gfxipProperties.shaderCore.numWavefrontsPerSimd;
+ palProp.gfxipProperties.shaderCore.numWavefrontsPerSimd;
}
}
@@ -789,8 +787,7 @@ Device::Device()
globalScratchBuf_(nullptr),
srdManager_(nullptr),
resourceList_(nullptr),
- rgpCaptureMgr_(nullptr)
- {}
+ rgpCaptureMgr_(nullptr) {}
Device::~Device() {
// remove the HW debug manager
@@ -803,8 +800,8 @@ Device::~Device() {
}
if (glb_ctx_ != nullptr) {
- glb_ctx_->release();
- glb_ctx_ = nullptr;
+ glb_ctx_->release();
+ glb_ctx_ = nullptr;
}
delete srdManager_;
@@ -878,19 +875,21 @@ bool Device::create(Pal::IDevice* device) {
ipLevel_ = properties().gfxLevel;
asicRevision_ = properties().revision;
- // XNACK flag should be set for PageMigration | IOMMUv2 Support
- uint isXNACKSupported = static_cast(properties_.gpuMemoryProperties.flags.pageMigrationEnabled
- || properties_.gpuMemoryProperties.flags.iommuv2Support);
+ // XNACK flag should be set for PageMigration | IOMMUv2 Support
+ uint isXNACKSupported =
+ static_cast(properties_.gpuMemoryProperties.flags.pageMigrationEnabled ||
+ properties_.gpuMemoryProperties.flags.iommuv2Support);
uint subtarget = isXNACKSupported;
// Update HW info for the device
if ((GPU_ENABLE_PAL == 1) && (properties().revision <= Pal::AsicRevision::Polaris12)) {
hwInfo_ = &DeviceInfo[static_cast(properties().revision)];
} else if (ipLevel_ >= Pal::GfxIpLevel::GfxIp9) {
- // For compiler sub targets
- subtarget = (static_cast(asicRevision_) % static_cast(Pal::AsicRevision::Vega10)) << 1 |
- subtarget;
- hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget];
+ // For compiler sub targets
+ subtarget = (static_cast(asicRevision_) % static_cast(Pal::AsicRevision::Vega10))
+ << 1 |
+ subtarget;
+ hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget];
} else {
return false;
}
@@ -995,7 +994,7 @@ bool Device::create(Pal::IDevice* device) {
}
if (settings().useLightning_) {
-#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
+#if defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY)
// create compilation object with cache support
int gfxipMajor = hwInfo()->gfxipVersionLC_ / 100;
int gfxipMinor = hwInfo()->gfxipVersionLC_ / 10 % 10;
@@ -1013,7 +1012,7 @@ bool Device::create(Pal::IDevice* device) {
}
amd::CacheCompilation* compObj = new amd::CacheCompilation(
- cacheTarget.str(), "_pal", OCL_CODE_CACHE_ENABLE, OCL_CODE_CACHE_RESET);
+ cacheTarget.str(), "_pal", OCL_CODE_CACHE_ENABLE, OCL_CODE_CACHE_RESET);
if (!compObj) {
LogError("Unable to create cache compilation object!");
return false;
@@ -1021,18 +1020,17 @@ bool Device::create(Pal::IDevice* device) {
cacheCompilation_.reset(compObj);
#endif
- }
- else {
-#if defined(WITH_COMPILER_LIB)
+ } else {
+#if defined(WITH_COMPILER_LIB)
const char* library = getenv("HSA_COMPILER_LIBRARY");
- aclCompilerOptions opts = { sizeof(aclCompilerOptions_0_8),
- library,
- nullptr,
- nullptr,
- nullptr,
- nullptr,
- nullptr,
- AMD_OCL_SC_LIB };
+ aclCompilerOptions opts = {sizeof(aclCompilerOptions_0_8),
+ library,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ AMD_OCL_SC_LIB};
// Initialize the compiler handle
acl_error error;
compiler_ = aclCompilerInit(&opts, &error);
@@ -1056,7 +1054,7 @@ bool Device::create(Pal::IDevice* device) {
if ((glb_ctx_ == nullptr) && (gNumDevices > 1) && (device == gDeviceList[gNumDevices - 1])) {
std::vector devices;
- uint32_t numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, true);
+ uint32_t numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, true);
// Add all PAL devices
for (uint32_t i = gStartDevice; i < numDevices; ++i) {
devices.push_back(amd::Device::devices()[i]);
@@ -1070,8 +1068,8 @@ bool Device::create(Pal::IDevice* device) {
if (glb_ctx_ == nullptr) {
return false;
}
- amd::Buffer* buf =
- new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
+ amd::Buffer* buf =
+ new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
if ((buf != nullptr) && buf->create()) {
p2p_stage_ = buf;
} else {
@@ -1086,11 +1084,8 @@ bool Device::create(Pal::IDevice* device) {
// =====================================================================================================================
// Master function that handles developer callbacks from PAL.
-void PAL_STDCALL Device::PalDeveloperCallback(
- void* pPrivateData,
- const Pal::uint32 deviceIndex,
- Pal::Developer::CallbackType type,
- void* pCbData) {
+void PAL_STDCALL Device::PalDeveloperCallback(void* pPrivateData, const Pal::uint32 deviceIndex,
+ Pal::Developer::CallbackType type, void* pCbData) {
Device* device = static_cast(pPrivateData);
const auto& barrier = *static_cast(pCbData);
@@ -1099,7 +1094,7 @@ void PAL_STDCALL Device::PalDeveloperCallback(
VirtualGPU* gpu = nullptr;
if (pBarrierData->pCmdBuffer != nullptr) {
// Find which queue the current command buffer belongs
- for (const auto& it: device->vgpus()) {
+ for (const auto& it : device->vgpus()) {
if (it->isActiveCmd(pBarrierData->pCmdBuffer)) {
gpu = it;
break;
@@ -1112,18 +1107,18 @@ void PAL_STDCALL Device::PalDeveloperCallback(
}
switch (type) {
- case Pal::Developer::CallbackType::BarrierBegin:
- device->rgpCaptureMgr()->WriteBarrierStartMarker(gpu, barrier);
- break;
- case Pal::Developer::CallbackType::BarrierEnd:
- device->rgpCaptureMgr()->WriteBarrierEndMarker(gpu, barrier);
- break;
- case Pal::Developer::CallbackType::ImageBarrier:
- assert(false);
- break;
- case Pal::Developer::CallbackType::DrawDispatch:
+ case Pal::Developer::CallbackType::BarrierBegin:
+ device->rgpCaptureMgr()->WriteBarrierStartMarker(gpu, barrier);
break;
- default:
+ case Pal::Developer::CallbackType::BarrierEnd:
+ device->rgpCaptureMgr()->WriteBarrierEndMarker(gpu, barrier);
+ break;
+ case Pal::Developer::CallbackType::ImageBarrier:
+ assert(false);
+ break;
+ case Pal::Developer::CallbackType::DrawDispatch:
+ break;
+ default:
break;
}
}
@@ -1136,15 +1131,16 @@ bool Device::initializeHeapResources() {
// Request all compute engines
finalizeInfo.requestedEngineCounts[Pal::EngineTypeCompute].engines =
((1 << numComputeEngines_) - 1);
- for (const auto& it: exclusiveComputeEnginesId_) {
+ for (const auto& it : exclusiveComputeEnginesId_) {
// Request real time compute engines
- finalizeInfo.requestedEngineCounts[Pal::EngineTypeExclusiveCompute].engines |= (1 << it.second);
+ finalizeInfo.requestedEngineCounts[Pal::EngineTypeExclusiveCompute].engines |=
+ (1 << it.second);
}
// Request all SDMA engines
finalizeInfo.requestedEngineCounts[Pal::EngineTypeDma].engines = (1 << numDmaEngines_) - 1;
if (iDev()->Finalize(finalizeInfo) != Pal::Result::Success) {
- return false;
+ return false;
}
heapInitComplete_ = true;
@@ -1201,7 +1197,8 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
if (queue != nullptr) {
profiling = queue->properties().test(CL_QUEUE_PROFILING_ENABLE);
if (queue->asHostQueue() != nullptr) {
- bool interopQueue = (0 != (queue->context().info().flags_ &
+ bool interopQueue = (0 !=
+ (queue->context().info().flags_ &
(amd::Context::GLDeviceKhr | amd::Context::D3D10DeviceKhr |
amd::Context::D3D11DeviceKhr)));
rtCUs = queue->rtCUs();
@@ -1233,8 +1230,7 @@ device::Program* Device::createProgram(amd::option::Options* options) {
device::Program* program;
if (settings().useLightning_) {
program = new LightningProgram(*this);
- }
- else {
+ } else {
program = new HSAILProgram(*this);
}
if (program == nullptr) {
@@ -1249,9 +1245,7 @@ typedef std::unordered_map requestedDevices_t;
//! Parses the requested list of devices to be exposed to the user.
static void parseRequestedDeviceList(const char* requestedDeviceList,
- requestedDevices_t& requestedDevices,
- uint32_t numDevices) {
-
+ requestedDevices_t& requestedDevices, uint32_t numDevices) {
char* pch = strtok(const_cast(requestedDeviceList), ",");
while (pch != nullptr) {
bool deviceIdValid = true;
@@ -1263,8 +1257,7 @@ static void parseRequestedDeviceList(const char* requestedDeviceList,
break;
}
}
- if (currentDeviceIndex < 0 ||
- static_cast(currentDeviceIndex) >= numDevices) {
+ if (currentDeviceIndex < 0 || static_cast(currentDeviceIndex) >= numDevices) {
deviceIdValid = false;
}
// Get next token.
@@ -1310,9 +1303,9 @@ bool Device::init() {
// Count up all the devices in the system.
platform_->EnumerateDevices(&gNumDevices, &gDeviceList[0]);
- const char* requestedDeviceList = amd::IS_HIP ? ((HIP_VISIBLE_DEVICES[0] != '\0') ?
- HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES)
- : GPU_DEVICE_ORDINAL;
+ const char* requestedDeviceList = amd::IS_HIP
+ ? ((HIP_VISIBLE_DEVICES[0] != '\0') ? HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES)
+ : GPU_DEVICE_ORDINAL;
if (requestedDeviceList[0] != '\0') {
useDeviceList = true;
@@ -1465,8 +1458,8 @@ pal::Memory* Device::createBuffer(amd::Memory& owner, bool directAccess) const {
if (result) {
// Disallow permanent map for Win7 only, since OS will move buffer to sysmem
if (IS_LINUX ||
- // Or Win10
- (properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs == false)) {
+ // Or Win10
+ (properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs == false)) {
void* address = gpuMemory->map(nullptr);
CondLog(address == nullptr, "PAL failed lock of persistent memory!");
}
@@ -1697,9 +1690,9 @@ device::Memory* Device::createMemory(amd::Memory& owner) const {
(memory->memoryType() != Resource::ExternalPhysical) &&
((owner.getHostMem() != nullptr) ||
((nullptr != owner.parent()) && (owner.getHostMem() != nullptr)))) {
- bool ok = memory->pinSystemMemory(owner.getHostMem(), (owner.getHostMemRef()->size())
- ? owner.getHostMemRef()->size()
- : owner.getSize());
+ bool ok = memory->pinSystemMemory(
+ owner.getHostMem(),
+ (owner.getHostMemRef()->size()) ? owner.getHostMemRef()->size() : owner.getSize());
//! \note: Ignore the pinning result for now
}
@@ -1720,9 +1713,9 @@ bool Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler)
device::Memory* Device::createView(amd::Memory& owner, const device::Memory& parent) const {
assert((owner.asImage() != nullptr) && "View supports images only");
const amd::Image& image = *owner.asImage();
- pal::Memory* gpuImage = new pal::Image(
- *this, owner, image.getWidth(), image.getHeight(), image.getDepth(),
- image.getImageFormat(), image.getType(), image.getMipLevels());
+ pal::Memory* gpuImage =
+ new pal::Image(*this, owner, image.getWidth(), image.getHeight(), image.getDepth(),
+ image.getImageFormat(), image.getType(), image.getMipLevels());
// Create resource
if (nullptr != gpuImage) {
@@ -1827,19 +1820,18 @@ bool Device::globalFreeMemory(size_t* freeMemory) const {
Pal::gpusize invisible = allocedMem[Pal::GpuHeapInvisible] - resourceCache().lclCacheSize();
// Fill free memory info
- freeMemory[TotalFreeMemory] = static_cast((info().globalMemSize_ -
- (local + invisible)) / Ki);
+ freeMemory[TotalFreeMemory] =
+ static_cast((info().globalMemSize_ - (local + invisible)) / Ki);
if (invisible >= heaps_[Pal::GpuHeapInvisible].heapSize) {
invisible = 0;
- }
- else {
+ } else {
invisible = heaps_[Pal::GpuHeapInvisible].heapSize - invisible;
}
freeMemory[LargestFreeBlock] = static_cast(invisible) / Ki;
if (settings().apuSystem_) {
Pal::gpusize sysMem = allocedMem[Pal::GpuHeapGartCacheable] + allocedMem[Pal::GpuHeapGartUswc] -
- resourceCache().cacheSize() + resourceCache().lclCacheSize();
+ resourceCache().cacheSize() + resourceCache().lclCacheSize();
sysMem /= Ki;
if (sysMem >= freeMemory[TotalFreeMemory]) {
freeMemory[TotalFreeMemory] = 0;
@@ -1945,8 +1937,7 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu) {
amd::ScopedLock lk(scratchAlloc_);
uint sb = vgpu->hwRing();
static const uint WaveSizeLimit = ((1 << 21) - 256);
- const uint threadSizeLimit =
- WaveSizeLimit / info().wavefrontWidth_;
+ const uint threadSizeLimit = WaveSizeLimit / info().wavefrontWidth_;
if (regNum > threadSizeLimit) {
LogError("Requested private memory is bigger than HW supports!");
regNum = threadSizeLimit;
@@ -1968,9 +1959,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu) {
// Calculate the size of the scratch buffer for a queue
uint32_t numTotalCUs = info().maxComputeUnits_;
uint32_t numMaxWaves = settings().numScratchWavesPerCu_ * numTotalCUs;
- scratchBuf->size_ =
- static_cast(info().wavefrontWidth_) *
- scratchBuf->regNum_ * numMaxWaves * sizeof(uint32_t);
+ scratchBuf->size_ = static_cast(info().wavefrontWidth_) * scratchBuf->regNum_ *
+ numMaxWaves * sizeof(uint32_t);
scratchBuf->size_ = std::min(scratchBuf->size_, info().maxMemAllocSize_);
scratchBuf->size_ = std::min(scratchBuf->size_, uint64_t(3 * Gi));
// Note: Generic address space setup in HW requires 64KB alignment for scratch
@@ -2280,7 +2270,7 @@ void Device::SrdManager::freeSrdSlot(uint64_t addr) {
void Device::updateAllocedMemory(Pal::GpuHeap heap, Pal::gpusize size, bool free) const {
if (free) {
allocedMem[heap] -= size;
- } else {
+ } else {
allocedMem[heap] += size;
}
}
@@ -2337,12 +2327,18 @@ cl_int Device::hwDebugManagerInit(amd::Context* context, uintptr_t messageStorag
return status;
}
-bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
+bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
+ cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
bool result = false;
Pal::SetClockModeInput setClockMode = {};
- Pal::DeviceClockMode palClockMode = static_cast(setClockModeInput.clock_mode);
+ Pal::DeviceClockMode palClockMode =
+ static_cast(setClockModeInput.clock_mode);
setClockMode.clockMode = palClockMode;
- result = (Pal::Result::Success == (iDev()->SetClockMode(setClockMode, reinterpret_cast(pSetClockModeOutput))))? true : false;
+ result = (Pal::Result::Success ==
+ (iDev()->SetClockMode(setClockMode,
+ reinterpret_cast(pSetClockModeOutput))))
+ ? true
+ : false;
return result;
}
diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
index 4528954dc2..5420c8202a 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
@@ -49,7 +49,7 @@ class NullDevice : public amd::Device {
bool create(Pal::AsicRevision asicRevision, //!< GPU ASIC revision
Pal::GfxIpLevel ipLevel, //!< GPU ip level
uint xNACKSupported = 0 //!< GPU xNACKSupported
- );
+ );
//! Instantiate a new virtual device
virtual device::VirtualDevice* createVirtualDevice(amd::CommandQueue* queue = NULL) {
@@ -111,11 +111,14 @@ class NullDevice : public amd::Device {
virtual void svmFree(void* ptr) const { return; }
void* Alloc(const Util::AllocInfo& allocInfo) { return allocator_.Alloc(allocInfo); }
- void Free(const Util::FreeInfo& freeInfo) { allocator_.Free(freeInfo); }
- virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { return true; }
+ void Free(const Util::FreeInfo& freeInfo) { allocator_.Free(freeInfo); }
+ virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
+ cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
+ return true;
+ }
protected:
- static Util::GenericAllocator allocator_; //!< Generic memory allocator in PAL
+ static Util::GenericAllocator allocator_; //!< Generic memory allocator in PAL
Pal::AsicRevision asicRevision_; //!< ASIC revision
Pal::GfxIpLevel ipLevel_; //!< Device IP level
@@ -127,7 +130,7 @@ class NullDevice : public amd::Device {
size_t maxTextureSize, //!< Maximum texture size supported in HW
uint numComputeRings, //!< Number of compute rings
uint numExclusiveComputeRings //!< Number of exclusive compute rings
- );
+ );
};
//! Forward declarations
@@ -148,26 +151,22 @@ class ThreadTrace;
#ifndef CL_FILTER_NONE
#define CL_FILTER_NONE 0x1142
#endif
-enum class ExclusiveQueueType : uint32_t {
- RealTime0 = 0,
- RealTime1,
- Medium
-};
+enum class ExclusiveQueueType : uint32_t { RealTime0 = 0, RealTime1, Medium };
class Sampler : public device::Sampler {
public:
//! Constructor
- Sampler(const Device& dev) : dev_(dev) {}
+ Sampler(const Device& dev) : dev_(dev) {}
//! Default destructor for the device memory object
virtual ~Sampler();
//! Creates a device sampler from the OCL sampler state
bool create(uint32_t oclSamplerState //!< OCL sampler state
- );
+ );
//! Creates a device sampler from the OCL sampler state
bool create(const amd::Sampler& owner //!< AMD sampler object
- );
+ );
private:
//! Disable default copy constructor
@@ -216,7 +215,7 @@ class Device : public NullDevice {
//! Releases transfer buffer
void release(VirtualGPU& gpu, //!< Virual GPU object used with the buffer
Memory& buffer //!< Transfer buffer for release
- );
+ );
//! Returns the buffer's size for transfer
size_t bufSize() const { return bufSize_; }
@@ -308,7 +307,7 @@ class Device : public NullDevice {
//! Initialise a device (i.e. all parts of the constructor that could
//! potentially fail)
bool create(Pal::IDevice* device //!< PAL device interface object
- );
+ );
//! Destructor for the physical GPU device
virtual ~Device();
@@ -346,7 +345,8 @@ class Device : public NullDevice {
virtual bool validateKernel(const amd::Kernel& kernel, //!< AMD kernel object
const device::VirtualDevice* vdev);
- virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput);
+ virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
+ cl_set_device_clock_mode_output_amd* pSetClockModeOutput);
//! Retrieves information about free memory on a GPU device
virtual bool globalFreeMemory(size_t* freeMemory) const;
@@ -398,9 +398,10 @@ class Device : public NullDevice {
//! Returns the number of available compute rings
uint numExclusiveComputeEngines() const { return exclusiveComputeEnginesId_.size(); }
- //! Returns the map of available exclusive compute rings with the engine index
- const std::map& exclusiveComputeEnginesId() const
- { return exclusiveComputeEnginesId_; }
+ //! Returns the map of available exclusive compute rings with the engine index
+ const std::map& exclusiveComputeEnginesId() const {
+ return exclusiveComputeEnginesId_;
+ }
//! Returns the number of available DMA engines
uint numDMAEngines() const { return numDmaEngines_; }
@@ -526,11 +527,8 @@ class Device : public NullDevice {
}
private:
- static void PAL_STDCALL PalDeveloperCallback(
- void* pPrivateData,
- const Pal::uint32 deviceIndex,
- Pal::Developer::CallbackType type,
- void* pCbData);
+ static void PAL_STDCALL PalDeveloperCallback(void* pPrivateData, const Pal::uint32 deviceIndex,
+ Pal::Developer::CallbackType type, void* pCbData);
//! Disable copy constructor
Device(const Device&);
@@ -554,36 +552,37 @@ class Device : public NullDevice {
//! Allocates/reallocates the scratch buffer, according to the usage
bool allocScratch(uint regNum, //!< Number of the scratch registers
const VirtualGPU* vgpu //!< Virtual GPU for the allocation
- );
+ );
//! Interop for D3D devices
bool associateD3D11Device(void* d3d11Device //!< void* is of type ID3D11Device*
- );
+ );
bool associateD3D10Device(void* d3d10Device //!< void* is of type ID3D10Device*
- );
+ );
bool associateD3D9Device(void* d3d9Device //!< void* is of type IDirect3DDevice9*
- );
+ );
//! Interop for GL device
bool glAssociate(void* GLplatformContext, void* GLdeviceContext) const;
bool glDissociate(void* GLplatformContext, void* GLdeviceContext) const;
- static char* platformObj_; //!< Memory allocated for PAL platform object
- static Pal::IPlatform* platform_; //!< Pointer to the PAL platform object
+ static char* platformObj_; //!< Memory allocated for PAL platform object
+ static Pal::IPlatform* platform_; //!< Pointer to the PAL platform object
- amd::Context* context_; //!< A dummy context for internal allocations
- mutable amd::Monitor lockAsyncOps_; //!< Lock to serialise all async ops on this device
+ amd::Context* context_; //!< A dummy context for internal allocations
+ mutable amd::Monitor lockAsyncOps_; //!< Lock to serialise all async ops on this device
//! Lock to serialise all async ops on initialization heap operation
- mutable amd::Monitor lockForInitHeap_;
- mutable amd::Monitor lockPAL_; //!< Lock to serialise PAL access
- mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access
- mutable amd::Monitor scratchAlloc_; //!< Lock to serialise scratch allocation
- mutable amd::Monitor mapCacheOps_; //!< Lock to serialise cache for the map resources
- mutable amd::Monitor lockResourceOps_; //!< Lock to serialise resource access
- XferBuffers* xferRead_; //!< Transfer buffers read
- std::vector* mapCache_; //!< Map cache info structure
- ResourceCache* resourceCache_; //!< Resource cache
- uint numComputeEngines_; //!< The number of available compute engines
- std::map exclusiveComputeEnginesId_;//!< The number of available compute engines
+ mutable amd::Monitor lockForInitHeap_;
+ mutable amd::Monitor lockPAL_; //!< Lock to serialise PAL access
+ mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access
+ mutable amd::Monitor scratchAlloc_; //!< Lock to serialise scratch allocation
+ mutable amd::Monitor mapCacheOps_; //!< Lock to serialise cache for the map resources
+ mutable amd::Monitor lockResourceOps_; //!< Lock to serialise resource access
+ XferBuffers* xferRead_; //!< Transfer buffers read
+ std::vector* mapCache_; //!< Map cache info structure
+ ResourceCache* resourceCache_; //!< Resource cache
+ uint numComputeEngines_; //!< The number of available compute engines
+ std::map
+ exclusiveComputeEnginesId_; //!< Exclusive compute engines mapped to their engine index
uint numDmaEngines_; //!< The number of available compute engines
bool heapInitComplete_; //!< Keep track of initialization status of heap resources
VirtualGPU* xferQueue_; //!< Transfer queue
@@ -594,10 +593,13 @@ class Device : public NullDevice {
mutable bool freeCPUMem_; //!< flag to mark GPU free SVM CPU mem
Pal::DeviceProperties properties_; //!< PAL device properties
Pal::IDevice* device_; //!< PAL device object
- mutable std::atomic allocedMem[Pal::GpuHeap::GpuHeapCount]; //!< Free memory counter
- std::unordered_set* resourceList_; //!< Active resource list
- RgpCaptureMgr* rgpCaptureMgr_; //!< RGP capture manager
- Pal::GpuMemoryHeapProperties heaps_[Pal::GpuHeapCount]; //!< Information about heaps, returned from PAL
+ mutable std::atomic
+ allocedMem[Pal::GpuHeap::GpuHeapCount]; //!< Free memory counter
+ std::unordered_set* resourceList_; //!< Active resource list
+ RgpCaptureMgr* rgpCaptureMgr_; //!< RGP capture manager
+ Pal::GpuMemoryHeapProperties
+ heaps_[Pal::GpuHeapCount]; //!< Information about heaps, returned from PAL
};
-/*@}*/} // namespace pal
+/*@}*/
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/paldeviced3d10.cpp b/projects/clr/rocclr/runtime/device/pal/paldeviced3d10.cpp
index e7d31a9d86..202fca7ef6 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldeviced3d10.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldeviced3d10.cpp
@@ -3,19 +3,19 @@
#if defined(ATI_OS_LINUX)
namespace pal {
bool Device::associateD3D10Device(void* d3d10Device) { return false; }
-} // pal
+} // namespace pal
#else // !ATI_OS_WIN
#include
/**************************************************************************************************************
-* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
-* This means OCL client spec will need to change to include headers directly from the DXX perforce
-*tree.
-* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
-* without notification. So it is safe to use a local copy of the relevant DXX extension interface
-*classes.
-**************************************************************************************************************/
+ * Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
+ * This means OCL client spec will need to change to include headers directly from the DXX perforce
+ *tree.
+ * However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
+ * without notification. So it is safe to use a local copy of the relevant DXX extension interface
+ *classes.
+ **************************************************************************************************************/
#include "DxxOpenCLInteropExt.h"
namespace pal {
@@ -127,6 +127,6 @@ bool Device::associateD3D10Device(void* d3d10Device) {
return canInteroperate;
}
-} // pal
+} // namespace pal
#endif // !ATI_OS_WIN
diff --git a/projects/clr/rocclr/runtime/device/pal/paldeviced3d11.cpp b/projects/clr/rocclr/runtime/device/pal/paldeviced3d11.cpp
index 025b8ed9a5..00d852d80e 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldeviced3d11.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldeviced3d11.cpp
@@ -3,19 +3,19 @@
#if defined(ATI_OS_LINUX)
namespace pal {
bool Device::associateD3D11Device(void* d3d11Device) { return false; }
-}
+} // namespace pal
#else // !ATI_OS_LINUX
#include
/**************************************************************************************************************
-* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
-* This means OCL client spec will need to change to include headers directly from the DXX perforce
-*tree.
-* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
-* without notification. So it is safe to use a local copy of the relevant DXX extension interface
-*classes.
-**************************************************************************************************************/
+ * Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
+ * This means OCL client spec will need to change to include headers directly from the DXX perforce
+ *tree.
+ * However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
+ * without notification. So it is safe to use a local copy of the relevant DXX extension interface
+ *classes.
+ **************************************************************************************************************/
#include "DxxOpenCLInteropExt.h"
namespace pal {
@@ -128,6 +128,6 @@ bool Device::associateD3D11Device(void* d3d11Device) {
return canInteroperate;
}
-} // pal
+} // namespace pal
#endif // !ATI_OS_LINUX
diff --git a/projects/clr/rocclr/runtime/device/pal/paldeviced3d9.cpp b/projects/clr/rocclr/runtime/device/pal/paldeviced3d9.cpp
index a589d2abcf..cf2ee5303c 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldeviced3d9.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldeviced3d9.cpp
@@ -3,20 +3,20 @@
#if defined(ATI_OS_LINUX)
namespace pal {
bool Device::associateD3D9Device(void* d3dDevice) { return false; }
-}
+} // namespace pal
#else // !ATI_OS_LINUX
#include
#include
/**************************************************************************************************************
-* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
-* This means OCL client spec will need to change to include headers directly from the DXX perforce
-*tree.
-* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
-* without notification. So it is safe to use a local copy of the relevant DXX extension interface
-*classes.
-**************************************************************************************************************/
+ * Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
+ * This means OCL client spec will need to change to include headers directly from the DXX perforce
+ *tree.
+ * However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
+ * without notification. So it is safe to use a local copy of the relevant DXX extension interface
+ *classes.
+ **************************************************************************************************************/
#include "DxxOpenCLInteropExt.h"
namespace pal {
@@ -44,5 +44,5 @@ bool Device::associateD3D9Device(void* d3d9Device) {
return canInteroperate;
}
-} // pal
+} // namespace pal
#endif // !ATI_OS_WIN
diff --git a/projects/clr/rocclr/runtime/device/pal/paldevicegl.cpp b/projects/clr/rocclr/runtime/device/pal/paldevicegl.cpp
index ac209191ca..1d8e9df9e7 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldevicegl.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldevicegl.cpp
@@ -45,8 +45,8 @@ typedef struct _mesa_glinterop_device_info {
#ifdef ATI_OS_LINUX
typedef void* (*PFNGlxGetProcAddress)(const GLubyte* procName);
static PFNGlxGetProcAddress pfnGlxGetProcAddress = nullptr;
-typedef int(APIENTRYP PFNMesaGLInteropGLXQueryDeviceInfo)(
- Display* dpy, GLXContext context, mesa_glinterop_device_info* out);
+typedef int(APIENTRYP PFNMesaGLInteropGLXQueryDeviceInfo)(Display* dpy, GLXContext context,
+ mesa_glinterop_device_info* out);
static PFNMesaGLInteropGLXQueryDeviceInfo pfnMesaGLInteropGLXQueryDeviceInfo = nullptr;
static PFNGLXBEGINCLINTEROPAMD glXBeginCLInteropAMD = nullptr;
static PFNGLXENDCLINTEROPAMD glXEndCLInteropAMD = nullptr;
@@ -68,480 +68,579 @@ static PFNWGLGETCONTEXTGPUINFOAMD wglGetContextGPUInfoAMD = nullptr;
namespace pal {
//
-/// GSL Surface Formats as per defined in cmSurfFmtEnum enum in //depot/stg/ugl/drivers/ugl/src/include/cm_enum.h
+/// GSL Surface Formats as defined in the cmSurfFmtEnum enum in
+/// //depot/stg/ugl/drivers/ugl/src/include/cm_enum.h
//
typedef enum cmSurfFmtEnum {
- CM_SURF_FMT_NOOVERRIDE = -1,
- CM_SURF_FMT_LUMINANCE8, ///< Luminance, 8 bits per element packed as (@c LLLLLLLL)
- CM_SURF_FMT_LUMINANCE16, ///< Luminance, 16 bits per element packed as (@c LLLLLLLLLLLLLLLL)
- CM_SURF_FMT_LUMINANCE16F, ///< Luminance, 16 bits per element packed as (@c LLLLLLLLLLLLLLLL)
- CM_SURF_FMT_LUMINANCE32F, ///< Luminance, 32 bits per element packed as (@c LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL)
- CM_SURF_FMT_INTENSITY8, ///< Intensity, 8 bits per element packed as (@c IIIIIIII)
- CM_SURF_FMT_INTENSITY16, ///< Intensity, 16 bits per element packed as (@c IIIIIIIIIIIIIIII)
- CM_SURF_FMT_INTENSITY16F, ///< Intensity, 16 bits per element packed as (@c IIIIIIIIIIIIIIII)
- CM_SURF_FMT_INTENSITY32F, ///< Intensity, 32 bits per element packed as (@c IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII)
- CM_SURF_FMT_ALPHA8, ///< Alpha, 8 bits per element packed as (@c AAAAAAAA)
- CM_SURF_FMT_ALPHA16, ///< Alpha, 16 bits per element packed as (@c AAAAAAAAAAAAAAAA)
- CM_SURF_FMT_ALPHA16F, ///< Alpha, 16 bits per element packed as (@c AAAAAAAAAAAAAAAA)
- CM_SURF_FMT_ALPHA32F, ///< Alpha, 32 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA)
- CM_SURF_FMT_LUMINANCE8_ALPHA8, ///< Luminance Alpha, 16 bits per element packed as (@c AAAAAAAALLLLLLLL)
- CM_SURF_FMT_LUMINANCE16_ALPHA16, ///< Luminance Alpha, 32 bits per element packed as (@c AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL)
- CM_SURF_FMT_LUMINANCE16F_ALPHA16F, ///< Luminance Alpha, 32 bits per element packed as (@c AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL)
- CM_SURF_FMT_LUMINANCE32F_ALPHA32F, ///< Luminance Alpha, 64 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL)
- CM_SURF_FMT_B2_G3_R3, ///< RGB, 8 bits per element packed as (@c RRRGGGBB)
- CM_SURF_FMT_B5_G6_R5, ///< RGB, 16 bits per element packed as (@c RRRRRGGGGGGBBBBB)
- CM_SURF_FMT_BGRX4, ///< RGB, 16 bits per element packed as (@c XXXXRRRRGGGGBBBB)
- CM_SURF_FMT_BGR5_X1, ///< RGB, 16 bits per element packed as (@c XRRRRRGGGGGBBBBB)
- CM_SURF_FMT_BGRX8, ///< RGB, 32 bits per element packed as (@c XXXXXXXXRRRRRRRRGGGGGGGGBBBBBBBB) - XXX unused by current driver
- CM_SURF_FMT_BGR10_X2, ///< RGB, 32 bits per element packed as (@c XXRRRRRRRRRRGGGGGGGGGGBBBBBBBBBB)
- CM_SURF_FMT_BGRX16, ///< RGB, 64 bits per element packed as (@c XXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBB)
- CM_SURF_FMT_BGRX16F, ///< RGB, 64 bits per element packed as (@c XXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBB)
- CM_SURF_FMT_BGRX32F, ///< RGB, 128 bits per element packed as (@c XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB)
- CM_SURF_FMT_RGBX4, ///< RGB, 16 bits per element packed as (@c XXXXBBBBGGGGRRRR)
- CM_SURF_FMT_RGB5_X1, ///< RGB, 16 bits per element packed as (@c XBBBBBGGGGGRRRRR)
- CM_SURF_FMT_RGBX8, ///< RGB, 32 bits per element packed as (@c XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR)
- CM_SURF_FMT_RGB10_X2, ///< RGB, 32 bits per element packed as (@c XXBBBBBBBBBBGGGGGGGGGGRRRRRRRRRR)
- CM_SURF_FMT_RGBX16, ///< RGB, 64 bits per element packed as (@c XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
- CM_SURF_FMT_RGBX16F, ///< RGB, 64 bits per element packed as (@c XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
- CM_SURF_FMT_RGBX32F, ///< RGB, 128 bits per element packed as (@c XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR)
- CM_SURF_FMT_BGRA4, ///< RGBA, 16 bits per element packed as (@c AAAARRRRGGGGBBBB)
- CM_SURF_FMT_BGR5_A1, ///< RGBA, 16 bits per element packed as (@c ARRRRRGGGGGBBBBB)
- CM_SURF_FMT_BGRA8, ///< RGBA, 32 bits per element packed as (@c AAAAAAAARRRRRRRRGGGGGGGGBBBBBBBB)
- CM_SURF_FMT_BGR10_A2, ///< RGBA, 32 bits per element packed as (@c AARRRRRRRRRRGGGGGGGGGGBBBBBBBBBB)
- CM_SURF_FMT_BGRA16, ///< RGBA, 64 bits per element packed as (@c AAAAAAAAAAAAAAAARRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBB)
- CM_SURF_FMT_BGRA16F, ///< RGBA, 64 bits per element packed as (@c AAAAAAAAAAAAAAAARRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBB)
- CM_SURF_FMT_BGRA32F, ///< RGBA, 128 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAARRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB)
- CM_SURF_FMT_RGBA4, ///< RGBA, 16 bits per element packed as (@c AAAABBBBGGGGRRRR)
- CM_SURF_FMT_RGB5_A1, ///< RGBA, 16 bits per element packed as (@c ABBBBBGGGGGRRRRR)
- CM_SURF_FMT_RGBA8, ///< RGBA, 32 bits per element packed as (@c AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR)
- CM_SURF_FMT_RGB10_A2, ///< RGBA, 32 bits per element packed as (@c AABBBBBBBBBBGGGGGGGGGGRRRRRRRRRR)
- CM_SURF_FMT_RGBA16, ///< RGBA, 64 bits per element packed as (@c AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
- CM_SURF_FMT_RGBA16F, ///< RGBA, 64 bits per element packed as (@c AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
- CM_SURF_FMT_RGBA32I, ///< RGBA, 128 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR)
- CM_SURF_FMT_RGBA32F, ///< RGBA, 128 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR)
- CM_SURF_FMT_DUDV8, ///< DUDV 16 bits per element packed as (@c VVVVVVVVUUUUUUUU)
- CM_SURF_FMT_DXT1, ///< compressed, DXT1
- CM_SURF_FMT_DXT2_3, ///< compressed, DXT2_3
- CM_SURF_FMT_DXT4_5, ///< compressed, DXT4_5
- CM_SURF_FMT_ATI1N, ///< compressed, 1 component
- CM_SURF_FMT_ATI2N, ///< compressed, 2 component
- CM_SURF_FMT_DEPTH16, ///< depth, 16 bits per element packed as (@c DDDDDDDDDDDDDDDD)
- CM_SURF_FMT_DEPTH16F, ///< depth, 16 bits per element packed as (@c DDDDDDDDDDDDDDDD)
- CM_SURF_FMT_DEPTH24_X8, ///< depth, 32 bits per element packed as (@c XXXXXXXXDDDDDDDDDDDDDDDDDDDDDDDD)
- CM_SURF_FMT_DEPTH24F_X8, ///< depth, 32 bits per element packed as (@c SSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDD)
- CM_SURF_FMT_DEPTH24_STEN8, ///< depth + stencil, 32 bits per element packed as (@c SSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDD)
- CM_SURF_FMT_DEPTH24F_STEN8, ///< depth + stencil, 32 bits per element packed as (@c SSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDD)
- CM_SURF_FMT_DEPTH32F_X24_STEN8, ///< depth + stencil, 64 bits per element packed as (@c XXXXXXXXXXXXXXXXXXXXXXXXSSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD)
- CM_SURF_FMT_DEPTH32F, ///< depth, 32 bits per element packed as (@c DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD)
- CM_SURF_FMT_sR11_sG11_sB10, ///< RGB, 32 bits per element packed as (@c RRRRRRRRRRRGGGGGGGGGGGBBBBBBBBBB)
- CM_SURF_FMT_sU16, ///<
- CM_SURF_FMT_sUV16, ///<
- CM_SURF_FMT_sUVWQ16, ///<
- CM_SURF_FMT_RG16, ///< RG, 32 bits per element packed as (@c RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG)
- CM_SURF_FMT_RG16F, ///< RG, 32 bits per element packed as (@c RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG)
- CM_SURF_FMT_RG32F, ///< RG, 64 bits per element packed as (@c RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG)
- CM_SURF_FMT_ABGR4, ///< RGBA, 16 bits per element packed as (@c RRRRGGGGBBBBAAAA)
- CM_SURF_FMT_A1_BGR5, ///< RGBA, 16 bits per element packed as (@c RRRRRGGGGGBBBBBA)
- CM_SURF_FMT_ABGR8, ///< RGBA, 32 bits per element packed as (@c RRRRRRRRGGGGGGGGBBBBBBBBAAAAAAAA)
- CM_SURF_FMT_A2_BGR10, ///< RGBA, 32 bits per element packed as (@c RRRRRRRRRRGGGGGGGGGGBBBBBBBBBBAA)
- CM_SURF_FMT_ABGR16, ///< RGBA, 64 bits per element packed as (@c RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBAAAAAAAAAAAAAAAA)
- CM_SURF_FMT_ABGR16F, ///< RGBA, 64 bits per element packed as (@c RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBAAAAAAAAAAAAAAAA)
- CM_SURF_FMT_ABGR32F, ///< RGBA, 128 bits per element packed as (@c RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA)
- CM_SURF_FMT_DXT1A,
- CM_SURF_FMT_sRGB10_A2, ///< RGBA, 32 bits per element packed as signed (@c AABBBBBBBBBBGGGGGGGGGGRRRRRRRRRR)
- CM_SURF_FMT_sR8, ///< R, 8 bits per element packed as signed (@c RRRRRRRR)
- CM_SURF_FMT_sRG8, ///< RG, 16 bits per element packed as signed (@c RRRRRRRRGGGGGGGG)
- CM_SURF_FMT_sR32I, ///< R, 32 bits per element packed as signed (@c RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR)
- CM_SURF_FMT_sRG32I, ///< RG, 64 bits per element packed as signed (@c RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG)
- CM_SURF_FMT_sRGBA32I, ///< RGBA, 128 bits per element packed as signed (@c RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA)
- CM_SURF_FMT_R32I, ///< R, 32 bits per element packed as (@c RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR)
- CM_SURF_FMT_RG32I, ///< RG, 64 bits per element packed as (@c RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG)
- CM_SURF_FMT_RG8, ///< RG8, 16 bits per element packed as (@c RRRRRRRRGGGGGGGG)
- CM_SURF_FMT_sRGBA8, ///< RGBA8, 32 bits per element packed as signed (@c RRRRRRRRGGGGGGGGBBBBBBBBAAAAAAAA)
- CM_SURF_FMT_R11F_G11F_B10F, ///< RGB, 32 bits per element packed as (@c BBBBBBBBBBGGGGGGGGGGGRRRRRRRRRRR)
- CM_SURF_FMT_RGB9_E5, ///< RGB, 32 bits per element packed as (@c EEEEEBBBBBBBBBGGGGGGGGGRRRRRRRRR)
- CM_SURF_FMT_LUMINANCE_LATC1, ///< compressed LATC1
- CM_SURF_FMT_SIGNED_LUMINANCE_LATC1, ///< compressed signed LATC1
- CM_SURF_FMT_LUMINANCE_ALPHA_LATC2, ///< compressed LATC2
- CM_SURF_FMT_SIGNED_LUMINANCE_ALPHA_LATC2, ///< compressed signed LATC2
- CM_SURF_FMT_RED_RGTC1, ///< compressed RGTC1
- CM_SURF_FMT_SIGNED_RED_RGTC1, ///< compressed signed RGTC1
- CM_SURF_FMT_RED_GREEN_RGTC2, ///< compressed RGTC2
- CM_SURF_FMT_SIGNED_RED_GREEN_RGTC2, ///< compressed signed RGTC2
- CM_SURF_FMT_R8, ///< R, 8 bits per element packed (@c RRRRRRRR)
- CM_SURF_FMT_R16, ///< R, 16 bits per element packed (@c RRRRRRRRRRRRRRRR)
- CM_SURF_FMT_R16F, ///< R, 16 bits per element packed (@c RRRRRRRRRRRRRRRR)
- CM_SURF_FMT_R32F, ///< R, 32 bits per element packed (@c RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR)
- CM_SURF_FMT_R8I, ///< R, 8 bits per element packed (@c RRRRRRRR)
- CM_SURF_FMT_sR8I, ///< R, 8 bits per element packed as signed (@c RRRRRRRR)
- CM_SURF_FMT_RG8I, ///< RG, 16 bits per element packed (@c RRRRRRRRGGGGGGGG)
- CM_SURF_FMT_sRG8I, ///< RG, 16 bits per element packed as signed (@c RRRRRRRRGGGGGGGG)
- CM_SURF_FMT_R16I, ///< R, 16 bits per element packed (@c RRRRRRRRRRRRRRRR)
- CM_SURF_FMT_sR16I, ///< R, 16 bits per element packed as signed (@c RRRRRRRRRRRRRRRR)
- CM_SURF_FMT_RG16I, ///< RG, 32 bits per element packed (@c RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG)
- CM_SURF_FMT_sRG16I, ///< RG, 32 bits per element packed as signed (@c RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG)
- CM_SURF_FMT_RGBA32UI, ///< RGBA, 128 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAARRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB)
- CM_SURF_FMT_RGBX32UI, ///< RGBX, 128 bits per element packed as(@c XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB)
- CM_SURF_FMT_ALPHA32UI, ///< Alpha, 32 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA)
- CM_SURF_FMT_INTENSITY32UI, ///< Intensity, 32 bits per element packed as (@c IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII)
- CM_SURF_FMT_LUMINANCE32UI, ///< Luminance, 32 bits per element packed as (@c LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL)
- CM_SURF_FMT_LUMINANCE_ALPHA32UI, ///< Luminance Alpha, 64 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL)
- CM_SURF_FMT_RGBA16UI, ///< RGBA, 64 bits per element packed as (@c AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
- CM_SURF_FMT_RGBX16UI, ///< RGB, 64 bits per element packed as (@c XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
- CM_SURF_FMT_ALPHA16UI, ///< Alpha, 16 bits per element packed as (@c AAAAAAAAAAAAAAAA)
- CM_SURF_FMT_INTENSITY16UI, ///< Intensity, 16 bits per element packed as (@c IIIIIIIIIIIIIIII)
- CM_SURF_FMT_LUMINANCE16UI, ///< Luminance, 16 bits per element packed as (@c LLLLLLLLLLLLLLLL)
- CM_SURF_FMT_LUMINANCE_ALPHA16UI, ///< Luminance Alpha, 32 bits per element packed as (@c AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL)
- CM_SURF_FMT_RGBA8UI, ///< RGBA, 32 bits per element packed as (@c AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR)
- CM_SURF_FMT_RGBX8UI, ///< RGB, 32 bits per element packed as (@c XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR)
- CM_SURF_FMT_ALPHA8UI, ///< Alpha, 8 bits per element packed as (@c AAAAAAAA)
- CM_SURF_FMT_INTENSITY8UI, ///< Intensity, 8 bits per element packed as (@c IIIIIIII)
- CM_SURF_FMT_LUMINANCE8UI, ///< Luminance, 8 bits per element packed as (@c LLLLLLLL)
- CM_SURF_FMT_LUMINANCE_ALPHA8UI, ///< Luminance Alpha, 32 bits per element packed as (@c AAAAAAAALLLLLLLL)
- CM_SURF_FMT_sRGBX32I, ///< RGBX, 128 bits per element packed as(@c XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB)
- CM_SURF_FMT_sALPHA32I, ///< Alpha, 32 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA)
- CM_SURF_FMT_sINTENSITY32I, ///< Intensity, 32 bits per element packed as (@c IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII)
- CM_SURF_FMT_sLUMINANCE32I, ///< Luminance, 32 bits per element packed as (@c LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL)
- CM_SURF_FMT_sLUMINANCE_ALPHA32I, ///< Luminance Alpha, 64 bits per element packed as (@c AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL)
- CM_SURF_FMT_sRGBA16I, ///< RGBA, 64 bits per element packed as (@c AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
- CM_SURF_FMT_sRGBX16I, ///< RGB, 64 bits per element packed as (@c XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
- CM_SURF_FMT_sALPHA16I, ///< Alpha, 16 bits per element packed as (@c AAAAAAAAAAAAAAAA)
- CM_SURF_FMT_sINTENSITY16I, ///< Intensity, 16 bits per element packed as (@c IIIIIIIIIIIIIIII)
- CM_SURF_FMT_sLUMINANCE16I, ///< Luminance, 16 bits per element packed as (@c LLLLLLLLLLLLLLLL)
- CM_SURF_FMT_sLUMINANCE_ALPHA16I, ///< Luminance Alpha, 32 bits per element packed as (@c AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL)
- CM_SURF_FMT_sRGBA8I, ///< RGBA, 32 bits per element packed as (@c AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR)
- CM_SURF_FMT_sRGBX8I, ///< RGB, 32 bits per element packed as (@c XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR)
- CM_SURF_FMT_sALPHA8I, ///< Alpha, 8 bits per element packed as (@c AAAAAAAA)
- CM_SURF_FMT_sINTENSITY8I, ///< Intensity, 8 bits per element packed as (@c IIIIIIII)
- CM_SURF_FMT_sLUMINANCE8I, ///< Luminance, 8 bits per element packed as (@c LLLLLLLL)
- CM_SURF_FMT_sLUMINANCE_ALPHA8I, ///< Alpha, 8 bits per element packed as (@c AAAAAAAA)
- CM_SURF_FMT_sDXT6, ///< compressed, CM_SURF_FMT_sDXT6
- CM_SURF_FMT_DXT6, ///< compressed, CM_SURF_FMT_DXT6
- CM_SURF_FMT_DXT7, ///< compressed, DXT7
- CM_SURF_FMT_LUMINANCE8_SNORM, ///< Luminance, 8 bits per element packed as signed (@c LLLLLLLL)
- CM_SURF_FMT_LUMINANCE16_SNORM, ///< Luminance, 16 bits per element packed as signed (@c LLLLLLLLLLLLLLLL)
- CM_SURF_FMT_INTENSITY8_SNORM, ///< Intensity, 8 bits per element packed as signed (@c IIIIIIII)
- CM_SURF_FMT_INTENSITY16_SNORM, ///< Intensity, 16 bits per element packed as signed (@c IIIIIIIIIIIIIIII)
- CM_SURF_FMT_ALPHA8_SNORM, ///< Alpha, 8 bits per element packed as signed (@c AAAAAAAA)
- CM_SURF_FMT_ALPHA16_SNORM, ///< Alpha, 16 bits per element packed as signed (@c AAAAAAAAAAAAAAAA)
- CM_SURF_FMT_LUMINANCE_ALPHA8_SNORM, ///< Luminance Alpha, 16 bits per element packed as signed (@c AAAAAAAALLLLLLLL)
- CM_SURF_FMT_LUMINANCE_ALPHA16_SNORM, ///< Luminance Alpha, 32 bits per element packed as signed (@c AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL)
- CM_SURF_FMT_R8_SNORM, ///< R, 8 bits per element packed as signed (@c RRRRRRRR)
- CM_SURF_FMT_R16_SNORM, ///< R, 16 bits per element packed as signed (@c RRRRRRRRRRRRRRRR)
- CM_SURF_FMT_RG8_SNORM, ///< RG8, 16 bits per element packed as signed (@c RRRRRRRRGGGGGGGG)
- CM_SURF_FMT_RG16_SNORM, ///< RG, 32 bits per element packed as signed (@c RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG)
- CM_SURF_FMT_RGBX8_SNORM, ///< RGB, 32 bits per element packed as signed (@c XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR)
- CM_SURF_FMT_RGBX16_SNORM, ///< RGB, 64 bits per element packed as signed (@c XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
- CM_SURF_FMT_RGBA8_SNORM, ///< RGBA, 32 bits per element packed as signed (@c AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR)
- CM_SURF_FMT_RGBA16_SNORM, ///< RGBA, 64 bits per element packed as signed (@c AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
- CM_SURF_FMT_RGB10_A2UI, ///< RGBA, 32 bits per element packed as (@c AABBBBBBBBBBGGGGGGGGGGRRRRRRRRRR)
- CM_SURF_FMT_RGB32F, ///< RGB, float, 96 bits per element packed as (@c BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR)
- CM_SURF_FMT_RGB32I, ///< RGB, unnormalized int, 96 bits per element packed as (@c BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR)
- CM_SURF_FMT_RGB32UI, ///< RGB, unnormalized uint, 96 bits per element packed as (@c BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR)
- CM_SURF_FMT_RGBX8_SRGB, ///< RGB, 32 bits per element packed as (@c XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR)
- CM_SURF_FMT_RGBA8_SRGB, ///< RGBA, 32 bits per element packed as (@c AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR)
- CM_SURF_FMT_DXT1_SRGB, ///< compressed, DXT1
- CM_SURF_FMT_DXT1A_SRGB, ///<
- CM_SURF_FMT_DXT2_3_SRGB, ///< compressed, DXT2_3
- CM_SURF_FMT_DXT4_5_SRGB, ///< compressed, DXT4_5
- CM_SURF_FMT_DXT7_SRGB, ///< compressed, DXT7
- CM_SURF_FMT_RGB8_ETC2, ///< ETC2 compressed, RGB8 in 64 bits
- CM_SURF_FMT_SRGB8_ETC2, ///< ETC2 compressed, SRGB8 in 64 bits
- CM_SURF_FMT_RGB8_PT_ALPHA1_ETC2, ///< ETC2 compressed, RGB8 in 64 bits
- CM_SURF_FMT_SRGB8_PT_ALPHA1_ETC2, ///< ETC2 compressed, sRGB8A1 in 64 bits
- CM_SURF_FMT_RGBA8_ETC2_EAC, ///< ETC2 compressed, RGBA8 in 128 bits
- CM_SURF_FMT_SRGB8_ALPHA8_ETC2_EAC, ///< ETC2 compressed, sRGBA8 in 128 bits
- CM_SURF_FMT_R11_EAC, ///< EAC compressed, R11 in 64 bits
- CM_SURF_FMT_SIGNED_R11_EAC, ///< EAC compressed, signed R11 in 64 bits
- CM_SURF_FMT_RG11_EAC, ///< EAC compressed, RG11 in 128 bits
- CM_SURF_FMT_SIGNED_RG11_EAC, ///< EAC compressed, signed RG11 in 128 bits
+ CM_SURF_FMT_NOOVERRIDE = -1,
+ CM_SURF_FMT_LUMINANCE8, ///< Luminance, 8 bits per element packed as (@c LLLLLLLL)
+ CM_SURF_FMT_LUMINANCE16, ///< Luminance, 16 bits per element packed as (@c LLLLLLLLLLLLLLLL)
+ CM_SURF_FMT_LUMINANCE16F, ///< Luminance, 16 bits per element packed as (@c LLLLLLLLLLLLLLLL)
+ CM_SURF_FMT_LUMINANCE32F, ///< Luminance, 32 bits per element packed as (@c
+ ///< LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL)
+ CM_SURF_FMT_INTENSITY8, ///< Intensity, 8 bits per element packed as (@c IIIIIIII)
+ CM_SURF_FMT_INTENSITY16, ///< Intensity, 16 bits per element packed as (@c IIIIIIIIIIIIIIII)
+ CM_SURF_FMT_INTENSITY16F, ///< Intensity, 16 bits per element packed as (@c IIIIIIIIIIIIIIII)
+ CM_SURF_FMT_INTENSITY32F, ///< Intensity, 32 bits per element packed as (@c
+ ///< IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII)
+ CM_SURF_FMT_ALPHA8, ///< Alpha, 8 bits per element packed as (@c AAAAAAAA)
+ CM_SURF_FMT_ALPHA16, ///< Alpha, 16 bits per element packed as (@c AAAAAAAAAAAAAAAA)
+ CM_SURF_FMT_ALPHA16F, ///< Alpha, 16 bits per element packed as (@c AAAAAAAAAAAAAAAA)
+ CM_SURF_FMT_ALPHA32F, ///< Alpha, 32 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA)
+ CM_SURF_FMT_LUMINANCE8_ALPHA8, ///< Luminance Alpha, 16 bits per element packed as (@c
+ ///< AAAAAAAALLLLLLLL)
+ CM_SURF_FMT_LUMINANCE16_ALPHA16, ///< Luminance Alpha, 32 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL)
+ CM_SURF_FMT_LUMINANCE16F_ALPHA16F, ///< Luminance Alpha, 32 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL)
+ CM_SURF_FMT_LUMINANCE32F_ALPHA32F, ///< Luminance Alpha, 64 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL)
+ CM_SURF_FMT_B2_G3_R3, ///< RGB, 8 bits per element packed as (@c RRRGGGBB)
+ CM_SURF_FMT_B5_G6_R5, ///< RGB, 16 bits per element packed as (@c RRRRRGGGGGGBBBBB)
+ CM_SURF_FMT_BGRX4, ///< RGB, 16 bits per element packed as (@c XXXXRRRRGGGGBBBB)
+ CM_SURF_FMT_BGR5_X1, ///< RGB, 16 bits per element packed as (@c XRRRRRGGGGGBBBBB)
+ CM_SURF_FMT_BGRX8, ///< RGB, 32 bits per element packed as (@c
+ ///< XXXXXXXXRRRRRRRRGGGGGGGGBBBBBBBB) - XXX unused by current driver
+ CM_SURF_FMT_BGR10_X2, ///< RGB, 32 bits per element packed as (@c
+ ///< XXRRRRRRRRRRGGGGGGGGGGBBBBBBBBBB)
+ CM_SURF_FMT_BGRX16, ///< RGB, 64 bits per element packed as (@c
+ ///< XXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBB)
+ CM_SURF_FMT_BGRX16F, ///< RGB, 64 bits per element packed as (@c
+ ///< XXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBB)
+ CM_SURF_FMT_BGRX32F, ///< RGB, 128 bits per element packed as (@c
+ ///< XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB)
+ CM_SURF_FMT_RGBX4, ///< RGB, 16 bits per element packed as (@c XXXXBBBBGGGGRRRR)
+ CM_SURF_FMT_RGB5_X1, ///< RGB, 16 bits per element packed as (@c XBBBBBGGGGGRRRRR)
+ CM_SURF_FMT_RGBX8, ///< RGB, 32 bits per element packed as (@c
+ ///< XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR)
+ CM_SURF_FMT_RGB10_X2, ///< RGB, 32 bits per element packed as (@c
+ ///< XXBBBBBBBBBBGGGGGGGGGGRRRRRRRRRR)
+ CM_SURF_FMT_RGBX16, ///< RGB, 64 bits per element packed as (@c
+ ///< XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_RGBX16F, ///< RGB, 64 bits per element packed as (@c
+ ///< XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_RGBX32F, ///< RGB, 128 bits per element packed as (@c
+ ///< XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_BGRA4, ///< RGBA, 16 bits per element packed as (@c AAAARRRRGGGGBBBB)
+ CM_SURF_FMT_BGR5_A1, ///< RGBA, 16 bits per element packed as (@c ARRRRRGGGGGBBBBB)
+ CM_SURF_FMT_BGRA8, ///< RGBA, 32 bits per element packed as (@c
+ ///< AAAAAAAARRRRRRRRGGGGGGGGBBBBBBBB)
+ CM_SURF_FMT_BGR10_A2, ///< RGBA, 32 bits per element packed as (@c
+ ///< AARRRRRRRRRRGGGGGGGGGGBBBBBBBBBB)
+ CM_SURF_FMT_BGRA16, ///< RGBA, 64 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAARRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBB)
+ CM_SURF_FMT_BGRA16F, ///< RGBA, 64 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAARRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBB)
+ CM_SURF_FMT_BGRA32F, ///< RGBA, 128 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAARRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB)
+ CM_SURF_FMT_RGBA4, ///< RGBA, 16 bits per element packed as (@c AAAABBBBGGGGRRRR)
+ CM_SURF_FMT_RGB5_A1, ///< RGBA, 16 bits per element packed as (@c ABBBBBGGGGGRRRRR)
+ CM_SURF_FMT_RGBA8, ///< RGBA, 32 bits per element packed as (@c
+ ///< AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR)
+ CM_SURF_FMT_RGB10_A2, ///< RGBA, 32 bits per element packed as (@c
+ ///< AABBBBBBBBBBGGGGGGGGGGRRRRRRRRRR)
+ CM_SURF_FMT_RGBA16, ///< RGBA, 64 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_RGBA16F, ///< RGBA, 64 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_RGBA32I, ///< RGBA, 128 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_RGBA32F, ///< RGBA, 128 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_DUDV8, ///< DUDV 16 bits per element packed as (@c VVVVVVVVUUUUUUUU)
+ CM_SURF_FMT_DXT1, ///< compressed, DXT1
+ CM_SURF_FMT_DXT2_3, ///< compressed, DXT2_3
+ CM_SURF_FMT_DXT4_5, ///< compressed, DXT4_5
+ CM_SURF_FMT_ATI1N, ///< compressed, 1 component
+ CM_SURF_FMT_ATI2N, ///< compressed, 2 component
+ CM_SURF_FMT_DEPTH16, ///< depth, 16 bits per element packed as (@c DDDDDDDDDDDDDDDD)
+ CM_SURF_FMT_DEPTH16F, ///< depth, 16 bits per element packed as (@c DDDDDDDDDDDDDDDD)
+ CM_SURF_FMT_DEPTH24_X8, ///< depth, 32 bits per element packed as (@c
+ ///< XXXXXXXXDDDDDDDDDDDDDDDDDDDDDDDD)
+ CM_SURF_FMT_DEPTH24F_X8, ///< depth, 32 bits per element packed as (@c
+ ///< XXXXXXXXDDDDDDDDDDDDDDDDDDDDDDDD)
+ CM_SURF_FMT_DEPTH24_STEN8, ///< depth + stencil, 32 bits per element packed as (@c
+ ///< SSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDD)
+ CM_SURF_FMT_DEPTH24F_STEN8, ///< depth + stencil, 32 bits per element packed as (@c
+ ///< SSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDD)
+ CM_SURF_FMT_DEPTH32F_X24_STEN8, ///< depth + stencil, 64 bits per element packed as (@c
+ ///< XXXXXXXXXXXXXXXXXXXXXXXXSSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD)
+ CM_SURF_FMT_DEPTH32F, ///< depth, 32 bits per element packed as (@c
+ ///< DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD)
+ CM_SURF_FMT_sR11_sG11_sB10, ///< RGB, 32 bits per element packed as (@c
+ ///< RRRRRRRRRRRGGGGGGGGGGGBBBBBBBBBB)
+ CM_SURF_FMT_sU16, ///<
+ CM_SURF_FMT_sUV16, ///<
+ CM_SURF_FMT_sUVWQ16, ///<
+ CM_SURF_FMT_RG16, ///< RG, 32 bits per element packed as (@c RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG)
+ CM_SURF_FMT_RG16F, ///< RG, 32 bits per element packed as (@c
+ ///< RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG)
+ CM_SURF_FMT_RG32F, ///< RG, 64 bits per element packed as (@c
+ ///< RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG)
+ CM_SURF_FMT_ABGR4, ///< RGBA, 16 bits per element packed as (@c RRRRGGGGBBBBAAAA)
+ CM_SURF_FMT_A1_BGR5, ///< RGBA, 16 bits per element packed as (@c RRRRRGGGGGBBBBBA)
+ CM_SURF_FMT_ABGR8, ///< RGBA, 32 bits per element packed as (@c
+ ///< RRRRRRRRGGGGGGGGBBBBBBBBAAAAAAAA)
+ CM_SURF_FMT_A2_BGR10, ///< RGBA, 32 bits per element packed as (@c
+ ///< RRRRRRRRRRGGGGGGGGGGBBBBBBBBBBAA)
+ CM_SURF_FMT_ABGR16, ///< RGBA, 64 bits per element packed as (@c
+ ///< RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBAAAAAAAAAAAAAAAA)
+ CM_SURF_FMT_ABGR16F, ///< RGBA, 64 bits per element packed as (@c
+ ///< RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBAAAAAAAAAAAAAAAA)
+ CM_SURF_FMT_ABGR32F, ///< RGBA, 128 bits per element packed as (@c
+ ///< RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA)
+ CM_SURF_FMT_DXT1A,
+ CM_SURF_FMT_sRGB10_A2, ///< RGBA, 32 bits per element packed as signed (@c
+ ///< AABBBBBBBBBBGGGGGGGGGGRRRRRRRRRR)
+ CM_SURF_FMT_sR8, ///< R, 8 bits per element packed as signed (@c RRRRRRRR)
+ CM_SURF_FMT_sRG8, ///< RG, 16 bits per element packed as signed (@c RRRRRRRRGGGGGGGG)
+ CM_SURF_FMT_sR32I, ///< R, 32 bits per element packed as signed (@c
+ ///< RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_sRG32I, ///< RG, 64 bits per element packed as signed (@c
+ ///< RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG)
+ CM_SURF_FMT_sRGBA32I, ///< RGBA, 128 bits per element packed as signed (@c
+ ///< RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA)
+ CM_SURF_FMT_R32I, ///< R, 32 bits per element packed as (@c
+ ///< RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_RG32I, ///< RG, 64 bits per element packed as (@c
+ ///< RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG)
+ CM_SURF_FMT_RG8, ///< RG8, 16 bits per element packed as (@c RRRRRRRRGGGGGGGG)
+ CM_SURF_FMT_sRGBA8, ///< RGBA8, 32 bits per element packed as signed (@c
+ ///< RRRRRRRRGGGGGGGGBBBBBBBBAAAAAAAA)
+ CM_SURF_FMT_R11F_G11F_B10F, ///< RGB, 32 bits per element packed as (@c
+ ///< BBBBBBBBBBGGGGGGGGGGGRRRRRRRRRRR)
+ CM_SURF_FMT_RGB9_E5, ///< RGB, 32 bits per element packed as (@c
+ ///< EEEEEBBBBBBBBBGGGGGGGGGRRRRRRRRR)
+ CM_SURF_FMT_LUMINANCE_LATC1, ///< compressed LATC1
+ CM_SURF_FMT_SIGNED_LUMINANCE_LATC1, ///< compressed signed LATC1
+ CM_SURF_FMT_LUMINANCE_ALPHA_LATC2, ///< compressed LATC2
+ CM_SURF_FMT_SIGNED_LUMINANCE_ALPHA_LATC2, ///< compressed signed LATC2
+ CM_SURF_FMT_RED_RGTC1, ///< compressed RGTC1
+ CM_SURF_FMT_SIGNED_RED_RGTC1, ///< compressed signed RGTC1
+ CM_SURF_FMT_RED_GREEN_RGTC2, ///< compressed RGTC2
+ CM_SURF_FMT_SIGNED_RED_GREEN_RGTC2, ///< compressed signed RGTC2
+ CM_SURF_FMT_R8, ///< R, 8 bits per element packed (@c RRRRRRRR)
+ CM_SURF_FMT_R16, ///< R, 16 bits per element packed (@c RRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_R16F, ///< R, 16 bits per element packed (@c RRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_R32F, ///< R, 32 bits per element packed (@c RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_R8I, ///< R, 8 bits per element packed (@c RRRRRRRR)
+ CM_SURF_FMT_sR8I, ///< R, 8 bits per element packed as signed (@c RRRRRRRR)
+ CM_SURF_FMT_RG8I, ///< RG, 16 bits per element packed (@c RRRRRRRRGGGGGGGG)
+ CM_SURF_FMT_sRG8I, ///< RG, 16 bits per element packed as signed (@c RRRRRRRRGGGGGGGG)
+ CM_SURF_FMT_R16I, ///< R, 16 bits per element packed (@c RRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_sR16I, ///< R, 16 bits per element packed as signed (@c RRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_RG16I, ///< RG, 32 bits per element packed (@c RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG)
+ CM_SURF_FMT_sRG16I, ///< RG, 32 bits per element packed as signed (@c
+ ///< RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG)
+ CM_SURF_FMT_RGBA32UI, ///< RGBA, 128 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAARRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB)
+ CM_SURF_FMT_RGBX32UI, ///< RGBX, 128 bits per element packed as (@c
+ ///< XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB)
+ CM_SURF_FMT_ALPHA32UI, ///< Alpha, 32 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA)
+ CM_SURF_FMT_INTENSITY32UI, ///< Intensity, 32 bits per element packed as (@c
+ ///< IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII)
+ CM_SURF_FMT_LUMINANCE32UI, ///< Luminance, 32 bits per element packed as (@c
+ ///< LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL)
+ CM_SURF_FMT_LUMINANCE_ALPHA32UI, ///< Luminance Alpha, 64 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL)
+ CM_SURF_FMT_RGBA16UI, ///< RGBA, 64 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_RGBX16UI, ///< RGB, 64 bits per element packed as (@c
+ ///< XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_ALPHA16UI, ///< Alpha, 16 bits per element packed as (@c AAAAAAAAAAAAAAAA)
+ CM_SURF_FMT_INTENSITY16UI, ///< Intensity, 16 bits per element packed as (@c IIIIIIIIIIIIIIII)
+ CM_SURF_FMT_LUMINANCE16UI, ///< Luminance, 16 bits per element packed as (@c LLLLLLLLLLLLLLLL)
+ CM_SURF_FMT_LUMINANCE_ALPHA16UI, ///< Luminance Alpha, 32 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL)
+ CM_SURF_FMT_RGBA8UI, ///< RGBA, 32 bits per element packed as (@c
+ ///< AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR)
+ CM_SURF_FMT_RGBX8UI, ///< RGB, 32 bits per element packed as (@c
+ ///< XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR)
+ CM_SURF_FMT_ALPHA8UI, ///< Alpha, 8 bits per element packed as (@c AAAAAAAA)
+ CM_SURF_FMT_INTENSITY8UI, ///< Intensity, 8 bits per element packed as (@c IIIIIIII)
+ CM_SURF_FMT_LUMINANCE8UI, ///< Luminance, 8 bits per element packed as (@c LLLLLLLL)
+ CM_SURF_FMT_LUMINANCE_ALPHA8UI, ///< Luminance Alpha, 16 bits per element packed as (@c
+ ///< AAAAAAAALLLLLLLL)
+ CM_SURF_FMT_sRGBX32I, ///< RGBX, 128 bits per element packed as (@c
+ ///< XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB)
+ CM_SURF_FMT_sALPHA32I, ///< Alpha, 32 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA)
+ CM_SURF_FMT_sINTENSITY32I, ///< Intensity, 32 bits per element packed as (@c
+ ///< IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII)
+ CM_SURF_FMT_sLUMINANCE32I, ///< Luminance, 32 bits per element packed as (@c
+ ///< LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL)
+ CM_SURF_FMT_sLUMINANCE_ALPHA32I, ///< Luminance Alpha, 64 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL)
+ CM_SURF_FMT_sRGBA16I, ///< RGBA, 64 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_sRGBX16I, ///< RGB, 64 bits per element packed as (@c
+ ///< XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_sALPHA16I, ///< Alpha, 16 bits per element packed as (@c AAAAAAAAAAAAAAAA)
+ CM_SURF_FMT_sINTENSITY16I, ///< Intensity, 16 bits per element packed as (@c IIIIIIIIIIIIIIII)
+ CM_SURF_FMT_sLUMINANCE16I, ///< Luminance, 16 bits per element packed as (@c LLLLLLLLLLLLLLLL)
+ CM_SURF_FMT_sLUMINANCE_ALPHA16I, ///< Luminance Alpha, 32 bits per element packed as (@c
+ ///< AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL)
+ CM_SURF_FMT_sRGBA8I, ///< RGBA, 32 bits per element packed as (@c
+ ///< AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR)
+ CM_SURF_FMT_sRGBX8I, ///< RGB, 32 bits per element packed as (@c
+ ///< XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR)
+ CM_SURF_FMT_sALPHA8I, ///< Alpha, 8 bits per element packed as (@c AAAAAAAA)
+ CM_SURF_FMT_sINTENSITY8I, ///< Intensity, 8 bits per element packed as (@c IIIIIIII)
+ CM_SURF_FMT_sLUMINANCE8I, ///< Luminance, 8 bits per element packed as (@c LLLLLLLL)
+ CM_SURF_FMT_sLUMINANCE_ALPHA8I, ///< Luminance Alpha, 16 bits per element packed as (@c AAAAAAAALLLLLLLL)
+ CM_SURF_FMT_sDXT6, ///< compressed, sDXT6
+ CM_SURF_FMT_DXT6, ///< compressed, DXT6
+ CM_SURF_FMT_DXT7, ///< compressed, DXT7
+ CM_SURF_FMT_LUMINANCE8_SNORM, ///< Luminance, 8 bits per element packed as signed (@c LLLLLLLL)
+ CM_SURF_FMT_LUMINANCE16_SNORM, ///< Luminance, 16 bits per element packed as signed (@c
+ ///< LLLLLLLLLLLLLLLL)
+ CM_SURF_FMT_INTENSITY8_SNORM, ///< Intensity, 8 bits per element packed as signed (@c IIIIIIII)
+ CM_SURF_FMT_INTENSITY16_SNORM, ///< Intensity, 16 bits per element packed as signed (@c
+ ///< IIIIIIIIIIIIIIII)
+ CM_SURF_FMT_ALPHA8_SNORM, ///< Alpha, 8 bits per element packed as signed (@c AAAAAAAA)
+ CM_SURF_FMT_ALPHA16_SNORM, ///< Alpha, 16 bits per element packed as signed (@c
+ ///< AAAAAAAAAAAAAAAA)
+ CM_SURF_FMT_LUMINANCE_ALPHA8_SNORM, ///< Luminance Alpha, 16 bits per element packed as signed
+ ///< (@c AAAAAAAALLLLLLLL)
+ CM_SURF_FMT_LUMINANCE_ALPHA16_SNORM, ///< Luminance Alpha, 32 bits per element packed as signed
+ ///< (@c AAAAAAAAAAAAAAAALLLLLLLLLLLLLLLL)
+ CM_SURF_FMT_R8_SNORM, ///< R, 8 bits per element packed as signed (@c RRRRRRRR)
+ CM_SURF_FMT_R16_SNORM, ///< R, 16 bits per element packed as signed (@c RRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_RG8_SNORM, ///< RG8, 16 bits per element packed as signed (@c RRRRRRRRGGGGGGGG)
+ CM_SURF_FMT_RG16_SNORM, ///< RG, 32 bits per element packed as signed (@c
+ ///< RRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGG)
+ CM_SURF_FMT_RGBX8_SNORM, ///< RGB, 32 bits per element packed as signed (@c
+ ///< XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR)
+ CM_SURF_FMT_RGBX16_SNORM, ///< RGB, 64 bits per element packed as signed (@c
+ ///< XXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_RGBA8_SNORM, ///< RGBA, 32 bits per element packed as signed (@c
+ ///< AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR)
+ CM_SURF_FMT_RGBA16_SNORM, ///< RGBA, 64 bits per element packed as signed (@c
+ ///< AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_RGB10_A2UI, ///< RGBA, 32 bits per element packed as (@c
+ ///< AABBBBBBBBBBGGGGGGGGGGRRRRRRRRRR)
+ CM_SURF_FMT_RGB32F, ///< RGB, float, 96 bits per element packed as (@c
+ ///< BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_RGB32I, ///< RGB, unnormalized int, 96 bits per element packed as (@c
+ ///< BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_RGB32UI, ///< RGB, unnormalized uint, 96 bits per element packed as (@c
+ ///< BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR)
+ CM_SURF_FMT_RGBX8_SRGB, ///< RGB, 32 bits per element packed as (@c
+ ///< XXXXXXXXBBBBBBBBGGGGGGGGRRRRRRRR)
+ CM_SURF_FMT_RGBA8_SRGB, ///< RGBA, 32 bits per element packed as (@c
+ ///< AAAAAAAABBBBBBBBGGGGGGGGRRRRRRRR)
+ CM_SURF_FMT_DXT1_SRGB, ///< compressed, DXT1
+ CM_SURF_FMT_DXT1A_SRGB, ///< compressed, DXT1A
+ CM_SURF_FMT_DXT2_3_SRGB, ///< compressed, DXT2_3
+ CM_SURF_FMT_DXT4_5_SRGB, ///< compressed, DXT4_5
+ CM_SURF_FMT_DXT7_SRGB, ///< compressed, DXT7
+ CM_SURF_FMT_RGB8_ETC2, ///< ETC2 compressed, RGB8 in 64 bits
+ CM_SURF_FMT_SRGB8_ETC2, ///< ETC2 compressed, SRGB8 in 64 bits
+ CM_SURF_FMT_RGB8_PT_ALPHA1_ETC2, ///< ETC2 compressed, RGB8A1 in 64 bits
+ CM_SURF_FMT_SRGB8_PT_ALPHA1_ETC2, ///< ETC2 compressed, sRGB8A1 in 64 bits
+ CM_SURF_FMT_RGBA8_ETC2_EAC, ///< ETC2 compressed, RGBA8 in 128 bits
+ CM_SURF_FMT_SRGB8_ALPHA8_ETC2_EAC, ///< ETC2 compressed, sRGBA8 in 128 bits
+ CM_SURF_FMT_R11_EAC, ///< EAC compressed, R11 in 64 bits
+ CM_SURF_FMT_SIGNED_R11_EAC, ///< EAC compressed, signed R11 in 64 bits
+ CM_SURF_FMT_RG11_EAC, ///< EAC compressed, RG11 in 128 bits
+ CM_SURF_FMT_SIGNED_RG11_EAC, ///< EAC compressed, signed RG11 in 128 bits
- CM_SURF_FMT_RGBA8_ASTC_4x4, ///< ASTC compressed RGBA8 in 128 bits block
- CM_SURF_FMT_RGBA8_ASTC_5x4, ///< ASTC compressed RGBA8 in 128 bits block
- CM_SURF_FMT_RGBA8_ASTC_5x5, ///< ASTC compressed RGBA8 in 128 bits block
- CM_SURF_FMT_RGBA8_ASTC_6x5, ///< ASTC compressed RGBA8 in 128 bits block
- CM_SURF_FMT_RGBA8_ASTC_6x6, ///< ASTC compressed RGBA8 in 128 bits block
- CM_SURF_FMT_RGBA8_ASTC_8x5, ///< ASTC compressed RGBA8 in 128 bits block
- CM_SURF_FMT_RGBA8_ASTC_8x6, ///< ASTC compressed RGBA8 in 128 bits block
- CM_SURF_FMT_RGBA8_ASTC_8x8, ///< ASTC compressed RGBA8 in 128 bits block
- CM_SURF_FMT_RGBA8_ASTC_10x5, ///< ASTC compressed RGBA8 in 128 bits block
- CM_SURF_FMT_RGBA8_ASTC_10x6, ///< ASTC compressed RGBA8 in 128 bits block
- CM_SURF_FMT_RGBA8_ASTC_10x8, ///< ASTC compressed RGBA8 in 128 bits block
- CM_SURF_FMT_RGBA8_ASTC_10x10, ///< ASTC compressed RGBA8 in 128 bits block
- CM_SURF_FMT_RGBA8_ASTC_12x10, ///< ASTC compressed RGBA8 in 128 bits block
- CM_SURF_FMT_RGBA8_ASTC_12x12, ///< ASTC compressed RGBA8 in 128 bits block
+ CM_SURF_FMT_RGBA8_ASTC_4x4, ///< ASTC compressed RGBA8 in 128 bits block
+ CM_SURF_FMT_RGBA8_ASTC_5x4, ///< ASTC compressed RGBA8 in 128 bits block
+ CM_SURF_FMT_RGBA8_ASTC_5x5, ///< ASTC compressed RGBA8 in 128 bits block
+ CM_SURF_FMT_RGBA8_ASTC_6x5, ///< ASTC compressed RGBA8 in 128 bits block
+ CM_SURF_FMT_RGBA8_ASTC_6x6, ///< ASTC compressed RGBA8 in 128 bits block
+ CM_SURF_FMT_RGBA8_ASTC_8x5, ///< ASTC compressed RGBA8 in 128 bits block
+ CM_SURF_FMT_RGBA8_ASTC_8x6, ///< ASTC compressed RGBA8 in 128 bits block
+ CM_SURF_FMT_RGBA8_ASTC_8x8, ///< ASTC compressed RGBA8 in 128 bits block
+ CM_SURF_FMT_RGBA8_ASTC_10x5, ///< ASTC compressed RGBA8 in 128 bits block
+ CM_SURF_FMT_RGBA8_ASTC_10x6, ///< ASTC compressed RGBA8 in 128 bits block
+ CM_SURF_FMT_RGBA8_ASTC_10x8, ///< ASTC compressed RGBA8 in 128 bits block
+ CM_SURF_FMT_RGBA8_ASTC_10x10, ///< ASTC compressed RGBA8 in 128 bits block
+ CM_SURF_FMT_RGBA8_ASTC_12x10, ///< ASTC compressed RGBA8 in 128 bits block
+ CM_SURF_FMT_RGBA8_ASTC_12x12, ///< ASTC compressed RGBA8 in 128 bits block
- CM_SURF_FMT_SRGBA8_ASTC_4x4, ///< ASTC compressed SRGBA8 in 128 bits block
- CM_SURF_FMT_SRGBA8_ASTC_5x4, ///< ASTC compressed SRGBA8 in 128 bits block
- CM_SURF_FMT_SRGBA8_ASTC_5x5, ///< ASTC compressed SRGBA8 in 128 bits block
- CM_SURF_FMT_SRGBA8_ASTC_6x5, ///< ASTC compressed SRGBA8 in 128 bits block
- CM_SURF_FMT_SRGBA8_ASTC_6x6, ///< ASTC compressed SRGBA8 in 128 bits block
- CM_SURF_FMT_SRGBA8_ASTC_8x5, ///< ASTC compressed SRGBA8 in 128 bits block
- CM_SURF_FMT_SRGBA8_ASTC_8x6, ///< ASTC compressed SRGBA8 in 128 bits block
- CM_SURF_FMT_SRGBA8_ASTC_8x8, ///< ASTC compressed SRGBA8 in 128 bits block
- CM_SURF_FMT_SRGBA8_ASTC_10x5, ///< ASTC compressed SRGBA8 in 128 bits block
- CM_SURF_FMT_SRGBA8_ASTC_10x6, ///< ASTC compressed SRGBA8 in 128 bits block
- CM_SURF_FMT_SRGBA8_ASTC_10x8, ///< ASTC compressed SRGBA8 in 128 bits block
- CM_SURF_FMT_SRGBA8_ASTC_10x10, ///< ASTC compressed SRGBA8 in 128 bits block
- CM_SURF_FMT_SRGBA8_ASTC_12x10, ///< ASTC compressed SRGBA8 in 128 bits block
- CM_SURF_FMT_SRGBA8_ASTC_12x12, ///< ASTC compressed SRGBA8 in 128 bits block
+ CM_SURF_FMT_SRGBA8_ASTC_4x4, ///< ASTC compressed SRGBA8 in 128 bits block
+ CM_SURF_FMT_SRGBA8_ASTC_5x4, ///< ASTC compressed SRGBA8 in 128 bits block
+ CM_SURF_FMT_SRGBA8_ASTC_5x5, ///< ASTC compressed SRGBA8 in 128 bits block
+ CM_SURF_FMT_SRGBA8_ASTC_6x5, ///< ASTC compressed SRGBA8 in 128 bits block
+ CM_SURF_FMT_SRGBA8_ASTC_6x6, ///< ASTC compressed SRGBA8 in 128 bits block
+ CM_SURF_FMT_SRGBA8_ASTC_8x5, ///< ASTC compressed SRGBA8 in 128 bits block
+ CM_SURF_FMT_SRGBA8_ASTC_8x6, ///< ASTC compressed SRGBA8 in 128 bits block
+ CM_SURF_FMT_SRGBA8_ASTC_8x8, ///< ASTC compressed SRGBA8 in 128 bits block
+ CM_SURF_FMT_SRGBA8_ASTC_10x5, ///< ASTC compressed SRGBA8 in 128 bits block
+ CM_SURF_FMT_SRGBA8_ASTC_10x6, ///< ASTC compressed SRGBA8 in 128 bits block
+ CM_SURF_FMT_SRGBA8_ASTC_10x8, ///< ASTC compressed SRGBA8 in 128 bits block
+ CM_SURF_FMT_SRGBA8_ASTC_10x10, ///< ASTC compressed SRGBA8 in 128 bits block
+ CM_SURF_FMT_SRGBA8_ASTC_12x10, ///< ASTC compressed SRGBA8 in 128 bits block
+ CM_SURF_FMT_SRGBA8_ASTC_12x12, ///< ASTC compressed SRGBA8 in 128 bits block
- CM_SURF_FMT_BGR10_A2UI, ///< RGBA, 32 bits per element packed as (@c AARRRRRRRRRRGGGGGGGGGGBBBBBBBBBB)
- CM_SURF_FMT_A2_BGR10UI, ///< RGBA, 32 bits per element packed as (@c RRRRRRRRRRGGGGGGGGGGBBBBBBBBBBAA)
- CM_SURF_FMT_A2_RGB10UI, ///< RGBA, 32 bits per element packed as (@c BBBBBBBBBBGGGGGGGGGGRRRRRRRRRRAA)
- CM_SURF_FMT_B5_G6_R5UI, ///< RGB, 16 bits per element packed as (@c BBBBBGGGGGGRRRRR)
- CM_SURF_FMT_R5_G6_B5UI, ///< RGB, 16 bits per element packed as (@c RRRRRGGGGGGBBBBB)
+ CM_SURF_FMT_BGR10_A2UI, ///< RGBA, 32 bits per element packed as (@c
+ ///< AARRRRRRRRRRGGGGGGGGGGBBBBBBBBBB)
+ CM_SURF_FMT_A2_BGR10UI, ///< RGBA, 32 bits per element packed as (@c
+ ///< RRRRRRRRRRGGGGGGGGGGBBBBBBBBBBAA)
+ CM_SURF_FMT_A2_RGB10UI, ///< RGBA, 32 bits per element packed as (@c
+ ///< BBBBBBBBBBGGGGGGGGGGRRRRRRRRRRAA)
+ CM_SURF_FMT_B5_G6_R5UI, ///< RGB, 16 bits per element packed as (@c BBBBBGGGGGGRRRRR)
+ CM_SURF_FMT_R5_G6_B5UI, ///< RGB, 16 bits per element packed as (@c RRRRRGGGGGGBBBBB)
- CM_SURF_FMT_DEPTH32F_X24_STEN8_UNCLAMPED, ///< depth + stencil, 64 bits per element packed as (@c XXXXXXXXXXXXXXXXXXXXXXXXSSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD)
- CM_SURF_FMT_DEPTH32F_UNCLAMPED, ///< depth, 32 bits per element packed as (@c DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD)
+ CM_SURF_FMT_DEPTH32F_X24_STEN8_UNCLAMPED, ///< depth + stencil, 64 bits per element packed as (@c
+ ///< XXXXXXXXXXXXXXXXXXXXXXXXSSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD)
+ CM_SURF_FMT_DEPTH32F_UNCLAMPED, ///< depth, 32 bits per element packed as (@c
+ ///< DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD)
- CM_SURF_FMT_L8_X16_A8_SRGB, ///< Sluminance Alpha, 32 bits per element packed as (@c AAAAAAAAXXXXXXXXXXXXXXXXLLLLLLLL)
- CM_SURF_FMT_L8_X24_SRGB, ///< Sluminance, 32 bits per element packed as (@c XXXXXXXXXXXXXXXXXXXXXXXXLLLLLLLL)
+ CM_SURF_FMT_L8_X16_A8_SRGB, ///< Sluminance Alpha, 32 bits per element packed as (@c
+ ///< AAAAAAAAXXXXXXXXXXXXXXXXLLLLLLLL)
+ CM_SURF_FMT_L8_X24_SRGB, ///< Sluminance, 32 bits per element packed as (@c
+ ///< XXXXXXXXXXXXXXXXXXXXXXXXLLLLLLLL)
- CM_SURF_FMT_STENCIL8, ///< stencil, 32 bits per element packed as (@c SSSSSSSSXXXXXXXXXXXXXXXXXXXXXXXX)
+ CM_SURF_FMT_STENCIL8, ///< stencil, 32 bits per element packed as (@c
+ ///< SSSSSSSSXXXXXXXXXXXXXXXXXXXXXXXX)
- // non-native surface formats after this line, will be ignored by HWL
- // all non-native surface formats should use the _NN suffix to distinguish
- // them from potential corresponding native formats added in the future
- CM_SURF_FMT_I420_NN, ///< 4:2:0 Planar Y-U-V format
- CM_SURF_FMT_YV12_NN, ///< 4:2:0 Planar Y-V-U format
- CM_SURF_FMT_NV12_NN, ///< 4:2:0 Semi-planar Y-UV format
- CM_SURF_FMT_NV21_NN, ///< 4:2:0 Semi-planar Y-VU format
- cmSurfFmt_FIRST = CM_SURF_FMT_LUMINANCE8, ///< First surface format
- cmSurfFmt_LAST = CM_SURF_FMT_STENCIL8, ///< Last native surface format
- cmSurfFmt_LAST_NON_NATIVE = CM_SURF_FMT_NV21_NN,///< Last non-native surface format
+ // non-native surface formats after this line, will be ignored by HWL
+ // all non-native surface formats should use the _NN suffix to distinguish
+ // them from potential corresponding native formats added in the future
+ CM_SURF_FMT_I420_NN, ///< 4:2:0 Planar Y-U-V format
+ CM_SURF_FMT_YV12_NN, ///< 4:2:0 Planar Y-V-U format
+ CM_SURF_FMT_NV12_NN, ///< 4:2:0 Semi-planar Y-UV format
+ CM_SURF_FMT_NV21_NN, ///< 4:2:0 Semi-planar Y-VU format
+ cmSurfFmt_FIRST = CM_SURF_FMT_LUMINANCE8, ///< First surface format
+ cmSurfFmt_LAST = CM_SURF_FMT_STENCIL8, ///< Last native surface format
+ cmSurfFmt_LAST_NON_NATIVE = CM_SURF_FMT_NV21_NN, ///< Last non-native surface format
} cmSurfFmt;
typedef struct cmFormatXlateRec {
- cmSurfFmt raw_cmFormat;
- cl_channel_type image_channel_data_type;
- cl_channel_order image_channel_order;
+ cmSurfFmt raw_cmFormat; ///< cm surface format described by this entry
+ cl_channel_type image_channel_data_type; ///< OpenCL channel data type; 500 marks a format not supported by OCL
+ cl_channel_order image_channel_order; ///< OpenCL channel order paired with the data type
} cmFormatXlateParams;
// relates full range of cm surface formats to those supported by CAL
-static const cmFormatXlateParams cmFormatXlateTable[] = {
- { CM_SURF_FMT_LUMINANCE8, CL_UNORM_INT8, CL_LUMINANCE },
- { CM_SURF_FMT_LUMINANCE16, CL_UNORM_INT16, CL_LUMINANCE },
- { CM_SURF_FMT_LUMINANCE16F, CL_HALF_FLOAT, CL_LUMINANCE },
- { CM_SURF_FMT_LUMINANCE32F, CL_FLOAT, CL_LUMINANCE },
- { CM_SURF_FMT_INTENSITY8, CL_UNORM_INT8, CL_INTENSITY },
- { CM_SURF_FMT_INTENSITY16, CL_UNORM_INT16, CL_INTENSITY },
- { CM_SURF_FMT_INTENSITY16F, CL_HALF_FLOAT, CL_INTENSITY },
- { CM_SURF_FMT_INTENSITY32F, CL_FLOAT, CL_INTENSITY },
- { CM_SURF_FMT_ALPHA8, CL_UNSIGNED_INT8, CL_A },
- { CM_SURF_FMT_ALPHA16, CL_UNORM_INT16, CL_A },
- { CM_SURF_FMT_ALPHA16F, CL_HALF_FLOAT, CL_A },
- { CM_SURF_FMT_ALPHA32F, CL_FLOAT, CL_A },
- { CM_SURF_FMT_LUMINANCE8_ALPHA8, CL_UNSIGNED_INT8, CL_RG },
- { CM_SURF_FMT_LUMINANCE16_ALPHA16, CL_UNSIGNED_INT16, CL_RG },
- { CM_SURF_FMT_LUMINANCE16F_ALPHA16F, CL_HALF_FLOAT, CL_RG },
- { CM_SURF_FMT_LUMINANCE32F_ALPHA32F, CL_FLOAT, CL_RG },
- { CM_SURF_FMT_B2_G3_R3, 500, CL_R },
- { CM_SURF_FMT_B5_G6_R5, CL_UNSIGNED_INT16, CL_RGB },
- { CM_SURF_FMT_BGRX4, 500, CL_BGRA },
- { CM_SURF_FMT_BGR5_X1, CL_UNSIGNED_INT16, CL_RGB },
- { CM_SURF_FMT_BGRX8, CL_UNORM_INT8, CL_BGRA },
- { CM_SURF_FMT_BGR10_X2, CL_UNORM_INT_101010, CL_RGB },
- { CM_SURF_FMT_BGRX16, CL_UNORM_INT16, CL_BGRA },
- { CM_SURF_FMT_BGRX16F, CL_HALF_FLOAT, CL_BGRA },
- { CM_SURF_FMT_BGRX32F, CL_FLOAT, CL_BGRA },
- { CM_SURF_FMT_RGBX4, 500, CL_RGB },
- { CM_SURF_FMT_RGB5_X1, CL_UNORM_INT16, CL_BGRA },
- { CM_SURF_FMT_RGBX8, CL_UNORM_INT8, CL_RGBA },
- { CM_SURF_FMT_RGB10_X2, CL_UNORM_INT_101010, CL_RGBA },
- { CM_SURF_FMT_RGBX16, CL_UNORM_INT16, CL_RGBA },
- { CM_SURF_FMT_RGBX16F, CL_HALF_FLOAT, CL_RGBA },
- { CM_SURF_FMT_RGBX32F, CL_FLOAT, CL_RGBA },
- { CM_SURF_FMT_BGRA4, 500, CL_BGRA },
- { CM_SURF_FMT_BGR5_A1, CL_UNSIGNED_INT16, CL_BGRA },
- { CM_SURF_FMT_BGRA8, CL_UNORM_INT8, CL_BGRA },
- { CM_SURF_FMT_BGR10_A2, 500, CL_BGRA },
- { CM_SURF_FMT_BGRA16, CL_UNORM_INT16, CL_BGRA },
- { CM_SURF_FMT_BGRA16F, CL_UNORM_INT16, CL_BGRA },
- { CM_SURF_FMT_BGRA32F, CL_FLOAT, CL_BGRA },
- { CM_SURF_FMT_RGBA4, 500, CL_RGBA },
- { CM_SURF_FMT_RGB5_A1, CL_UNSIGNED_INT16, CL_RGBA },
- { CM_SURF_FMT_RGBA8, CL_UNORM_INT8, CL_RGBA },
- { CM_SURF_FMT_RGB10_A2, CL_UNORM_INT_101010, CL_RGB },
- { CM_SURF_FMT_RGBA16, CL_UNORM_INT16, CL_RGBA },
- { CM_SURF_FMT_RGBA16F, CL_HALF_FLOAT, CL_RGBA },
- { CM_SURF_FMT_RGBA32I, CL_UNSIGNED_INT32, CL_RGBA },
- { CM_SURF_FMT_RGBA32F, CL_FLOAT, CL_RGBA },
- { CM_SURF_FMT_DUDV8, CL_UNSIGNED_INT8, CL_RG },
- { CM_SURF_FMT_DXT1, 500, CL_R },
- { CM_SURF_FMT_DXT2_3, 500, CL_R },
- { CM_SURF_FMT_DXT4_5, 500, CL_R },
- { CM_SURF_FMT_ATI1N, 500, CL_R },
- { CM_SURF_FMT_ATI2N, 500, CL_R },
- { CM_SURF_FMT_DEPTH16, CL_UNORM_INT16, CL_DEPTH },
- { CM_SURF_FMT_DEPTH16F, CL_HALF_FLOAT, CL_DEPTH },
- { CM_SURF_FMT_DEPTH24_X8, 500, CL_DEPTH },
- { CM_SURF_FMT_DEPTH24F_X8, 500, CL_DEPTH },
- { CM_SURF_FMT_DEPTH24_STEN8, CL_UNORM_INT24, CL_DEPTH_STENCIL },
- { CM_SURF_FMT_DEPTH24F_STEN8, 500, CL_DEPTH_STENCIL },
- { CM_SURF_FMT_DEPTH32F_X24_STEN8, CL_FLOAT, CL_DEPTH_STENCIL },
- { CM_SURF_FMT_DEPTH32F, CL_FLOAT, CL_DEPTH },
- { CM_SURF_FMT_sR11_sG11_sB10, 500, CL_R },
- { CM_SURF_FMT_sU16, CL_SNORM_INT16, CL_R },
- { CM_SURF_FMT_sUV16, CL_SNORM_INT16, CL_RG },
- { CM_SURF_FMT_sUVWQ16, CL_SNORM_INT16, CL_RGBA },
- { CM_SURF_FMT_RG16, CL_UNORM_INT16, CL_RG },
- { CM_SURF_FMT_RG16F, CL_HALF_FLOAT, CL_RG },
- { CM_SURF_FMT_RG32F, CL_FLOAT, CL_RG },
- { CM_SURF_FMT_ABGR4, 500, CL_ARGB },
- { CM_SURF_FMT_A1_BGR5, CL_UNSIGNED_INT16, CL_ARGB },
- { CM_SURF_FMT_ABGR8, CL_UNORM_INT8, CL_ARGB },
- { CM_SURF_FMT_A2_BGR10, CL_UNORM_INT_101010, CL_RGB },
- { CM_SURF_FMT_ABGR16, CL_UNORM_INT16, CL_ARGB },
- { CM_SURF_FMT_ABGR16F, CL_HALF_FLOAT, CL_ARGB },
- { CM_SURF_FMT_ABGR32F, CL_FLOAT, CL_ARGB },
- { CM_SURF_FMT_DXT1A, 500, CL_R },
- { CM_SURF_FMT_sRGB10_A2, 500, CL_RGBA },
- { CM_SURF_FMT_sR8, CL_SNORM_INT8, CL_R },
- { CM_SURF_FMT_sRG8, CL_SNORM_INT8, CL_RG },
- { CM_SURF_FMT_sR32I, CL_SIGNED_INT32, CL_R },
- { CM_SURF_FMT_sRG32I, CL_SIGNED_INT32, CL_RG },
- { CM_SURF_FMT_sRGBA32I, CL_SIGNED_INT32, CL_RGBA },
- { CM_SURF_FMT_R32I, CL_UNSIGNED_INT32, CL_R },
- { CM_SURF_FMT_RG32I, CL_UNSIGNED_INT32, CL_RG },
- { CM_SURF_FMT_RG8, CL_UNORM_INT8, CL_RG },
- { CM_SURF_FMT_sRGBA8, CL_SNORM_INT8, CL_RGBA },
- { CM_SURF_FMT_R11F_G11F_B10F, 500, CL_RGBA },
- { CM_SURF_FMT_RGB9_E5, CL_UNORM_INT8, CL_ARGB },
- { CM_SURF_FMT_LUMINANCE_LATC1, 500, CL_RGBA },
- { CM_SURF_FMT_SIGNED_LUMINANCE_LATC1,500, CL_RGBA },
- { CM_SURF_FMT_LUMINANCE_ALPHA_LATC2, 500, CL_RGBA },
- { CM_SURF_FMT_SIGNED_LUMINANCE_ALPHA_LATC2, 500, CL_RGBA },
- { CM_SURF_FMT_RED_RGTC1, 500, CL_RGBA },
- { CM_SURF_FMT_SIGNED_RED_RGTC1, 500, CL_RGBA },
- { CM_SURF_FMT_RED_GREEN_RGTC2, 500, CL_RGBA },
- { CM_SURF_FMT_SIGNED_RED_GREEN_RGTC2,500, CL_RGBA },
- { CM_SURF_FMT_R8, CL_UNORM_INT8, CL_R },
- { CM_SURF_FMT_R16, CL_UNORM_INT16, CL_R },
- { CM_SURF_FMT_R16F, CL_HALF_FLOAT, CL_R },
- { CM_SURF_FMT_R32F, CL_FLOAT, CL_R },
- { CM_SURF_FMT_R8I, CL_UNSIGNED_INT8, CL_R },
- { CM_SURF_FMT_sR8I, CL_SIGNED_INT8, CL_R },
- { CM_SURF_FMT_RG8I, CL_UNSIGNED_INT8, CL_RG },
- { CM_SURF_FMT_sRG8I, CL_SIGNED_INT8, CL_RG },
- { CM_SURF_FMT_R16I, CL_UNSIGNED_INT16, CL_R },
- { CM_SURF_FMT_sR16I, CL_SIGNED_INT16, CL_R },
- { CM_SURF_FMT_RG16I, CL_UNSIGNED_INT16, CL_RG },
- { CM_SURF_FMT_sRG16I, CL_SIGNED_INT16, CL_RG },
- { CM_SURF_FMT_RGBA32UI, CL_UNSIGNED_INT32, CL_RGBA },
- { CM_SURF_FMT_RGBX32UI, CL_UNSIGNED_INT32, CL_RGBA },
- { CM_SURF_FMT_ALPHA32UI, CL_UNSIGNED_INT32, CL_R },
- { CM_SURF_FMT_INTENSITY32UI, CL_UNSIGNED_INT32, CL_R },
- { CM_SURF_FMT_LUMINANCE32UI, CL_UNSIGNED_INT32, CL_R },
- { CM_SURF_FMT_LUMINANCE_ALPHA32UI, CL_UNSIGNED_INT32, CL_RG },
- { CM_SURF_FMT_RGBA16UI, CL_UNSIGNED_INT16, CL_RGBA },
- { CM_SURF_FMT_RGBX16UI, CL_UNSIGNED_INT16, CL_RGBA },
- { CM_SURF_FMT_ALPHA16UI, CL_UNSIGNED_INT16, CL_R },
- { CM_SURF_FMT_INTENSITY16UI, CL_UNSIGNED_INT16, CL_R },
- { CM_SURF_FMT_LUMINANCE16UI, CL_UNSIGNED_INT16, CL_R },
- { CM_SURF_FMT_LUMINANCE_ALPHA16UI, CL_UNSIGNED_INT32, CL_RG },
- { CM_SURF_FMT_RGBA8UI, CL_UNSIGNED_INT8, CL_RGBA },
- { CM_SURF_FMT_RGBX8UI, CL_UNORM_INT8, CL_RGBA },
- { CM_SURF_FMT_ALPHA8UI, CL_UNSIGNED_INT8, CL_R },
- { CM_SURF_FMT_INTENSITY8UI, CL_UNSIGNED_INT8, CL_R },
- { CM_SURF_FMT_LUMINANCE8UI, CL_UNSIGNED_INT8, CL_R },
- { CM_SURF_FMT_LUMINANCE_ALPHA8UI, CL_UNSIGNED_INT8, CL_RG },
- { CM_SURF_FMT_sRGBX32I, CL_SIGNED_INT32, CL_RGBA },
- { CM_SURF_FMT_sALPHA32I, CL_SIGNED_INT32, CL_R },
- { CM_SURF_FMT_sINTENSITY32I, CL_SIGNED_INT32, CL_R },
- { CM_SURF_FMT_sLUMINANCE32I, CL_SIGNED_INT32, CL_R },
- { CM_SURF_FMT_sLUMINANCE_ALPHA32I, CL_SIGNED_INT32, CL_RG },
- { CM_SURF_FMT_sRGBA16I, CL_SIGNED_INT16, CL_RGBA },
- { CM_SURF_FMT_sRGBX16I, CL_SIGNED_INT16, CL_RGBA },
- { CM_SURF_FMT_sALPHA16I, CL_SIGNED_INT16, CL_R },
- { CM_SURF_FMT_sINTENSITY16I, CL_SIGNED_INT16, CL_R },
- { CM_SURF_FMT_sLUMINANCE16I, CL_SIGNED_INT16, CL_R },
- { CM_SURF_FMT_sLUMINANCE_ALPHA16I, CL_SIGNED_INT16, CL_RG },
- { CM_SURF_FMT_sRGBA8I, CL_SIGNED_INT8, CL_RGBA },
- { CM_SURF_FMT_sRGBX8I, CL_SIGNED_INT8, CL_RGBA },
- { CM_SURF_FMT_sALPHA8I, CL_SIGNED_INT8, CL_R },
- { CM_SURF_FMT_sINTENSITY8I, CL_SIGNED_INT8, CL_R },
- { CM_SURF_FMT_sLUMINANCE8I, CL_SIGNED_INT8, CL_R },
- { CM_SURF_FMT_sLUMINANCE_ALPHA8I, CM_SURF_FMT_sRG8I, CL_RG },
- { CM_SURF_FMT_sDXT6, 500, CL_R },
- { CM_SURF_FMT_DXT6, 500, CL_R },
- { CM_SURF_FMT_DXT7, 500, CL_R },
- { CM_SURF_FMT_LUMINANCE8_SNORM, CL_SNORM_INT8, CL_R },
- { CM_SURF_FMT_LUMINANCE16_SNORM, CL_SNORM_INT16, CL_R },
- { CM_SURF_FMT_INTENSITY8_SNORM, CL_SNORM_INT8, CL_R },
- { CM_SURF_FMT_INTENSITY16_SNORM, CL_SNORM_INT16, CL_R },
- { CM_SURF_FMT_ALPHA8_SNORM, CL_SNORM_INT8, CL_R },
- { CM_SURF_FMT_ALPHA16_SNORM, CL_SNORM_INT16, CL_R },
- { CM_SURF_FMT_LUMINANCE_ALPHA8_SNORM,CL_SNORM_INT8, CL_RG },
- { CM_SURF_FMT_LUMINANCE_ALPHA16_SNORM,CL_SNORM_INT16, CL_RG },
- { CM_SURF_FMT_R8_SNORM, CL_SNORM_INT8, CL_R },
- { CM_SURF_FMT_R16_SNORM, CL_SNORM_INT16, CL_R },
- { CM_SURF_FMT_RG8_SNORM, CL_SNORM_INT8, CL_RG },
- { CM_SURF_FMT_RG16_SNORM, CL_SNORM_INT16, CL_RG },
- { CM_SURF_FMT_RGBX8_SNORM, CL_SNORM_INT8, CL_RGBA },
- { CM_SURF_FMT_RGBX16_SNORM, CL_SNORM_INT16, CL_RGBA },
- { CM_SURF_FMT_RGBA8_SNORM, CL_SNORM_INT8, CL_RGBA },
- { CM_SURF_FMT_RGBA16_SNORM, CL_SNORM_INT16, CL_RGBA },
- { CM_SURF_FMT_RGB10_A2UI, 500, CL_RGBA },
- { CM_SURF_FMT_RGB32F, 500, CL_RGBA },
- { CM_SURF_FMT_RGB32I, 500, CL_RGBA },
- { CM_SURF_FMT_RGB32UI, 500, CL_RGBA },
- { CM_SURF_FMT_RGBX8_SRGB, 500, CL_RGBA },
- { CM_SURF_FMT_RGBA8_SRGB, 500, CL_RGBA },
- { CM_SURF_FMT_DXT1_SRGB, 500, CL_RGBA },
- { CM_SURF_FMT_DXT1A_SRGB, 500, CL_RGBA },
- { CM_SURF_FMT_DXT2_3_SRGB, 500, CL_RGBA },
- { CM_SURF_FMT_DXT4_5_SRGB, 500, CL_RGBA },
- { CM_SURF_FMT_DXT7_SRGB, 500, CL_RGBA },
- { CM_SURF_FMT_RGB8_ETC2, 500, CL_RGB },
- { CM_SURF_FMT_SRGB8_ETC2, 500, CL_RGB },
- { CM_SURF_FMT_RGB8_PT_ALPHA1_ETC2, 500, CL_RGBA },
- { CM_SURF_FMT_SRGB8_PT_ALPHA1_ETC2, 500, CL_RGBA },
- { CM_SURF_FMT_RGBA8_ETC2_EAC, 500, CL_RGBA },
- { CM_SURF_FMT_SRGB8_ALPHA8_ETC2_EAC, 500, CL_RGBA },
- { CM_SURF_FMT_R11_EAC, 500, CL_R },
- { CM_SURF_FMT_SIGNED_R11_EAC, 500, CL_R },
- { CM_SURF_FMT_RG11_EAC, 500, CL_RG },
- { CM_SURF_FMT_SIGNED_RG11_EAC, 500, CL_RG },
- { CM_SURF_FMT_RGBA8_ASTC_4x4, 500, CL_RGBA },
- { CM_SURF_FMT_RGBA8_ASTC_5x4, 500, CL_RGBA },
- { CM_SURF_FMT_RGBA8_ASTC_5x5, 500, CL_RGBA },
- { CM_SURF_FMT_RGBA8_ASTC_6x5, 500, CL_RGBA },
- { CM_SURF_FMT_RGBA8_ASTC_6x6, 500, CL_RGBA },
- { CM_SURF_FMT_RGBA8_ASTC_8x5, 500, CL_RGBA },
- { CM_SURF_FMT_RGBA8_ASTC_8x6, 500, CL_RGBA },
- { CM_SURF_FMT_RGBA8_ASTC_8x8, 500, CL_RGBA },
- { CM_SURF_FMT_RGBA8_ASTC_10x5, 500, CL_RGBA },
- { CM_SURF_FMT_RGBA8_ASTC_10x6, 500, CL_RGBA },
- { CM_SURF_FMT_RGBA8_ASTC_10x8, 500, CL_RGBA },
- { CM_SURF_FMT_RGBA8_ASTC_10x10, 500, CL_RGBA },
- { CM_SURF_FMT_RGBA8_ASTC_12x10, 500, CL_RGBA },
- { CM_SURF_FMT_RGBA8_ASTC_12x12, 500, CL_RGBA },
- { CM_SURF_FMT_SRGBA8_ASTC_4x4, 500, CL_RGBA },
- { CM_SURF_FMT_SRGBA8_ASTC_5x4, 500, CL_RGBA },
- { CM_SURF_FMT_SRGBA8_ASTC_5x5, 500, CL_RGBA },
- { CM_SURF_FMT_SRGBA8_ASTC_6x5, 500, CL_RGBA },
- { CM_SURF_FMT_SRGBA8_ASTC_6x6, 500, CL_RGBA },
- { CM_SURF_FMT_SRGBA8_ASTC_8x5, 500, CL_RGBA },
- { CM_SURF_FMT_SRGBA8_ASTC_8x6, 500, CL_RGBA },
- { CM_SURF_FMT_SRGBA8_ASTC_8x8, 500, CL_RGBA },
- { CM_SURF_FMT_SRGBA8_ASTC_10x5, 500, CL_RGBA },
- { CM_SURF_FMT_SRGBA8_ASTC_10x6, 500, CL_RGBA },
- { CM_SURF_FMT_SRGBA8_ASTC_10x8, 500, CL_RGBA },
- { CM_SURF_FMT_SRGBA8_ASTC_10x10, 500, CL_RGBA },
- { CM_SURF_FMT_SRGBA8_ASTC_12x10, 500, CL_RGBA },
- { CM_SURF_FMT_SRGBA8_ASTC_12x12, 500, CL_RGBA },
- { CM_SURF_FMT_BGR10_A2UI, 500, CL_BGRA },
- { CM_SURF_FMT_A2_BGR10UI, 500, CL_ARGB },
- { CM_SURF_FMT_A2_RGB10UI, 500, CL_ABGR },
- { CM_SURF_FMT_B5_G6_R5UI, 500, CL_BGRA },
- { CM_SURF_FMT_R5_G6_B5UI, 500, CL_RGBA },
- { CM_SURF_FMT_DEPTH32F_X24_STEN8_UNCLAMPED,CL_UNSIGNED_INT32, CL_R },
- { CM_SURF_FMT_DEPTH32F_UNCLAMPED, CL_FLOAT, CL_R },
- { CM_SURF_FMT_L8_X16_A8_SRGB, 500, CL_RGBA },
- { CM_SURF_FMT_L8_X24_SRGB, 500, CL_RGBA },
- { CM_SURF_FMT_STENCIL8, CL_UNSIGNED_INT8, CL_R },
+static const cmFormatXlateParams cmFormatXlateTable[] = {  // one row per cmSurfFmt; 500 = not supported by OCL
+ {CM_SURF_FMT_LUMINANCE8, CL_UNORM_INT8, CL_LUMINANCE},
+ {CM_SURF_FMT_LUMINANCE16, CL_UNORM_INT16, CL_LUMINANCE},
+ {CM_SURF_FMT_LUMINANCE16F, CL_HALF_FLOAT, CL_LUMINANCE},
+ {CM_SURF_FMT_LUMINANCE32F, CL_FLOAT, CL_LUMINANCE},
+ {CM_SURF_FMT_INTENSITY8, CL_UNORM_INT8, CL_INTENSITY},
+ {CM_SURF_FMT_INTENSITY16, CL_UNORM_INT16, CL_INTENSITY},
+ {CM_SURF_FMT_INTENSITY16F, CL_HALF_FLOAT, CL_INTENSITY},
+ {CM_SURF_FMT_INTENSITY32F, CL_FLOAT, CL_INTENSITY},
+ {CM_SURF_FMT_ALPHA8, CL_UNSIGNED_INT8, CL_A},
+ {CM_SURF_FMT_ALPHA16, CL_UNORM_INT16, CL_A},
+ {CM_SURF_FMT_ALPHA16F, CL_HALF_FLOAT, CL_A},
+ {CM_SURF_FMT_ALPHA32F, CL_FLOAT, CL_A},
+ {CM_SURF_FMT_LUMINANCE8_ALPHA8, CL_UNSIGNED_INT8, CL_RG},
+ {CM_SURF_FMT_LUMINANCE16_ALPHA16, CL_UNSIGNED_INT16, CL_RG},
+ {CM_SURF_FMT_LUMINANCE16F_ALPHA16F, CL_HALF_FLOAT, CL_RG},
+ {CM_SURF_FMT_LUMINANCE32F_ALPHA32F, CL_FLOAT, CL_RG},
+ {CM_SURF_FMT_B2_G3_R3, 500, CL_R},
+ {CM_SURF_FMT_B5_G6_R5, CL_UNSIGNED_INT16, CL_RGB},
+ {CM_SURF_FMT_BGRX4, 500, CL_BGRA},
+ {CM_SURF_FMT_BGR5_X1, CL_UNSIGNED_INT16, CL_RGB},
+ {CM_SURF_FMT_BGRX8, CL_UNORM_INT8, CL_BGRA},
+ {CM_SURF_FMT_BGR10_X2, CL_UNORM_INT_101010, CL_RGB},
+ {CM_SURF_FMT_BGRX16, CL_UNORM_INT16, CL_BGRA},
+ {CM_SURF_FMT_BGRX16F, CL_HALF_FLOAT, CL_BGRA},
+ {CM_SURF_FMT_BGRX32F, CL_FLOAT, CL_BGRA},
+ {CM_SURF_FMT_RGBX4, 500, CL_RGB},
+ {CM_SURF_FMT_RGB5_X1, CL_UNORM_INT16, CL_BGRA},
+ {CM_SURF_FMT_RGBX8, CL_UNORM_INT8, CL_RGBA},
+ {CM_SURF_FMT_RGB10_X2, CL_UNORM_INT_101010, CL_RGBA},
+ {CM_SURF_FMT_RGBX16, CL_UNORM_INT16, CL_RGBA},
+ {CM_SURF_FMT_RGBX16F, CL_HALF_FLOAT, CL_RGBA},
+ {CM_SURF_FMT_RGBX32F, CL_FLOAT, CL_RGBA},
+ {CM_SURF_FMT_BGRA4, 500, CL_BGRA},
+ {CM_SURF_FMT_BGR5_A1, CL_UNSIGNED_INT16, CL_BGRA},
+ {CM_SURF_FMT_BGRA8, CL_UNORM_INT8, CL_BGRA},
+ {CM_SURF_FMT_BGR10_A2, 500, CL_BGRA},
+ {CM_SURF_FMT_BGRA16, CL_UNORM_INT16, CL_BGRA},
+ {CM_SURF_FMT_BGRA16F, CL_UNORM_INT16, CL_BGRA},  // NOTE(review): other *16F rows use CL_HALF_FLOAT - confirm
+ {CM_SURF_FMT_BGRA32F, CL_FLOAT, CL_BGRA},
+ {CM_SURF_FMT_RGBA4, 500, CL_RGBA},
+ {CM_SURF_FMT_RGB5_A1, CL_UNSIGNED_INT16, CL_RGBA},
+ {CM_SURF_FMT_RGBA8, CL_UNORM_INT8, CL_RGBA},
+ {CM_SURF_FMT_RGB10_A2, CL_UNORM_INT_101010, CL_RGB},
+ {CM_SURF_FMT_RGBA16, CL_UNORM_INT16, CL_RGBA},
+ {CM_SURF_FMT_RGBA16F, CL_HALF_FLOAT, CL_RGBA},
+ {CM_SURF_FMT_RGBA32I, CL_UNSIGNED_INT32, CL_RGBA},
+ {CM_SURF_FMT_RGBA32F, CL_FLOAT, CL_RGBA},
+ {CM_SURF_FMT_DUDV8, CL_UNSIGNED_INT8, CL_RG},
+ {CM_SURF_FMT_DXT1, 500, CL_R},
+ {CM_SURF_FMT_DXT2_3, 500, CL_R},
+ {CM_SURF_FMT_DXT4_5, 500, CL_R},
+ {CM_SURF_FMT_ATI1N, 500, CL_R},
+ {CM_SURF_FMT_ATI2N, 500, CL_R},
+ {CM_SURF_FMT_DEPTH16, CL_UNORM_INT16, CL_DEPTH},
+ {CM_SURF_FMT_DEPTH16F, CL_HALF_FLOAT, CL_DEPTH},
+ {CM_SURF_FMT_DEPTH24_X8, 500, CL_DEPTH},
+ {CM_SURF_FMT_DEPTH24F_X8, 500, CL_DEPTH},
+ {CM_SURF_FMT_DEPTH24_STEN8, CL_UNORM_INT24, CL_DEPTH_STENCIL},
+ {CM_SURF_FMT_DEPTH24F_STEN8, 500, CL_DEPTH_STENCIL},
+ {CM_SURF_FMT_DEPTH32F_X24_STEN8, CL_FLOAT, CL_DEPTH_STENCIL},
+ {CM_SURF_FMT_DEPTH32F, CL_FLOAT, CL_DEPTH},
+ {CM_SURF_FMT_sR11_sG11_sB10, 500, CL_R},
+ {CM_SURF_FMT_sU16, CL_SNORM_INT16, CL_R},
+ {CM_SURF_FMT_sUV16, CL_SNORM_INT16, CL_RG},
+ {CM_SURF_FMT_sUVWQ16, CL_SNORM_INT16, CL_RGBA},
+ {CM_SURF_FMT_RG16, CL_UNORM_INT16, CL_RG},
+ {CM_SURF_FMT_RG16F, CL_HALF_FLOAT, CL_RG},
+ {CM_SURF_FMT_RG32F, CL_FLOAT, CL_RG},
+ {CM_SURF_FMT_ABGR4, 500, CL_ARGB},
+ {CM_SURF_FMT_A1_BGR5, CL_UNSIGNED_INT16, CL_ARGB},
+ {CM_SURF_FMT_ABGR8, CL_UNORM_INT8, CL_ARGB},
+ {CM_SURF_FMT_A2_BGR10, CL_UNORM_INT_101010, CL_RGB},
+ {CM_SURF_FMT_ABGR16, CL_UNORM_INT16, CL_ARGB},
+ {CM_SURF_FMT_ABGR16F, CL_HALF_FLOAT, CL_ARGB},
+ {CM_SURF_FMT_ABGR32F, CL_FLOAT, CL_ARGB},
+ {CM_SURF_FMT_DXT1A, 500, CL_R},
+ {CM_SURF_FMT_sRGB10_A2, 500, CL_RGBA},
+ {CM_SURF_FMT_sR8, CL_SNORM_INT8, CL_R},
+ {CM_SURF_FMT_sRG8, CL_SNORM_INT8, CL_RG},
+ {CM_SURF_FMT_sR32I, CL_SIGNED_INT32, CL_R},
+ {CM_SURF_FMT_sRG32I, CL_SIGNED_INT32, CL_RG},
+ {CM_SURF_FMT_sRGBA32I, CL_SIGNED_INT32, CL_RGBA},
+ {CM_SURF_FMT_R32I, CL_UNSIGNED_INT32, CL_R},
+ {CM_SURF_FMT_RG32I, CL_UNSIGNED_INT32, CL_RG},
+ {CM_SURF_FMT_RG8, CL_UNORM_INT8, CL_RG},
+ {CM_SURF_FMT_sRGBA8, CL_SNORM_INT8, CL_RGBA},
+ {CM_SURF_FMT_R11F_G11F_B10F, 500, CL_RGBA},
+ {CM_SURF_FMT_RGB9_E5, CL_UNORM_INT8, CL_ARGB},
+ {CM_SURF_FMT_LUMINANCE_LATC1, 500, CL_RGBA},
+ {CM_SURF_FMT_SIGNED_LUMINANCE_LATC1, 500, CL_RGBA},
+ {CM_SURF_FMT_LUMINANCE_ALPHA_LATC2, 500, CL_RGBA},
+ {CM_SURF_FMT_SIGNED_LUMINANCE_ALPHA_LATC2, 500, CL_RGBA},
+ {CM_SURF_FMT_RED_RGTC1, 500, CL_RGBA},
+ {CM_SURF_FMT_SIGNED_RED_RGTC1, 500, CL_RGBA},
+ {CM_SURF_FMT_RED_GREEN_RGTC2, 500, CL_RGBA},
+ {CM_SURF_FMT_SIGNED_RED_GREEN_RGTC2, 500, CL_RGBA},
+ {CM_SURF_FMT_R8, CL_UNORM_INT8, CL_R},
+ {CM_SURF_FMT_R16, CL_UNORM_INT16, CL_R},
+ {CM_SURF_FMT_R16F, CL_HALF_FLOAT, CL_R},
+ {CM_SURF_FMT_R32F, CL_FLOAT, CL_R},
+ {CM_SURF_FMT_R8I, CL_UNSIGNED_INT8, CL_R},
+ {CM_SURF_FMT_sR8I, CL_SIGNED_INT8, CL_R},
+ {CM_SURF_FMT_RG8I, CL_UNSIGNED_INT8, CL_RG},
+ {CM_SURF_FMT_sRG8I, CL_SIGNED_INT8, CL_RG},
+ {CM_SURF_FMT_R16I, CL_UNSIGNED_INT16, CL_R},
+ {CM_SURF_FMT_sR16I, CL_SIGNED_INT16, CL_R},
+ {CM_SURF_FMT_RG16I, CL_UNSIGNED_INT16, CL_RG},
+ {CM_SURF_FMT_sRG16I, CL_SIGNED_INT16, CL_RG},
+ {CM_SURF_FMT_RGBA32UI, CL_UNSIGNED_INT32, CL_RGBA},
+ {CM_SURF_FMT_RGBX32UI, CL_UNSIGNED_INT32, CL_RGBA},
+ {CM_SURF_FMT_ALPHA32UI, CL_UNSIGNED_INT32, CL_R},
+ {CM_SURF_FMT_INTENSITY32UI, CL_UNSIGNED_INT32, CL_R},
+ {CM_SURF_FMT_LUMINANCE32UI, CL_UNSIGNED_INT32, CL_R},
+ {CM_SURF_FMT_LUMINANCE_ALPHA32UI, CL_UNSIGNED_INT32, CL_RG},
+ {CM_SURF_FMT_RGBA16UI, CL_UNSIGNED_INT16, CL_RGBA},
+ {CM_SURF_FMT_RGBX16UI, CL_UNSIGNED_INT16, CL_RGBA},
+ {CM_SURF_FMT_ALPHA16UI, CL_UNSIGNED_INT16, CL_R},
+ {CM_SURF_FMT_INTENSITY16UI, CL_UNSIGNED_INT16, CL_R},
+ {CM_SURF_FMT_LUMINANCE16UI, CL_UNSIGNED_INT16, CL_R},
+ {CM_SURF_FMT_LUMINANCE_ALPHA16UI, CL_UNSIGNED_INT32, CL_RG},  // NOTE(review): other 16UI rows use CL_UNSIGNED_INT16 - confirm
+ {CM_SURF_FMT_RGBA8UI, CL_UNSIGNED_INT8, CL_RGBA},
+ {CM_SURF_FMT_RGBX8UI, CL_UNORM_INT8, CL_RGBA},  // NOTE(review): other 8UI rows use CL_UNSIGNED_INT8 - confirm
+ {CM_SURF_FMT_ALPHA8UI, CL_UNSIGNED_INT8, CL_R},
+ {CM_SURF_FMT_INTENSITY8UI, CL_UNSIGNED_INT8, CL_R},
+ {CM_SURF_FMT_LUMINANCE8UI, CL_UNSIGNED_INT8, CL_R},
+ {CM_SURF_FMT_LUMINANCE_ALPHA8UI, CL_UNSIGNED_INT8, CL_RG},
+ {CM_SURF_FMT_sRGBX32I, CL_SIGNED_INT32, CL_RGBA},
+ {CM_SURF_FMT_sALPHA32I, CL_SIGNED_INT32, CL_R},
+ {CM_SURF_FMT_sINTENSITY32I, CL_SIGNED_INT32, CL_R},
+ {CM_SURF_FMT_sLUMINANCE32I, CL_SIGNED_INT32, CL_R},
+ {CM_SURF_FMT_sLUMINANCE_ALPHA32I, CL_SIGNED_INT32, CL_RG},
+ {CM_SURF_FMT_sRGBA16I, CL_SIGNED_INT16, CL_RGBA},
+ {CM_SURF_FMT_sRGBX16I, CL_SIGNED_INT16, CL_RGBA},
+ {CM_SURF_FMT_sALPHA16I, CL_SIGNED_INT16, CL_R},
+ {CM_SURF_FMT_sINTENSITY16I, CL_SIGNED_INT16, CL_R},
+ {CM_SURF_FMT_sLUMINANCE16I, CL_SIGNED_INT16, CL_R},
+ {CM_SURF_FMT_sLUMINANCE_ALPHA16I, CL_SIGNED_INT16, CL_RG},
+ {CM_SURF_FMT_sRGBA8I, CL_SIGNED_INT8, CL_RGBA},
+ {CM_SURF_FMT_sRGBX8I, CL_SIGNED_INT8, CL_RGBA},
+ {CM_SURF_FMT_sALPHA8I, CL_SIGNED_INT8, CL_R},
+ {CM_SURF_FMT_sINTENSITY8I, CL_SIGNED_INT8, CL_R},
+ {CM_SURF_FMT_sLUMINANCE8I, CL_SIGNED_INT8, CL_R},
+ {CM_SURF_FMT_sLUMINANCE_ALPHA8I, CL_SIGNED_INT8, CL_RG},  // was CM_SURF_FMT_sRG8I: a cmSurfFmt value, not a cl_channel_type
+ {CM_SURF_FMT_sDXT6, 500, CL_R},
+ {CM_SURF_FMT_DXT6, 500, CL_R},
+ {CM_SURF_FMT_DXT7, 500, CL_R},
+ {CM_SURF_FMT_LUMINANCE8_SNORM, CL_SNORM_INT8, CL_R},
+ {CM_SURF_FMT_LUMINANCE16_SNORM, CL_SNORM_INT16, CL_R},
+ {CM_SURF_FMT_INTENSITY8_SNORM, CL_SNORM_INT8, CL_R},
+ {CM_SURF_FMT_INTENSITY16_SNORM, CL_SNORM_INT16, CL_R},
+ {CM_SURF_FMT_ALPHA8_SNORM, CL_SNORM_INT8, CL_R},
+ {CM_SURF_FMT_ALPHA16_SNORM, CL_SNORM_INT16, CL_R},
+ {CM_SURF_FMT_LUMINANCE_ALPHA8_SNORM, CL_SNORM_INT8, CL_RG},
+ {CM_SURF_FMT_LUMINANCE_ALPHA16_SNORM, CL_SNORM_INT16, CL_RG},
+ {CM_SURF_FMT_R8_SNORM, CL_SNORM_INT8, CL_R},
+ {CM_SURF_FMT_R16_SNORM, CL_SNORM_INT16, CL_R},
+ {CM_SURF_FMT_RG8_SNORM, CL_SNORM_INT8, CL_RG},
+ {CM_SURF_FMT_RG16_SNORM, CL_SNORM_INT16, CL_RG},
+ {CM_SURF_FMT_RGBX8_SNORM, CL_SNORM_INT8, CL_RGBA},
+ {CM_SURF_FMT_RGBX16_SNORM, CL_SNORM_INT16, CL_RGBA},
+ {CM_SURF_FMT_RGBA8_SNORM, CL_SNORM_INT8, CL_RGBA},
+ {CM_SURF_FMT_RGBA16_SNORM, CL_SNORM_INT16, CL_RGBA},
+ {CM_SURF_FMT_RGB10_A2UI, 500, CL_RGBA},
+ {CM_SURF_FMT_RGB32F, 500, CL_RGBA},
+ {CM_SURF_FMT_RGB32I, 500, CL_RGBA},
+ {CM_SURF_FMT_RGB32UI, 500, CL_RGBA},
+ {CM_SURF_FMT_RGBX8_SRGB, 500, CL_RGBA},
+ {CM_SURF_FMT_RGBA8_SRGB, 500, CL_RGBA},
+ {CM_SURF_FMT_DXT1_SRGB, 500, CL_RGBA},
+ {CM_SURF_FMT_DXT1A_SRGB, 500, CL_RGBA},
+ {CM_SURF_FMT_DXT2_3_SRGB, 500, CL_RGBA},
+ {CM_SURF_FMT_DXT4_5_SRGB, 500, CL_RGBA},
+ {CM_SURF_FMT_DXT7_SRGB, 500, CL_RGBA},
+ {CM_SURF_FMT_RGB8_ETC2, 500, CL_RGB},
+ {CM_SURF_FMT_SRGB8_ETC2, 500, CL_RGB},
+ {CM_SURF_FMT_RGB8_PT_ALPHA1_ETC2, 500, CL_RGBA},
+ {CM_SURF_FMT_SRGB8_PT_ALPHA1_ETC2, 500, CL_RGBA},
+ {CM_SURF_FMT_RGBA8_ETC2_EAC, 500, CL_RGBA},
+ {CM_SURF_FMT_SRGB8_ALPHA8_ETC2_EAC, 500, CL_RGBA},
+ {CM_SURF_FMT_R11_EAC, 500, CL_R},
+ {CM_SURF_FMT_SIGNED_R11_EAC, 500, CL_R},
+ {CM_SURF_FMT_RG11_EAC, 500, CL_RG},
+ {CM_SURF_FMT_SIGNED_RG11_EAC, 500, CL_RG},
+ {CM_SURF_FMT_RGBA8_ASTC_4x4, 500, CL_RGBA},
+ {CM_SURF_FMT_RGBA8_ASTC_5x4, 500, CL_RGBA},
+ {CM_SURF_FMT_RGBA8_ASTC_5x5, 500, CL_RGBA},
+ {CM_SURF_FMT_RGBA8_ASTC_6x5, 500, CL_RGBA},
+ {CM_SURF_FMT_RGBA8_ASTC_6x6, 500, CL_RGBA},
+ {CM_SURF_FMT_RGBA8_ASTC_8x5, 500, CL_RGBA},
+ {CM_SURF_FMT_RGBA8_ASTC_8x6, 500, CL_RGBA},
+ {CM_SURF_FMT_RGBA8_ASTC_8x8, 500, CL_RGBA},
+ {CM_SURF_FMT_RGBA8_ASTC_10x5, 500, CL_RGBA},
+ {CM_SURF_FMT_RGBA8_ASTC_10x6, 500, CL_RGBA},
+ {CM_SURF_FMT_RGBA8_ASTC_10x8, 500, CL_RGBA},
+ {CM_SURF_FMT_RGBA8_ASTC_10x10, 500, CL_RGBA},
+ {CM_SURF_FMT_RGBA8_ASTC_12x10, 500, CL_RGBA},
+ {CM_SURF_FMT_RGBA8_ASTC_12x12, 500, CL_RGBA},
+ {CM_SURF_FMT_SRGBA8_ASTC_4x4, 500, CL_RGBA},
+ {CM_SURF_FMT_SRGBA8_ASTC_5x4, 500, CL_RGBA},
+ {CM_SURF_FMT_SRGBA8_ASTC_5x5, 500, CL_RGBA},
+ {CM_SURF_FMT_SRGBA8_ASTC_6x5, 500, CL_RGBA},
+ {CM_SURF_FMT_SRGBA8_ASTC_6x6, 500, CL_RGBA},
+ {CM_SURF_FMT_SRGBA8_ASTC_8x5, 500, CL_RGBA},
+ {CM_SURF_FMT_SRGBA8_ASTC_8x6, 500, CL_RGBA},
+ {CM_SURF_FMT_SRGBA8_ASTC_8x8, 500, CL_RGBA},
+ {CM_SURF_FMT_SRGBA8_ASTC_10x5, 500, CL_RGBA},
+ {CM_SURF_FMT_SRGBA8_ASTC_10x6, 500, CL_RGBA},
+ {CM_SURF_FMT_SRGBA8_ASTC_10x8, 500, CL_RGBA},
+ {CM_SURF_FMT_SRGBA8_ASTC_10x10, 500, CL_RGBA},
+ {CM_SURF_FMT_SRGBA8_ASTC_12x10, 500, CL_RGBA},
+ {CM_SURF_FMT_SRGBA8_ASTC_12x12, 500, CL_RGBA},
+ {CM_SURF_FMT_BGR10_A2UI, 500, CL_BGRA},
+ {CM_SURF_FMT_A2_BGR10UI, 500, CL_ARGB},
+ {CM_SURF_FMT_A2_RGB10UI, 500, CL_ABGR},
+ {CM_SURF_FMT_B5_G6_R5UI, 500, CL_BGRA},
+ {CM_SURF_FMT_R5_G6_B5UI, 500, CL_RGBA},
+ {CM_SURF_FMT_DEPTH32F_X24_STEN8_UNCLAMPED, CL_UNSIGNED_INT32, CL_R},
+ {CM_SURF_FMT_DEPTH32F_UNCLAMPED, CL_FLOAT, CL_R},
+ {CM_SURF_FMT_L8_X16_A8_SRGB, 500, CL_RGBA},
+ {CM_SURF_FMT_L8_X24_SRGB, 500, CL_RGBA},
+ {CM_SURF_FMT_STENCIL8, CL_UNSIGNED_INT8, CL_R},
};
bool Device::initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const {
@@ -557,8 +656,8 @@ bool Device::initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceCont
return false;
}
- pfnMesaGLInteropGLXQueryDeviceInfo = (PFNMesaGLInteropGLXQueryDeviceInfo)dlsym(
- pModule, "MesaGLInteropGLXQueryDeviceInfo");
+ pfnMesaGLInteropGLXQueryDeviceInfo =
+ (PFNMesaGLInteropGLXQueryDeviceInfo)dlsym(pModule, "MesaGLInteropGLXQueryDeviceInfo");
if (nullptr == pfnMesaGLInteropGLXQueryDeviceInfo) {
return false;
}
@@ -634,17 +733,17 @@ bool Device::glCanInterop(void* GLplatformContext, void* GLdeviceContext) const
((1 << properties().gpuIndex) == glChainBitMask);
}
#else
- GLuint glDeviceId = 0 ;
- GLuint glChainMask = 0 ;
+ GLuint glDeviceId = 0;
+ GLuint glChainMask = 0;
GLXContext ctx = static_cast<GLXContext>(GLplatformContext);
Display* disp = static_cast<Display*>(GLdeviceContext);
if (glXGetContextMVPUInfoAMD(ctx, &glDeviceId, &glChainMask)) {
- mesa_glinterop_device_info info = {};
+ mesa_glinterop_device_info info = {};
if (pfnMesaGLInteropGLXQueryDeviceInfo(disp, ctx, &info) == 0) {
- // match the adapter
- canInteroperate = (properties().pciProperties.busNumber == info.pci_bus) &&
+ // match the adapter
+ canInteroperate = (properties().pciProperties.busNumber == info.pci_bus) &&
(properties().pciProperties.deviceNumber == info.pci_device) &&
(properties().pciProperties.functionNumber == info.pci_function) &&
(static_cast<GLuint>(1 << properties().gpuIndex) == glChainMask);
@@ -749,7 +848,7 @@ bool Device::resGLAssociate(void* GLContext, uint name, uint type, Pal::OsExtern
return status;
}
assert(static_cast(hData.format) == cmFormatXlateTable[index].raw_cmFormat);
- cl_channel_type imageDataType;
+ cl_channel_type imageDataType;
imageDataType = cmFormatXlateTable[index].image_channel_data_type;
if (imageDataType == 500) {
LogError("\nGL surface is not supported by OCL\n");
@@ -819,4 +918,4 @@ bool Device::resGLFree(void* GLplatformContext, void* mbResHandle, uint type) co
#endif
}
-} // pal
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp b/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp
index 277d8dec86..ac6ee980be 100644
--- a/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp
@@ -32,34 +32,27 @@
#include "protocols/rgpServer.h"
#include "protocols/driverControlServer.h"
-namespace pal
-{
+namespace pal {
// ================================================================================================
RgpCaptureMgr::RgpCaptureMgr(Pal::IPlatform* platform, const Device& device)
- :
- device_(device),
- dev_driver_server_(platform->GetDevDriverServer()),
- user_event_(nullptr),
- num_prep_disp_(0),
- max_sqtt_disp_(device_.settings().rgpSqttDispCount_),
- trace_gpu_mem_limit_(0),
- global_disp_count_(1), // Must start from 1 according to RGP spec
- trace_enabled_(false),
- inst_tracing_enabled_(false)
-{
+ : device_(device),
+ dev_driver_server_(platform->GetDevDriverServer()),
+ user_event_(nullptr),
+ num_prep_disp_(0),
+ max_sqtt_disp_(device_.settings().rgpSqttDispCount_),
+ trace_gpu_mem_limit_(0),
+ global_disp_count_(1), // Must start from 1 according to RGP spec
+ trace_enabled_(false),
+ inst_tracing_enabled_(false) {
memset(&trace_, 0, sizeof(trace_));
}
// ================================================================================================
-RgpCaptureMgr::~RgpCaptureMgr()
-{
- DestroyRGPTracing();
-}
+RgpCaptureMgr::~RgpCaptureMgr() { DestroyRGPTracing(); }
// ================================================================================================
// Creates the GPU Open Developer Mode manager class.
-RgpCaptureMgr* RgpCaptureMgr::Create(Pal::IPlatform* platform, const Device& device)
-{
+RgpCaptureMgr* RgpCaptureMgr::Create(Pal::IPlatform* platform, const Device& device) {
RgpCaptureMgr* mgr = new RgpCaptureMgr(platform, device);
if (mgr != nullptr && !mgr->Init(platform)) {
@@ -71,8 +64,7 @@ RgpCaptureMgr* RgpCaptureMgr::Create(Pal::IPlatform* platform, const Device& dev
}
// ================================================================================================
-bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
-{
+bool RgpCaptureMgr::Init(Pal::IPlatform* platform) {
if (dev_driver_server_ == nullptr) {
return false;
}
@@ -105,13 +97,11 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
const uint32_t api_version = settings.oclVersion_;
- trace_.gpa_session_ = new GpuUtil::GpaSession(
- platform,
- device_.iDev(),
- api_version >> 4, // OCL API version major
- api_version & 0xf, // OCL API version minor
- RgpSqttInstrumentationSpecVersion,
- RgpSqttInstrumentationApiVersion);
+ trace_.gpa_session_ = new GpuUtil::GpaSession(platform, device_.iDev(),
+ api_version >> 4, // OCL API version major
+ api_version & 0xf, // OCL API version minor
+ RgpSqttInstrumentationSpecVersion,
+ RgpSqttInstrumentationApiVersion);
if (trace_.gpa_session_ == nullptr) {
result = false;
@@ -119,7 +109,7 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
}
// Initialize the GPA session
- if (result && (trace_.gpa_session_->Init() != Pal::Result::Success)) {
+ if (result && (trace_.gpa_session_->Init() != Pal::Result::Success)) {
result = false;
}
@@ -133,9 +123,9 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
if (!result) {
// If we've failed to initialize tracing, permanently disable traces
if (rgp_server_ != nullptr) {
- rgp_server_->DisableTraces();
+ rgp_server_->DisableTraces();
- trace_enabled_ = false;
+ trace_enabled_ = false;
}
// Clean up if we failed
@@ -150,9 +140,8 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
// ================================================================================================
// This function finds out all the queues in the device that we have to synchronize for RGP-traced
// frames and initializes resources for them.
-bool RgpCaptureMgr::RegisterTimedQueue(
- uint32_t queue_id, Pal::IQueue* iQueue, bool* debug_vmid) const
-{
+bool RgpCaptureMgr::RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue,
+ bool* debug_vmid) const {
bool result = true;
// Get the OS context handle for this queue (this is a thing that RGP needs on DX clients;
@@ -166,8 +155,8 @@ bool RgpCaptureMgr::RegisterTimedQueue(
*debug_vmid = kernelContextInfo.flags.hasDebugVmid;
// Register the queue with the GPA session class for timed queue operation support.
- if (trace_.gpa_session_->RegisterTimedQueue(iQueue, queue_id,
- kernelContextInfo.contextIdentifier) != Pal::Result::Success) {
+ if (trace_.gpa_session_->RegisterTimedQueue(
+ iQueue, queue_id, kernelContextInfo.contextIdentifier) != Pal::Result::Success) {
result = false;
}
@@ -175,11 +164,8 @@ bool RgpCaptureMgr::RegisterTimedQueue(
}
// ================================================================================================
-Pal::Result RgpCaptureMgr::TimedQueueSubmit(
- Pal::IQueue* queue,
- uint64_t cmdId,
- const Pal::SubmitInfo& submitInfo) const
-{
+Pal::Result RgpCaptureMgr::TimedQueueSubmit(Pal::IQueue* queue, uint64_t cmdId,
+ const Pal::SubmitInfo& submitInfo) const {
// Fill in extra meta-data information to associate the API command buffer data with
// the generated timing information.
GpuUtil::TimedSubmitInfo timedSubmitInfo = {};
@@ -205,8 +191,7 @@ Pal::Result RgpCaptureMgr::TimedQueueSubmit(
// Called during initial device enumeration prior to calling Pal::IDevice::CommitSettingsAndInit().
//
// This finalizes the developer driver manager.
-void RgpCaptureMgr::Finalize()
-{
+void RgpCaptureMgr::Finalize() {
// Figure out if the gfxip supports tracing. We decide tracing if there is at least one
// enumerated GPU that can support tracing. Since we don't yet know if that GPU will be
// picked as the target of an eventual VkDevice, this check is imperfect.
@@ -215,8 +200,8 @@ void RgpCaptureMgr::Finalize()
bool hw_support_tracing = false;
if ((rgp_server_->EnableTraces() == DevDriver::Result::Success)) {
- if (GpuSupportsTracing(device_.properties(), device_.settings())) {
- hw_support_tracing = true;
+ if (GpuSupportsTracing(device_.properties(), device_.settings())) {
+ hw_support_tracing = true;
}
}
@@ -234,20 +219,18 @@ void RgpCaptureMgr::Finalize()
// ================================================================================================
// Waits for the driver to be resumed if it's currently paused.
-void RgpCaptureMgr::WaitForDriverResume()
-{
- auto* pDriverControlServer = dev_driver_server_->GetDriverControlServer();
+void RgpCaptureMgr::WaitForDriverResume() {
+ auto* pDriverControlServer = dev_driver_server_->GetDriverControlServer();
- assert(pDriverControlServer != nullptr);
+ assert(pDriverControlServer != nullptr);
- pDriverControlServer->WaitForDriverResume();
+ pDriverControlServer->WaitForDriverResume();
}
// ================================================================================================
// Called before a swap chain presents. This signals a frame-end boundary and
// is used to coordinate RGP trace start/stop.
-void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu)
-{
+void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu) {
if (rgp_server_->TracesEnabled()) {
// If there's currently a trace running, submit the trace-end command buffer
if (trace_.status_ == TraceStatus::Running) {
@@ -257,8 +240,7 @@ void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu)
Pal::Result res = EndRGPHardwareTrace(gpu);
if (Pal::Result::ErrorIncompatibleQueue == res) {
// continue until we find the right queue...
- }
- else if (Pal::Result::Success == res) {
+ } else if (Pal::Result::Success == res) {
trace_.sqtt_disp_count_ = 0;
} else {
FinishRGPTrace(gpu, true);
@@ -272,43 +254,42 @@ void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu)
// Currently nothing in the PresentInfo struct is used for inserting a timed present marker.
GpuUtil::TimedQueuePresentInfo timedPresentInfo = {};
- //Pal::Result result = trace_.gpa_session_->TimedQueuePresent(pPalQueue, timedPresentInfo);
- //assert(result == Pal::Result::Success);
+ // Pal::Result result = trace_.gpa_session_->TimedQueuePresent(pPalQueue, timedPresentInfo);
+ // assert(result == Pal::Result::Success);
}
}
}
// ================================================================================================
-Pal::Result RgpCaptureMgr::CheckForTraceResults()
-{
+Pal::Result RgpCaptureMgr::CheckForTraceResults() {
assert(trace_.status_ == TraceStatus::WaitingForResults);
Pal::Result result = Pal::Result::NotReady;
// Check if trace results are ready
- if (trace_.gpa_session_->IsReady() && // GPA session is ready
- (trace_.begin_queue_->isDone(&trace_.end_event_))) // "Trace end" cmdbuf has retired
+ if (trace_.gpa_session_->IsReady() && // GPA session is ready
+ (trace_.begin_queue_->isDone(&trace_.end_event_))) // "Trace end" cmdbuf has retired
{
bool success = false;
// Fetch required trace data size from GPA session
size_t traceDataSize = 0;
- void* pTraceData = nullptr;
+ void* pTraceData = nullptr;
trace_.gpa_session_->GetResults(trace_.gpa_sample_id_, &traceDataSize, nullptr);
// Allocate memory for trace data
if (traceDataSize > 0) {
- pTraceData = amd::AlignedMemory::allocate(traceDataSize, 256);
+ pTraceData = amd::AlignedMemory::allocate(traceDataSize, 256);
}
if (pTraceData != nullptr) {
// Get trace data from GPA session
if (trace_.gpa_session_->GetResults(trace_.gpa_sample_id_, &traceDataSize, pTraceData) ==
- Pal::Result::Success) {
+ Pal::Result::Success) {
// Transmit trace data to anyone who's listening
- auto devResult = rgp_server_->WriteTraceData(
- static_cast(pTraceData), traceDataSize);
+ auto devResult =
+ rgp_server_->WriteTraceData(static_cast(pTraceData), traceDataSize);
success = (devResult == DevDriver::Result::Success);
}
@@ -317,7 +298,7 @@ Pal::Result RgpCaptureMgr::CheckForTraceResults()
}
if (success) {
- result = Pal::Result::Success;
+ result = Pal::Result::Success;
}
}
@@ -327,9 +308,8 @@ Pal::Result RgpCaptureMgr::CheckForTraceResults()
// ================================================================================================
// Called after a swap chain presents. This signals a (next) frame-begin boundary and is
// used to coordinate RGP trace start/stop.
-void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
- size_t x, size_t y, size_t z)
-{
+void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, size_t x, size_t y,
+ size_t z) {
// Wait for the driver to be resumed in case it's been paused.
WaitForDriverResume();
@@ -347,8 +327,7 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
}
}
}
- }
- else if (trace_.status_ == TraceStatus::Preparing) {
+ } else if (trace_.status_ == TraceStatus::Preparing) {
// Wait some number of "preparation frames" before starting the trace in order to get enough
// timer samples to sync CPU/GPU clock domains.
trace_.prepared_disp_count_++;
@@ -370,7 +349,7 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
// Check if we're ending a trace waiting for SQTT to turn off.
// If SQTT has turned off, end the trace
else if (trace_.status_ == TraceStatus::WaitingForSqtt) {
- Pal::Result result = Pal::Result::Success;
+ Pal::Result result = Pal::Result::Success;
if (trace_.begin_queue_->isDone(&trace_.end_sqtt_event_)) {
result = EndRGPTrace(gpu);
@@ -401,14 +380,17 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
RgpSqttMarkerEventType apiEvent = RgpSqttMarkerEventType::CmdNDRangeKernel;
if (kernel.prog().isInternal()) {
constexpr RgpSqttMarkerEventType ApiEvents[KernelBlitManager::BlitTotal] = {
- RgpSqttMarkerEventType::CmdCopyImage, RgpSqttMarkerEventType::CmdCopyImage,
- RgpSqttMarkerEventType::CmdCopyImageToBuffer,
- RgpSqttMarkerEventType::CmdCopyBufferToImage,
- RgpSqttMarkerEventType::CmdCopyBuffer, RgpSqttMarkerEventType::CmdCopyBuffer,
- RgpSqttMarkerEventType::CmdCopyBuffer, RgpSqttMarkerEventType::CmdCopyBuffer,
- RgpSqttMarkerEventType::CmdFillBuffer, RgpSqttMarkerEventType::CmdFillImage,
- RgpSqttMarkerEventType::CmdScheduler
- };
+ RgpSqttMarkerEventType::CmdCopyImage,
+ RgpSqttMarkerEventType::CmdCopyImage,
+ RgpSqttMarkerEventType::CmdCopyImageToBuffer,
+ RgpSqttMarkerEventType::CmdCopyBufferToImage,
+ RgpSqttMarkerEventType::CmdCopyBuffer,
+ RgpSqttMarkerEventType::CmdCopyBuffer,
+ RgpSqttMarkerEventType::CmdCopyBuffer,
+ RgpSqttMarkerEventType::CmdCopyBuffer,
+ RgpSqttMarkerEventType::CmdFillBuffer,
+ RgpSqttMarkerEventType::CmdFillImage,
+ RgpSqttMarkerEventType::CmdScheduler};
for (uint i = 0; i < KernelBlitManager::BlitTotal; ++i) {
if (kernel.name().compare(BlitName[i]) == 0) {
apiEvent = ApiEvents[i];
@@ -418,8 +400,8 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
}
WriteUserEventMarker(gpu, RgpSqttMarkerUserEventObjectName, kernel.name());
// Write disaptch marker
- WriteEventWithDimsMarker(gpu, apiEvent,
- static_cast<uint32_t>(x), static_cast<uint32_t>(y), static_cast<uint32_t>(z));
+ WriteEventWithDimsMarker(gpu, apiEvent, static_cast<uint32_t>(x), static_cast<uint32_t>(y),
+ static_cast<uint32_t>(z));
}
}
@@ -428,11 +410,11 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
// ================================================================================================
// This function starts preparing for an RGP trace. Preparation involves some N frames of
-// lead-up time during which timing samples are accumulated to synchronize CPU and GPU clock domains.
+// lead-up time during which timing samples are accumulated to synchronize CPU and GPU clock
+// domains.
//
// This function transitions from the Idle state to the Preparing state.
-Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu)
-{
+Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu) {
assert(trace_.status_ == TraceStatus::Idle);
// We can only trace using a single device at a time currently, so recreate RGP trace
@@ -441,32 +423,32 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu)
const auto traceParameters = rgp_server_->QueryTraceParameters();
- num_prep_disp_ = traceParameters.captureStartIndex;
+ num_prep_disp_ = traceParameters.captureStartIndex;
uint32_t capture_disp = traceParameters.captureStopIndex - traceParameters.captureStartIndex;
// Validate if the captured dispatches are in the range
if ((capture_disp > 0) && (capture_disp < max_sqtt_disp_)) {
max_sqtt_disp_ = capture_disp;
}
- trace_gpu_mem_limit_ = traceParameters.gpuMemoryLimitInMb * 1024 * 1024;
+ trace_gpu_mem_limit_ = traceParameters.gpuMemoryLimitInMb * 1024 * 1024;
inst_tracing_enabled_ = traceParameters.flags.enableInstructionTokens;
// Notify the RGP server that we are starting a trace
if (rgp_server_->BeginTrace() != DevDriver::Result::Success) {
- result = Pal::Result::ErrorUnknown;
+ result = Pal::Result::ErrorUnknown;
}
// Tell the GPA session class we're starting a trace
if (result == Pal::Result::Success) {
GpuUtil::GpaSessionBeginInfo info = {};
- info.flags.enableQueueTiming = true;// trace_.queueTimingEnabled;
+ info.flags.enableQueueTiming = true; // trace_.queueTimingEnabled;
result = trace_.gpa_session_->Begin(info);
}
trace_.prepared_disp_count_ = 0;
- trace_.sqtt_disp_count_ = 0;
+ trace_.sqtt_disp_count_ = 0;
// Sample the timing clocks prior to starting a trace.
if (result == Pal::Result::Success) {
@@ -476,7 +458,7 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu)
if (result == Pal::Result::Success) {
// Remember which queue started the trace
trace_.prepare_queue_ = gpu;
- trace_.begin_queue_ = nullptr;
+ trace_.begin_queue_ = nullptr;
trace_.status_ = TraceStatus::Preparing;
} else {
@@ -497,8 +479,7 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu)
// the "begin trace" information command buffer.
//
// This function transitions from the Preparing state to the Running state.
-Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu)
-{
+Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu) {
assert(trace_.status_ == TraceStatus::Preparing);
assert(trace_enabled_);
@@ -526,8 +507,8 @@ Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu)
// Fill GPU commands
gpu->eventBegin(MainEngine);
- trace_.gpa_sample_id_ = trace_.gpa_session_->BeginSample(
- gpu->queue(MainEngine).iCmd(), sampleConfig);
+ trace_.gpa_sample_id_ =
+ trace_.gpa_session_->BeginSample(gpu->queue(MainEngine).iCmd(), sampleConfig);
gpu->eventEnd(MainEngine, trace_.begin_sqtt_event_);
}
@@ -540,7 +521,7 @@ Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu)
// Make the trace active and remember which queue started it
if (result == Pal::Result::Success) {
- trace_.status_ = TraceStatus::Running;
+ trace_.status_ = TraceStatus::Running;
trace_.begin_queue_ = gpu;
}
@@ -551,8 +532,7 @@ Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu)
// This function submits the command buffer to stop SQTT tracing. Full tracing still continues.
//
// This function transitions from the Running state to the WaitingForSqtt state.
-Pal::Result RgpCaptureMgr::EndRGPHardwareTrace(VirtualGPU* gpu)
-{
+Pal::Result RgpCaptureMgr::EndRGPHardwareTrace(VirtualGPU* gpu) {
assert(trace_.status_ == TraceStatus::Running);
Pal::Result result = Pal::Result::Success;
@@ -593,8 +573,7 @@ Pal::Result RgpCaptureMgr::EndRGPHardwareTrace(VirtualGPU* gpu)
// This function ends a running RGP trace.
//
// This function transitions from the WaitingForSqtt state to WaitingForResults state.
-Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu)
-{
+Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu) {
assert(trace_.status_ == TraceStatus::WaitingForSqtt);
Pal::Result result = Pal::Result::Success;
@@ -629,8 +608,7 @@ Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu)
// ================================================================================================
// This function resets and possibly cancels a currently active (between begin/end) RGP trace.
// It frees any dependent resources.
-void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted)
-{
+void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted) {
if (trace_.prepare_queue_ == nullptr) {
return;
}
@@ -654,26 +632,25 @@ void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted)
// Reset tracing state to idle
trace_.prepared_disp_count_ = 0;
- trace_.sqtt_disp_count_ = 0;
- trace_.gpa_sample_id_ = 0;
- trace_.status_ = TraceStatus::Idle;
- trace_.prepare_queue_ = nullptr;
- trace_.begin_queue_ = nullptr;
+ trace_.sqtt_disp_count_ = 0;
+ trace_.gpa_sample_id_ = 0;
+ trace_.status_ = TraceStatus::Idle;
+ trace_.prepare_queue_ = nullptr;
+ trace_.begin_queue_ = nullptr;
}
// ================================================================================================
// Destroys device-persistent RGP resources
-void RgpCaptureMgr::DestroyRGPTracing()
-{
+void RgpCaptureMgr::DestroyRGPTracing() {
if (trace_.status_ != TraceStatus::Idle) {
- FinishRGPTrace(nullptr, true);
+ FinishRGPTrace(nullptr, true);
}
delete user_event_;
// Destroy the GPA session
if (trace_.gpa_session_ != nullptr) {
- //Util::Destructor(trace_.gpa_session_);
+ // Util::Destructor(trace_.gpa_session_);
delete trace_.gpa_session_;
trace_.gpa_session_ = nullptr;
}
@@ -683,18 +660,15 @@ void RgpCaptureMgr::DestroyRGPTracing()
// ================================================================================================
// Returns true if the given device properties/settings support tracing.
-bool RgpCaptureMgr::GpuSupportsTracing(
- const Pal::DeviceProperties& props,
- const Settings& settings)
-{
+bool RgpCaptureMgr::GpuSupportsTracing(const Pal::DeviceProperties& props,
+ const Settings& settings) {
return props.gfxipProperties.flags.supportRgpTraces && !settings.rgpSqttForceDisable_;
}
// ================================================================================================
// Called when a new device is created. This will preallocate reusable RGP trace resources
// for that device.
-void RgpCaptureMgr::PostDeviceCreate()
-{
+void RgpCaptureMgr::PostDeviceCreate() {
amd::ScopedLock traceLock(&trace_mutex_);
auto* pDriverControlServer = dev_driver_server_->GetDriverControlServer();
@@ -714,8 +688,7 @@ void RgpCaptureMgr::PostDeviceCreate()
// ================================================================================================
// Called prior to a device's being destroyed. This will free persistent RGP trace resources for
// that device.
-void RgpCaptureMgr::PreDeviceDestroy()
-{
+void RgpCaptureMgr::PreDeviceDestroy() {
amd::ScopedLock traceLock(&trace_mutex_);
// If we are idle, we can re-initialize trace resources based on the new device.
if (trace_.status_ == TraceStatus::Idle) {
@@ -725,9 +698,8 @@ void RgpCaptureMgr::PreDeviceDestroy()
// ================================================================================================
// Sets up an Event marker's basic data.
-RgpSqttMarkerEvent RgpCaptureMgr::BuildEventMarker(
- const VirtualGPU* gpu, RgpSqttMarkerEventType api_type) const
-{
+RgpSqttMarkerEvent RgpCaptureMgr::BuildEventMarker(const VirtualGPU* gpu,
+ RgpSqttMarkerEventType api_type) const {
RgpSqttMarkerEvent marker = {};
marker.identifier = RgpSqttMarkerIdentifierEvent;
@@ -739,24 +711,19 @@ RgpSqttMarkerEvent RgpCaptureMgr::BuildEventMarker(
}
// ================================================================================================
-void RgpCaptureMgr::WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const
-{
+void RgpCaptureMgr::WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const {
assert((data_size % sizeof(uint32_t)) == 0);
assert((data_size / sizeof(uint32_t)) > 0);
- gpu->queue(MainEngine).iCmd()->CmdInsertRgpTraceMarker(
- static_cast<uint32_t>(data_size / sizeof(uint32_t)), data);
+ gpu->queue(MainEngine)
+ .iCmd()
+ ->CmdInsertRgpTraceMarker(static_cast<uint32_t>(data_size / sizeof(uint32_t)), data);
}
// ================================================================================================
// Inserts an RGP pre-dispatch marker
-void RgpCaptureMgr::WriteEventWithDimsMarker(
- const VirtualGPU* gpu,
- RgpSqttMarkerEventType apiType,
- uint32_t x,
- uint32_t y,
- uint32_t z) const
-{
+void RgpCaptureMgr::WriteEventWithDimsMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType apiType,
+ uint32_t x, uint32_t y, uint32_t z) const {
assert(apiType != RgpSqttMarkerEventType::Invalid);
RgpSqttMarkerEventWithDims eventWithDims = {};
@@ -771,26 +738,24 @@ void RgpCaptureMgr::WriteEventWithDimsMarker(
}
// ================================================================================================
-void RgpCaptureMgr::WriteBarrierStartMarker(
- const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const
-{
+void RgpCaptureMgr::WriteBarrierStartMarker(const VirtualGPU* gpu,
+ const Pal::Developer::BarrierData& data) const {
if (rgp_server_->TracesEnabled() && (trace_.status_ == TraceStatus::Running)) {
amd::ScopedLock traceLock(&trace_mutex_);
RgpSqttMarkerBarrierStart marker = {};
marker.identifier = RgpSqttMarkerIdentifierBarrierStart;
- marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();
- marker.dword02 = data.reason;
- marker.internal = true;
+ marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();
+ marker.dword02 = data.reason;
+ marker.internal = true;
WriteMarker(gpu, &marker, sizeof(marker));
}
}
// ================================================================================================
-void RgpCaptureMgr::WriteBarrierEndMarker(
- const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const
-{
+void RgpCaptureMgr::WriteBarrierEndMarker(const VirtualGPU* gpu,
+ const Pal::Developer::BarrierData& data) const {
if (rgp_server_->TracesEnabled() && (trace_.status_ == TraceStatus::Running)) {
amd::ScopedLock traceLock(&trace_mutex_);
// Copy the operations part and include the same data from previous markers
@@ -799,28 +764,28 @@ void RgpCaptureMgr::WriteBarrierEndMarker(
auto operations = data.operations;
operations.pipelineStalls.u16All |= 0;
- operations.caches.u16All |= 0;
+ operations.caches.u16All |= 0;
RgpSqttMarkerBarrierEnd marker = {};
- marker.identifier = RgpSqttMarkerIdentifierBarrierEnd;
- marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();
+ marker.identifier = RgpSqttMarkerIdentifierBarrierEnd;
+ marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();
- marker.waitOnEopTs = operations.pipelineStalls.waitOnEopTsBottomOfPipe;
- marker.vsPartialFlush = operations.pipelineStalls.vsPartialFlush;
- marker.psPartialFlush = operations.pipelineStalls.psPartialFlush;
- marker.csPartialFlush = operations.pipelineStalls.csPartialFlush;
- marker.pfpSyncMe = operations.pipelineStalls.pfpSyncMe;
- marker.syncCpDma = operations.pipelineStalls.syncCpDma;
- marker.invalTcp = operations.caches.invalTcp;
- marker.invalSqI = operations.caches.invalSqI$;
- marker.invalSqK = operations.caches.invalSqK$;
- marker.flushTcc = operations.caches.flushTcc;
- marker.invalTcc = operations.caches.invalTcc;
- marker.flushCb = operations.caches.flushCb;
- marker.invalCb = operations.caches.invalCb;
- marker.flushDb = operations.caches.flushDb;
- marker.invalDb = operations.caches.invalDb;
+ marker.waitOnEopTs = operations.pipelineStalls.waitOnEopTsBottomOfPipe;
+ marker.vsPartialFlush = operations.pipelineStalls.vsPartialFlush;
+ marker.psPartialFlush = operations.pipelineStalls.psPartialFlush;
+ marker.csPartialFlush = operations.pipelineStalls.csPartialFlush;
+ marker.pfpSyncMe = operations.pipelineStalls.pfpSyncMe;
+ marker.syncCpDma = operations.pipelineStalls.syncCpDma;
+ marker.invalTcp = operations.caches.invalTcp;
+ marker.invalSqI = operations.caches.invalSqI$;
+ marker.invalSqK = operations.caches.invalSqK$;
+ marker.flushTcc = operations.caches.flushTcc;
+ marker.invalTcc = operations.caches.invalTcc;
+ marker.flushCb = operations.caches.flushCb;
+ marker.invalCb = operations.caches.invalCb;
+ marker.flushDb = operations.caches.flushDb;
+ marker.invalDb = operations.caches.invalDb;
marker.numLayoutTransitions = 0;
@@ -830,9 +795,9 @@ void RgpCaptureMgr::WriteBarrierEndMarker(
// ================================================================================================
// Inserts a user event string marker
-void RgpCaptureMgr::WriteUserEventMarker(
- const VirtualGPU* gpu, RgpSqttMarkerUserEventType eventType, const std::string& name) const
-{
+void RgpCaptureMgr::WriteUserEventMarker(const VirtualGPU* gpu,
+ RgpSqttMarkerUserEventType eventType,
+ const std::string& name) const {
memset(user_event_, 0, sizeof(RgpSqttMarkerUserEventWithString));
user_event_->header.identifier = RgpSqttMarkerIdentifierUserEvent;
@@ -841,7 +806,8 @@ void RgpCaptureMgr::WriteUserEventMarker(
size_t markerSize = sizeof(user_event_->header);
if ((eventType != RgpSqttMarkerUserEventPop)) {
- size_t strLength = std::min(name.size(), RgpSqttMaxUserEventStringLengthInDwords * sizeof(uint32_t));
+ size_t strLength =
+ std::min(name.size(), RgpSqttMaxUserEventStringLengthInDwords * sizeof(uint32_t));
for (uint32_t charIdx = 0; charIdx < strLength; ++charIdx) {
uint32_t c = static_cast<uint32_t>(name[charIdx]);
user_event_->stringData[charIdx / 4] |= (c << (8 * (charIdx % 4)));
@@ -859,4 +825,4 @@ void RgpCaptureMgr::WriteUserEventMarker(
}
-}; // namespace vk
+}; // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp b/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp
index 52789a581e..af56f6efd3 100644
--- a/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp
@@ -34,42 +34,36 @@
#include "gpuopen.h"
// PAL forward declarations
-namespace Pal
-{
-class ICmdBuffer;
-class IFence;
-class IQueueSemaphore;
+namespace Pal {
+class ICmdBuffer;
+class IFence;
+class IQueueSemaphore;
struct PalPublicSettings;
-}
+} // namespace Pal
// GpuUtil forward declarations
-namespace GpuUtil
-{
+namespace GpuUtil {
class GpaSession;
};
// GPUOpen forward declarations
-namespace DevDriver
-{
+namespace DevDriver {
class DevDriverServer;
class IMsgChannel;
struct MessageBuffer;
-namespace DriverControlProtocol
-{
+namespace DriverControlProtocol {
enum struct DeviceClockMode : uint32_t;
class HandlerServer;
-}
+} // namespace DriverControlProtocol
-namespace SettingsProtocol
-{
+namespace SettingsProtocol {
class HandlerServer;
}
-}
+} // namespace DevDriver
-namespace pal
-{
+namespace pal {
class Settings;
class Device;
class VirtualGPU;
@@ -77,8 +71,7 @@ class HSAILKernel;
// ================================================================================================
// RgpSqttMarkerIdentifier - Identifiers for RGP SQ thread-tracing markers (Table 1)
-enum RgpSqttMarkerIdentifier : uint32_t
-{
+enum RgpSqttMarkerIdentifier : uint32_t {
RgpSqttMarkerIdentifierEvent = 0x0,
RgpSqttMarkerIdentifierCbStart = 0x1,
RgpSqttMarkerIdentifierCbEnd = 0x2,
@@ -98,8 +91,7 @@ enum RgpSqttMarkerIdentifier : uint32_t
};
// ================================================================================================
-enum class RgpSqttMarkerEventType : uint32_t
-{
+enum class RgpSqttMarkerEventType : uint32_t {
CmdNDRangeKernel = 0,
CmdScheduler = 1,
CmdCopyBuffer = 2,
@@ -114,8 +106,7 @@ enum class RgpSqttMarkerEventType : uint32_t
};
// ================================================================================================
-enum class RgpSqqtBarrierReason : uint32_t
-{
+enum class RgpSqqtBarrierReason : uint32_t {
Invalid = 0,
MemDependency = 0xC0000000,
ProfilingControl = 0xC0000001,
@@ -125,129 +116,116 @@ enum class RgpSqqtBarrierReason : uint32_t
};
// ================================================================================================
-// RgpSqttMarkerEvent - "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker.
+// RgpSqttMarkerEvent - "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker.
// These are generated ahead of draws or dispatches for commands that trigger generation of waves
// i.e. draws/dispatches (Table 4).
-struct RgpSqttMarkerEvent
-{
- union
- {
- struct
- {
- uint32_t identifier : 4; // Identifier for this marker
- uint32_t extDwords : 3; // Number of extra dwords following this marker
- uint32_t apiType : 24; // The API type for this command
- uint32_t hasThreadDims : 1; // Whether thread dimensions are included
+struct RgpSqttMarkerEvent {
+ union {
+ struct {
+ uint32_t identifier : 4; // Identifier for this marker
+ uint32_t extDwords : 3; // Number of extra dwords following this marker
+ uint32_t apiType : 24; // The API type for this command
+ uint32_t hasThreadDims : 1; // Whether thread dimensions are included
};
- uint32_t dword01; // The first dword
+ uint32_t dword01; // The first dword
};
- union
- {
- // Some information about the vertex/instance/draw register indices. These values are not
+ union {
+ // Some information about the vertex/instance/draw register indices. These values are not
// always valid because they are not available for one reason or another:
//
// - If vertex offset index or instance offset index are not (together) valid, they are both
// equal to 0
// - If draw index is not valid, it is equal to the vertex offset index
- struct
- {
- uint32_t cbID : 20; // Command buffer ID for this marker
+ struct {
+ uint32_t cbID : 20; // Command buffer ID for this marker
uint32_t vertexOffsetRegIdx : 4; // SPI userdata register index for the first vertex offset
- uint32_t instanceOffsetRegIdx : 4; // SPI userdata register index for the first instance offset
- uint32_t drawIndexRegIdx : 4; // SPI userdata register index for the draw index (multi draw indirect)
+ uint32_t
+ instanceOffsetRegIdx : 4; // SPI userdata register index for the first instance offset
+ uint32_t drawIndexRegIdx : 4; // SPI userdata register index for the draw index (multi draw
+ // indirect)
};
- uint32_t dword02; // The second dword
+ uint32_t dword02; // The second dword
};
- union
- {
- uint32_t cmdID; // Command index within the command buffer
- uint32_t dword03; // The third dword
+ union {
+ uint32_t cmdID; // Command index within the command buffer
+ uint32_t dword03; // The third dword
};
};
// ================================================================================================
// RgpSqttMarkerEventWithDims - Per-dispatch specific marker where workgroup dims are included
-struct RgpSqttMarkerEventWithDims
-{
- RgpSqttMarkerEvent event; // Per-draw/dispatch marker. API type should be Dispatch, threadDim = 1
- uint32_t threadX; // Work group count in X
- uint32_t threadY; // Work group count in Y
- uint32_t threadZ; // Work group count in Z
+struct RgpSqttMarkerEventWithDims {
+ RgpSqttMarkerEvent
+ event; // Per-draw/dispatch marker. API type should be Dispatch, threadDim = 1
+ uint32_t threadX; // Work group count in X
+ uint32_t threadY; // Work group count in Y
+ uint32_t threadZ; // Work group count in Z
};
// ================================================================================================
// RgpSqttMarkerBarrierStart - "Barrier Start" RGP SQTT instrumentation marker (Table 5)
-struct RgpSqttMarkerBarrierStart
-{
- union
- {
- struct
- {
+struct RgpSqttMarkerBarrierStart {
+ union {
+ struct {
uint32_t identifier : 4; // Identifier for this marker
uint32_t extDwords : 3; // Number of extra dwords following this marker
uint32_t cbId : 20; // Command buffer ID within queue
uint32_t reserved : 5; // Reserved
};
- uint32_t dword01; // The first dword
+ uint32_t dword01; // The first dword
};
- union
- {
- struct
- {
+ union {
+ struct {
uint32_t driverReason : 31;
- uint32_t internal: 1;
+ uint32_t internal : 1;
};
- uint32_t dword02; // The second dword
+ uint32_t dword02; // The second dword
};
};
// ================================================================================================
// RgpSqttMarkerBarrierEnd - "Barrier End" RGP SQTT instrumentation marker (Table 6)
-struct RgpSqttMarkerBarrierEnd
-{
- union
- {
- struct
- {
- uint32_t identifier : 4; // Identifier for this marker
- uint32_t extDwords : 3; // Number of extra dwords following this marker
- uint32_t cbId : 20; // Command buffer ID within queue
- uint32_t waitOnEopTs : 1; // Issued EOP_TS VGT event followed by a WAIT_REG_MEM for that timestamp
- // to be written. Quintessential full pipeline stall.
+struct RgpSqttMarkerBarrierEnd {
+ union {
+ struct {
+ uint32_t identifier : 4; // Identifier for this marker
+ uint32_t extDwords : 3; // Number of extra dwords following this marker
+ uint32_t cbId : 20; // Command buffer ID within queue
+ uint32_t waitOnEopTs : 1; // Issued EOP_TS VGT event followed by a WAIT_REG_MEM for that
+ // timestamp to be written. Quintessential full pipeline stall.
uint32_t vsPartialFlush : 1; // Stall at ME waiting for all prior VS waves to complete.
uint32_t psPartialFlush : 1; // Stall at ME waiting for all prior PS waves to complete.
uint32_t csPartialFlush : 1; // Stall at ME waiting for all prior CS waves to complete.
- uint32_t pfpSyncMe : 1; // Stall PFP until ME is at same point in command stream.
+ uint32_t pfpSyncMe : 1; // Stall PFP until ME is at same point in command stream.
};
- uint32_t dword01; // The first dword
+ uint32_t dword01; // The first dword
};
- union
- {
- struct
- {
- uint32_t syncCpDma : 1; // Issue dummy CP-DMA command to confirm all prior CP-DMAs have completed.
+ union {
+ struct {
+ uint32_t
+ syncCpDma : 1; // Issue dummy CP-DMA command to confirm all prior CP-DMAs have completed.
uint32_t invalTcp : 1; // Invalidate the L1 vector caches.
uint32_t invalSqI : 1; // Invalidate the SQ instruction caches
uint32_t invalSqK : 1; // Invalidate the SQ constant caches (i.e. L1 scalar caches)
uint32_t flushTcc : 1; // Flush L2
uint32_t invalTcc : 1; // Invalidate L2
- uint32_t flushCb : 1; // Flush CB caches (including DCC, cmask, fmask)
- uint32_t invalCb : 1; // Invalidate CB caches (including DCC, cmask, fmask)
- uint32_t flushDb : 1; // Flush DB caches (including htile)
- uint32_t invalDb : 1; // Invalidate DB caches (including htile)
- uint32_t numLayoutTransitions : 16; // Number of layout transitions following this packet
- uint32_t reserved : 6; // Reserved for future expansion. Always 0
+ uint32_t flushCb : 1; // Flush CB caches (including DCC, cmask, fmask)
+ uint32_t invalCb : 1; // Invalidate CB caches (including DCC, cmask, fmask)
+ uint32_t flushDb : 1; // Flush DB caches (including htile)
+ uint32_t invalDb : 1; // Invalidate DB caches (including htile)
+ uint32_t numLayoutTransitions : 16; // Number of layout transitions following this packet
+ uint32_t reserved : 6; // Reserved for future expansion. Always 0
};
- uint32_t dword02; // The second dword
+ uint32_t dword02; // The second dword
};
};
@@ -255,33 +233,31 @@ struct RgpSqttMarkerBarrierEnd
constexpr uint32_t RgpSqttInstrumentationSpecVersion = 1;
// RGP SQTT Instrumentation Specification version for Vulkan-specific tables
-constexpr uint32_t RgpSqttInstrumentationApiVersion = 0;
+constexpr uint32_t RgpSqttInstrumentationApiVersion = 0;
-// RgpSqttMarkeUserEventDataType - Data types used in RGP SQ thread-tracing markers for an user event
-enum RgpSqttMarkerUserEventType : uint32_t
-{
- RgpSqttMarkerUserEventTrigger = 0x0,
- RgpSqttMarkerUserEventPop = 0x1,
- RgpSqttMarkerUserEventPush = 0x2,
- RgpSqttMarkerUserEventObjectName = 0x3,
- RgpSqttMarkerUserEventReserved1 = 0x4,
- RgpSqttMarkerUserEventReserved2 = 0x5,
- RgpSqttMarkerUserEventReserved3 = 0x6,
- RgpSqttMarkerUserEventReserved4 = 0x7,
+// RgpSqttMarkerUserEventType - Data types used in RGP SQ thread-tracing markers for a user
+// event
+enum RgpSqttMarkerUserEventType : uint32_t {
+ RgpSqttMarkerUserEventTrigger = 0x0,
+ RgpSqttMarkerUserEventPop = 0x1,
+ RgpSqttMarkerUserEventPush = 0x2,
+ RgpSqttMarkerUserEventObjectName = 0x3,
+ RgpSqttMarkerUserEventReserved1 = 0x4,
+ RgpSqttMarkerUserEventReserved2 = 0x5,
+ RgpSqttMarkerUserEventReserved3 = 0x6,
+ RgpSqttMarkerUserEventReserved4 = 0x7,
};
// RgpSqttMarkerUserEvent - RGP SQ thread-tracing marker for an user event.
-union RgpSqttMarkerUserEvent
-{
- struct
- {
- uint32_t identifier : 4; // Identifier for this marker
- uint32_t extDwords : 8; // Number of extra dwords following this marker
- uint32_t dataType : 8; // The type for this marker
- uint32_t reserved : 12; // reserved
- };
+union RgpSqttMarkerUserEvent {
+ struct {
+ uint32_t identifier : 4; // Identifier for this marker
+ uint32_t extDwords : 8; // Number of extra dwords following this marker
+ uint32_t dataType : 8; // The type for this marker
+ uint32_t reserved : 12; // reserved
+ };
- uint32_t dword01; // The first dword
+ uint32_t dword01; // The first dword
};
constexpr uint32_t RgpSqttMarkerUserEventWordCount = 1;
@@ -289,21 +265,20 @@ constexpr uint32_t RgpSqttMarkerUserEventWordCount = 1;
// The max lengths of frame marker strings
static constexpr size_t RgpSqttMaxUserEventStringLengthInDwords = 1024;
-// RgpSqttMarkerUserEvent - RGP SQ thread-tracing marker for an user event with a string (push and trigger data types)
-struct RgpSqttMarkerUserEventWithString
-{
- RgpSqttMarkerUserEvent header;
+// RgpSqttMarkerUserEventWithString - RGP SQ thread-tracing marker for a user event with a string
+// (push and trigger data types)
+struct RgpSqttMarkerUserEventWithString {
+ RgpSqttMarkerUserEvent header;
- uint32_t stringLength; // Length of the string (in characters)
- uint32_t stringData[RgpSqttMaxUserEventStringLengthInDwords]; // String data in UTF-8 format
+ uint32_t stringLength; // Length of the string (in characters)
+ uint32_t stringData[RgpSqttMaxUserEventStringLengthInDwords]; // String data in UTF-8 format
};
// ================================================================================================
// This class provides functionality to interact with the GPU Open Developer Mode message passing
// service and the rest of the driver.
-class RgpCaptureMgr
-{
-public:
+class RgpCaptureMgr {
+ public:
~RgpCaptureMgr();
static RgpCaptureMgr* Create(Pal::IPlatform* platform, const Device& device);
@@ -321,45 +296,42 @@ public:
bool IsQueueTimingActive() const;
- void WriteBarrierStartMarker(
- const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const;
- void WriteBarrierEndMarker(
- const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const;
+ void WriteBarrierStartMarker(const VirtualGPU* gpu,
+ const Pal::Developer::BarrierData& data) const;
+ void WriteBarrierEndMarker(const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const;
bool RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue, bool* debug_vmid) const;
- Pal::Result TimedQueueSubmit(
- Pal::IQueue* queue, uint64_t cmdId, const Pal::SubmitInfo& submitInfo) const;
+ Pal::Result TimedQueueSubmit(Pal::IQueue* queue, uint64_t cmdId,
+ const Pal::SubmitInfo& submitInfo) const;
-private:
+ private:
// Steps that an RGP trace goes through
- enum class TraceStatus
- {
- Idle = 0, // No active trace and none requested
- Preparing, // A trace has been requested but is not active yet because we are
- // currently sampling timing information over some number of lead frames.
- Running, // SQTT and queue timing is currently active for all command buffer submits.
- WaitingForSqtt,
- WaitingForResults // Tracing is no longer active, but all results are not yet ready.
+ enum class TraceStatus {
+ Idle = 0, // No active trace and none requested
+ Preparing, // A trace has been requested but is not active yet because we are
+ // currently sampling timing information over some number of lead frames.
+ Running, // SQTT and queue timing is currently active for all command buffer submits.
+ WaitingForSqtt,
+ WaitingForResults // Tracing is no longer active, but all results are not yet ready.
};
// All per-device state to support RGP tracing
- struct TraceState
- {
- TraceStatus status_; // Current trace status (idle, running, etc.)
+ struct TraceState {
+ TraceStatus status_; // Current trace status (idle, running, etc.)
- GpuEvent begin_sqtt_event_; // Event that is signaled when a trace-end cmdbuf retires
- GpuEvent end_sqtt_event_; // Event that is signaled when a trace-end cmdbuf retires
- GpuEvent end_event_; // Event that is signaled when a trace-end cmdbuf retires
+ GpuEvent begin_sqtt_event_; // Event that is signaled when a trace-end cmdbuf retires
+ GpuEvent end_sqtt_event_; // Event that is signaled when a trace-end cmdbuf retires
+ GpuEvent end_event_; // Event that is signaled when a trace-end cmdbuf retires
- VirtualGPU* prepare_queue_; // The queue that triggered the full start of a trace
- VirtualGPU* begin_queue_; // The queue that triggered starting SQTT
+ VirtualGPU* prepare_queue_; // The queue that triggered the full start of a trace
+ VirtualGPU* begin_queue_; // The queue that triggered starting SQTT
- GpuUtil::GpaSession* gpa_session_; // GPA session helper object for building RGP data
- uint32_t gpa_sample_id_; // Sample ID associated with the current trace
- bool queue_timing_; // Queue timing is enabled
+ GpuUtil::GpaSession* gpa_session_; // GPA session helper object for building RGP data
+ uint32_t gpa_sample_id_; // Sample ID associated with the current trace
+ bool queue_timing_; // Queue timing is enabled
- uint32_t prepared_disp_count_; // Number of dispatches counted while preparing for a trace
- uint32_t sqtt_disp_count_; // Number of dispatches counted while SQTT tracing is active
- mutable uint32_t current_event_id_; // Current event ID
+ uint32_t prepared_disp_count_; // Number of dispatches counted while preparing for a trace
+ uint32_t sqtt_disp_count_; // Number of dispatches counted while SQTT tracing is active
+ mutable uint32_t current_event_id_; // Current event ID
};
RgpCaptureMgr(Pal::IPlatform* platform, const Device& device);
@@ -374,25 +346,25 @@ private:
static bool GpuSupportsTracing(const Pal::DeviceProperties& props, const Settings& settings);
RgpSqttMarkerEvent BuildEventMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType api_type) const;
void WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const;
- void WriteEventWithDimsMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType apiType,
- uint32_t x, uint32_t y, uint32_t z) const;
+ void WriteEventWithDimsMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType apiType, uint32_t x,
+ uint32_t y, uint32_t z) const;
void WriteUserEventMarker(const VirtualGPU* gpu, RgpSqttMarkerUserEventType eventType,
- const std::string& name) const;
+ const std::string& name) const;
- const Device& device_;
+ const Device& device_;
DevDriver::DevDriverServer* dev_driver_server_;
DevDriver::RGPProtocol::RGPServer* rgp_server_;
- mutable amd::Monitor trace_mutex_;
- TraceState trace_;
+ mutable amd::Monitor trace_mutex_;
+ TraceState trace_;
RgpSqttMarkerUserEventWithString* user_event_;
- uint32_t num_prep_disp_;
- uint32_t max_sqtt_disp_; // Maximum number of the dispatches allowed in the trace
- uint32_t trace_gpu_mem_limit_;
- uint32_t global_disp_count_;
+ uint32_t num_prep_disp_;
+ uint32_t max_sqtt_disp_; // Maximum number of the dispatches allowed in the trace
+ uint32_t trace_gpu_mem_limit_;
+ uint32_t global_disp_count_;
- bool trace_enabled_; // True if tracing is currently enabled (master flag)
- bool inst_tracing_enabled_; // Enable instruction-level SQTT tokens
+ bool trace_enabled_; // True if tracing is currently enabled (master flag)
+ bool inst_tracing_enabled_; // Enable instruction-level SQTT tokens
PAL_DISALLOW_DEFAULT_CTOR(RgpCaptureMgr);
PAL_DISALLOW_COPY_AND_ASSIGN(RgpCaptureMgr);
@@ -400,11 +372,9 @@ private:
// ================================================================================================
// Returns true if queue operations are currently being timed by RGP traces.
-inline bool RgpCaptureMgr::IsQueueTimingActive() const
-{
+inline bool RgpCaptureMgr::IsQueueTimingActive() const {
return (trace_.queue_timing_ &&
- (trace_.status_ == TraceStatus::Running ||
- trace_.status_ == TraceStatus::Preparing ||
+ (trace_.status_ == TraceStatus::Running || trace_.status_ == TraceStatus::Preparing ||
trace_.status_ == TraceStatus::WaitingForSqtt));
}
-};
+}; // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
index e23389876b..7a4823ddaa 100644
--- a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
@@ -27,11 +27,9 @@ typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD;
namespace pal {
void HSAILKernel::setWorkGroupInfo(const uint32_t privateSegmentSize,
- const uint32_t groupSegmentSize,
- const uint16_t numSGPRs,
+ const uint32_t groupSegmentSize, const uint16_t numSGPRs,
const uint16_t numVGPRs) {
- workGroupInfo_.scratchRegs_ =
- amd::alignUp(privateSegmentSize, 16) / sizeof(uint);
+ workGroupInfo_.scratchRegs_ = amd::alignUp(privateSegmentSize, 16) / sizeof(uint);
workGroupInfo_.privateMemSize_ = privateSegmentSize;
workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = groupSegmentSize;
workGroupInfo_.usedSGPRs_ = numSGPRs;
@@ -63,13 +61,13 @@ bool HSAILKernel::setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t
}
// Copy code object of this kernel from the program CPU segment
- memcpy(akc, reinterpret_cast(prog().findHostKernelAddress(code_)), sizeof(amd_kernel_code_t));
+ memcpy(akc, reinterpret_cast(prog().findHostKernelAddress(code_)),
+ sizeof(amd_kernel_code_t));
return true;
}
bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
-
amd_kernel_code_t* akc = &akc_;
if (!setKernelCode(sym, akc)) {
@@ -77,18 +75,16 @@ bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
}
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,
- reinterpret_cast(&codeSize_))) {
+ reinterpret_cast(&codeSize_))) {
return false;
}
- // Setup the the workgroup info
- setWorkGroupInfo(akc->workitem_private_segment_byte_size,
- akc->workgroup_group_segment_byte_size,
- akc->wavefront_sgpr_count,
- akc->workitem_vgpr_count);
+ // Setup the workgroup info
+ setWorkGroupInfo(akc->workitem_private_segment_byte_size, akc->workgroup_group_segment_byte_size,
+ akc->wavefront_sgpr_count, akc->workitem_vgpr_count);
workgroupGroupSegmentByteSize_ = workGroupInfo_.usedLDSSize_;
- kernargSegmentByteSize_ = akc->kernarg_segment_byte_size;
+ kernargSegmentByteSize_ = akc->kernarg_segment_byte_size;
spillSegmentByteSize_ = amd::alignUp(workGroupInfo_.privateMemSize_, sizeof(uint32_t));
return true;
@@ -102,16 +98,14 @@ HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compi
codeSize_(0),
workgroupGroupSegmentByteSize_(0),
kernargSegmentByteSize_(0),
- spillSegmentByteSize_(0)
- {
+ spillSegmentByteSize_(0) {
flags_.hsa_ = true;
}
-HSAILKernel::~HSAILKernel() {
-}
+HSAILKernel::~HSAILKernel() {}
bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
-#if defined(WITH_COMPILER_LIB)
+#if defined(WITH_COMPILER_LIB)
acl_error error = ACL_SUCCESS;
std::string openClKernelName = openclMangledName(name());
flags_.internalKernel_ =
@@ -274,12 +268,14 @@ const HSAILProgram& HSAILKernel::prog() const {
return reinterpret_cast(prog_);
}
-hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
- VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
- const_address parameters, size_t ldsAddress, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
+hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel,
+ const amd::NDRangeContainer& sizes,
+ const_address parameters,
+ size_t ldsAddress, uint64_t vmDefQueue,
+ uint64_t* vmParentWrap) const {
uint64_t argList;
address aqlArgBuf = gpu.managedBuffer().reserve(
- argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t), &argList);
+ argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t), &argList);
gpu.addVmMemory(gpu.managedBuffer().activeMemory());
if (dynamicParallelism()) {
@@ -307,8 +303,8 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
break;
case amd::KernelParameterDescriptor::HiddenGlobalOffsetY:
if (sizes.dimensions() >= 2) {
- offset = sizes.offset()[1];
- WriteAqlArgAt(const_cast(parameters), &offset, it.size_, it.offset_);
+ offset = sizes.offset()[1];
+ WriteAqlArgAt(const_cast(parameters), &offset, it.size_, it.offset_);
}
break;
case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ:
@@ -322,8 +318,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
// and printf buffer was allocated
(gpu.printfDbgHSA().dbgBuffer() != nullptr)) {
// and set the fourth argument as the printf_buffer pointer
- size_t bufferPtr = static_cast(gpu.printfDbgHSA().
- dbgBuffer()->vmAddress());
+ size_t bufferPtr = static_cast(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
gpu.addVmMemory(gpu.printfDbgHSA().dbgBuffer());
WriteAqlArgAt(const_cast(parameters), &bufferPtr, it.size_, it.offset_);
}
@@ -346,11 +341,11 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
// Note: In a case of structs the size won't match,
// since HSAIL compiler expects a reference...
assert(argsBufferSize() <= signature.paramsSize() &&
- "A mismatch of sizes of arguments between compiler and runtime!");
+ "A mismatch of sizes of arguments between compiler and runtime!");
- //hsa_kernel_dispatch_packet_t disp;
- hsa_kernel_dispatch_packet_t* hsaDisp = reinterpret_cast(
- gpu.cb(0)->SysMemCopy());
+ // hsa_kernel_dispatch_packet_t disp;
+ hsa_kernel_dispatch_packet_t* hsaDisp =
+ reinterpret_cast(gpu.cb(0)->SysMemCopy());
amd::NDRange local(sizes.local());
const amd::NDRange& global = sizes.global();
@@ -359,10 +354,10 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
FindLocalWorkSize(sizes.dimensions(), sizes.global(), local);
constexpr uint16_t kDispatchPacketHeader =
- (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
- (1 << HSA_PACKET_HEADER_BARRIER) |
- (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
- (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
+ (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
+ (1 << HSA_PACKET_HEADER_BARRIER) |
+ (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
+ (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
hsaDisp->header = kDispatchPacketHeader;
hsaDisp->setup = sizes.dimensions();
@@ -387,7 +382,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
memcpy(aqlArgBuf + argsBufferSize(), hsaDisp, sizeof(hsa_kernel_dispatch_packet_t));
if (AMD_HSA_BITS_GET(akc_.kernel_code_properties,
- AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
+ AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
gpu.addVmMemory(gpu.hsaQueueMem());
}
@@ -407,7 +402,7 @@ static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const s
}
return nullptr;
}
-#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
+#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
#if defined(USE_COMGR_LIBRARY)
bool LightningKernel::init() {
@@ -419,7 +414,7 @@ bool LightningKernel::init() {
return false;
}
- KernelMD kernelMD;
+ KernelMD kernelMD;
if (!GetAttrCodePropMetadata(*kernelMetaNode, &kernelMD)) {
return false;
}
@@ -427,8 +422,8 @@ bool LightningKernel::init() {
symbolName_ = (codeObjectVer() == 2) ? name() : kernelMD.mSymbolName;
workgroupGroupSegmentByteSize_ = kernelMD.mCodeProps.mGroupSegmentFixedSize;
- spillSegmentByteSize_ = amd::alignUp(kernelMD.mCodeProps.mPrivateSegmentFixedSize,
- sizeof(uint32_t));
+ spillSegmentByteSize_ =
+ amd::alignUp(kernelMD.mCodeProps.mPrivateSegmentFixedSize, sizeof(uint32_t));
kernargSegmentByteSize_ = kernelMD.mCodeProps.mKernargSegmentSize;
// Copy codeobject of this kernel from the program CPU segment
@@ -451,7 +446,7 @@ bool LightningKernel::init() {
// Get the runtime handle symbol GPU address
rth_symbol = prog().GetSymbol(const_cast(kernelMD.mAttrs.mRuntimeHandle.c_str()),
- const_cast(&agent));
+ const_cast(&agent));
uint64_t symbol_address;
rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);
@@ -461,19 +456,14 @@ bool LightningKernel::init() {
uint64_t kernel_object = gpuAqlCode();
VirtualGPU* gpu = codeSegGpu.dev().xferQueue();
- const struct RuntimeHandle runtime_handle = {
- gpuAqlCode(),
- spillSegSize(),
- ldsSize()
- };
+ const struct RuntimeHandle runtime_handle = {gpuAqlCode(), spillSegSize(), ldsSize()};
codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true);
}
// Setup the the workgroup info
setWorkGroupInfo(kernelMD.mCodeProps.mPrivateSegmentFixedSize,
- kernelMD.mCodeProps.mGroupSegmentFixedSize,
- kernelMD.mCodeProps.mNumSGPRs,
+ kernelMD.mCodeProps.mGroupSegmentFixedSize, kernelMD.mCodeProps.mNumSGPRs,
kernelMD.mCodeProps.mNumVGPRs);
// Copy wavefront size
@@ -499,10 +489,10 @@ bool LightningKernel::init() {
return true;
}
-#endif // defined(USE_COMGR_LIBRARY)
+#endif // defined(USE_COMGR_LIBRARY)
bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
-#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
+#if defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY)
flags_.internalKernel_ =
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
@@ -545,7 +535,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
// Get the runtime handle symbol GPU address
rth_symbol = prog().GetSymbol(const_cast(kernelMD->mAttrs.mRuntimeHandle.c_str()),
- const_cast(&agent));
+ const_cast(&agent));
uint64_t symbol_address;
rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);
@@ -554,11 +544,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
uint64_t offset = symbol_address - codeSegGpu.vmAddress();
VirtualGPU* gpu = codeSegGpu.dev().xferQueue();
- const struct RuntimeHandle runtime_handle = {
- gpuAqlCode(),
- spillSegSize(),
- ldsSize()
- };
+ const struct RuntimeHandle runtime_handle = {gpuAqlCode(), spillSegSize(), ldsSize()};
codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true);
}
@@ -584,7 +570,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
waveLimiter_.enable();
*/
-#endif // defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
+#endif // defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY)
return true;
}
diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
index 5a1abe07d4..926d2deccc 100644
--- a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
@@ -20,14 +20,14 @@ namespace amd {
namespace hsa {
namespace loader {
class Symbol;
-} // loader
+} // namespace loader
namespace code {
namespace Kernel {
class Metadata;
-} // Kernel
-} // code
-} // hsa
-} // amd
+} // namespace Kernel
+} // namespace code
+} // namespace hsa
+} // namespace amd
//! \namespace pal PAL Device Implementation
namespace pal {
@@ -43,7 +43,6 @@ class LightningProgram;
*/
class HSAILKernel : public device::Kernel {
public:
-
HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions);
virtual ~HSAILKernel();
@@ -106,21 +105,19 @@ class HSAILKernel : public device::Kernel {
bool setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc);
//! Set up the workgroup info based on the kernel metadata
- void setWorkGroupInfo(const uint32_t privateSegmentSize,
- const uint32_t groupSegmentSize,
- const uint16_t numSGPRs,
- const uint16_t numVGPRs);
+ void setWorkGroupInfo(const uint32_t privateSegmentSize, const uint32_t groupSegmentSize,
+ const uint16_t numSGPRs, const uint16_t numVGPRs);
- std::string compileOptions_; //!< compile used for finalizing this kernel
- amd_kernel_code_t akc_; //!< AQL kernel code on CPU
- uint index_; //!< Kernel index in the program
+ std::string compileOptions_; //!< compile used for finalizing this kernel
+ amd_kernel_code_t akc_; //!< AQL kernel code on CPU
+ uint index_; //!< Kernel index in the program
- uint64_t code_; //!< GPU memory pointer to the kernel
- size_t codeSize_; //!< Size of ISA code
+ uint64_t code_; //!< GPU memory pointer to the kernel
+ size_t codeSize_; //!< Size of ISA code
- uint32_t workgroupGroupSegmentByteSize_; //!< LDS size used in the kernel
- uint32_t kernargSegmentByteSize_; //!< Size of kernel argument buffer
- uint32_t spillSegmentByteSize_; //!< Spill reg size per workitem
+ uint32_t workgroupGroupSegmentByteSize_; //!< LDS size used in the kernel
+ uint32_t kernargSegmentByteSize_; //!< Size of kernel argument buffer
+ uint32_t spillSegmentByteSize_; //!< Spill reg size per workitem
};
class LightningKernel : public HSAILKernel {
@@ -140,4 +137,5 @@ class LightningKernel : public HSAILKernel {
#endif
};
-/*@}*/} // namespace pal
+/*@}*/
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.cpp b/projects/clr/rocclr/runtime/device/pal/palmemory.cpp
index 071f17962a..bad5652845 100644
--- a/projects/clr/rocclr/runtime/device/pal/palmemory.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palmemory.cpp
@@ -23,27 +23,21 @@
namespace pal {
Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t size)
- : device::Memory(owner), Resource(gpuDev, size)
- , pinnedMemory_(nullptr)
- , parent_(nullptr) {
-
+ : device::Memory(owner), Resource(gpuDev, size), pinnedMemory_(nullptr), parent_(nullptr) {
if (owner.parent() != nullptr) {
flags_ |= SubMemoryObject;
}
}
Memory::Memory(const Device& gpuDev, size_t size)
- : device::Memory(size), Resource(gpuDev, size)
- , pinnedMemory_(nullptr)
- , parent_(nullptr) {
-}
+ : device::Memory(size), Resource(gpuDev, size), pinnedMemory_(nullptr), parent_(nullptr) {}
Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t width, size_t height, size_t depth,
cl_image_format format, cl_mem_object_type imageType, uint mipLevels)
- : device::Memory(owner), Resource(gpuDev, width, height, depth, format, imageType, mipLevels)
- , pinnedMemory_(nullptr)
- , parent_(nullptr) {
-
+ : device::Memory(owner),
+ Resource(gpuDev, width, height, depth, format, imageType, mipLevels),
+ pinnedMemory_(nullptr),
+ parent_(nullptr) {
if (owner.parent() != nullptr) {
flags_ |= SubMemoryObject;
}
@@ -51,10 +45,10 @@ Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t width, size_t he
Memory::Memory(const Device& gpuDev, size_t size, size_t width, size_t height, size_t depth,
cl_image_format format, cl_mem_object_type imageType, uint mipLevels)
- : device::Memory(size), Resource(gpuDev, width, height, depth, format, imageType, mipLevels)
- , pinnedMemory_(nullptr)
- , parent_(nullptr) {
-}
+ : device::Memory(size),
+ Resource(gpuDev, width, height, depth, format, imageType, mipLevels),
+ pinnedMemory_(nullptr),
+ parent_(nullptr) {}
#ifdef _WIN32
static HANDLE getSharedHandle(IUnknown* pIface) {
@@ -130,7 +124,7 @@ bool Memory::create(Resource::MemoryType memType, Resource::CreateParams* params
break;
case Resource::Remote:
case Resource::RemoteUSWC:
- if ((!desc().tiled_) && (desc().dimSize_ != 3)) {
+ if ((!desc().tiled_) && (desc().dimSize_ != 3)) {
// Marks memory object for direct GPU access to the host memory
flags_ |= HostMemoryDirectAccess;
}
@@ -402,7 +396,7 @@ Memory::~Memory() {
(memoryType() != Resource::ExternalPhysical)) {
// Unmap memory if direct access was requested
// Note: runtime will perform unmap on the actual resource destruction
- //unmap(nullptr);
+ // unmap(nullptr);
}
}
diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
index d84b23cbe6..2ce3062cce 100644
--- a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
@@ -32,12 +32,12 @@ class Memory : public device::Memory, public Resource {
Memory(const Device& gpuDev, //!< GPU device object
amd::Memory& owner, //!< Abstraction layer memory object
size_t size //!< Memory size for allocation
- );
+ );
//! Constructor (nonfat version for local scratch mem use without heap block)
Memory(const Device& gpuDev, //!< GPU device object
size_t size //!< Memory size for allocation
- );
+ );
//! Constructor memory for images (without global heap allocation)
Memory(const Device& gpuDev, //!< GPU device object
@@ -48,7 +48,7 @@ class Memory : public device::Memory, public Resource {
cl_image_format format, //!< Memory format
cl_mem_object_type imageType, //!< CL image type
uint mipLevels //!< The number of mip levels
- );
+ );
//! Constructor memory for images (without global heap allocation)
Memory(const Device& gpuDev, //!< GPU device object
@@ -59,7 +59,7 @@ class Memory : public device::Memory, public Resource {
cl_image_format format, //!< Memory format
cl_mem_object_type imageType, //!< CL image type
uint mipLevels //!< The number of mip levels
- );
+ );
//! Default destructor
~Memory();
@@ -70,7 +70,7 @@ class Memory : public device::Memory, public Resource {
//! Overloads the resource create method
virtual bool create(Resource::MemoryType memType, //!< Memory type
Resource::CreateParams* params = NULL //!< Prameters for create
- );
+ );
//! Allocate memory for API-level maps
virtual void* allocMapTarget(const amd::Coord3D& origin, //!< The map location in memory
@@ -78,12 +78,12 @@ class Memory : public device::Memory, public Resource {
uint mapFlags, //!< Map flags
size_t* rowPitch = NULL, //!< Row pitch for the mapped memory
size_t* slicePitch = NULL //!< Slice for the mapped memory
- );
+ );
//! Pins system memory associated with this memory object
virtual bool pinSystemMemory(void* hostPtr, //!< System memory address
size_t size //!< Size of allocated system memory
- );
+ );
//! Releases indirect map surface
virtual void releaseIndirectMap() { decIndMapCount(); }
@@ -96,15 +96,15 @@ class Memory : public device::Memory, public Resource {
uint numLayers = 0, //!< End layer for multilayer map
size_t* rowPitch = NULL, //!< Row pitch for the device memory
size_t* slicePitch = NULL //!< Slice pitch for the device memory
- );
+ );
//! Unmap the device memory
virtual void cpuUnmap(device::VirtualDevice& vDev //!< Virtual device for unmap operaiton
- );
+ );
//! Updates device memory from the owner's host allocation
void syncCacheFromHost(VirtualGPU& gpu, //!< Virtual GPU device object
- //! Synchronization flags
+ //! Synchronization flags
device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags());
//! Updates the owner's host allocation from device memory
@@ -115,11 +115,13 @@ class Memory : public device::Memory, public Resource {
//! Creates a view from current resource
virtual Memory* createBufferView(
amd::Memory& subBufferOwner //!< The abstraction layer subbuf owner
- );
+ );
virtual uint64_t virtualAddress() const override { return vmAddress(); }
- virtual const address cpuSrd() const { return reinterpret_cast(const_cast(hwState())); }
+ virtual const address cpuSrd() const {
+ return reinterpret_cast(const_cast(hwState()));
+ }
//! Allocates host memory for synchronization with MGPU context
void mgpuCacheWriteBack();
@@ -161,8 +163,8 @@ class Memory : public device::Memory, public Resource {
//! Disable operator=
Memory& operator=(const Memory&);
- Memory* pinnedMemory_; //!< Memory used as pinned system memory
- const Memory* parent_; //!< Parent memory object
+ Memory* pinnedMemory_; //!< Memory used as pinned system memory
+ const Memory* parent_; //!< Parent memory object
};
class Buffer : public pal::Memory {
@@ -219,7 +221,7 @@ class Image : public pal::Memory {
uint mapFlags, //!< Map flags
size_t* rowPitch = NULL, //!< Row pitch for the mapped memory
size_t* slicePitch = NULL //!< Slice for the mapped memory
- );
+ );
virtual uint64_t virtualAddress() const override { return hwSrd(); }
diff --git a/projects/clr/rocclr/runtime/device/pal/palprintf.hpp b/projects/clr/rocclr/runtime/device/pal/palprintf.hpp
index edb8077161..69dd871300 100644
--- a/projects/clr/rocclr/runtime/device/pal/palprintf.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palprintf.hpp
@@ -11,7 +11,7 @@
#ifndef isinf
#ifdef _MSC_VER
#define isinf(X) (!_finite(X) && !_isnan(X))
-#else //!_MSC_VER
+#else //!_MSC_VER
#define isinf(X) (std::isinf(X))
#endif //!_MSC_VER
#endif // isinf
@@ -19,7 +19,7 @@
#ifndef isnan
#ifdef _MSC_VER
#define isnan(X) (_isnan(X))
-#else //!_MSC_VER
+#else //!_MSC_VER
#define isnan(X) (std::isnan(X))
#endif //!_MSC_VER
#endif // isnan
@@ -55,14 +55,14 @@ class PrintfDbg : public amd::HeapObject {
bool init(VirtualGPU& gpu, //!< Virtual GPU object
bool printfEnabled, //!< checks for printf
const amd::NDRange& size //!< Kernel's workload
- );
+ );
//! Prints the kernel's debug informaiton from the buffer
- bool output(VirtualGPU& gpu, //!< Virtual GPU object
- bool printfEnabled, //!< checks for printf
- const amd::NDRange& size, //!< Kernel's workload
+ bool output(VirtualGPU& gpu, //!< Virtual GPU object
+ bool printfEnabled, //!< checks for printf
+ const amd::NDRange& size, //!< Kernel's workload
const std::vector& printfInfo //!< printf info
- );
+ );
//! Debug buffer size per workitem
size_t wiDbgSize() const { return wiDbgSize_; }
@@ -81,7 +81,7 @@ class PrintfDbg : public amd::HeapObject {
//! Allocates the debug buffer
bool allocate(bool realloc = false //!< If TRUE then reallocate the debug memory
- );
+ );
//! Returns TRUE if a float value has to be printed
bool checkFloat(const std::string& fmt //!< Format string
@@ -105,9 +105,9 @@ class PrintfDbg : public amd::HeapObject {
) const;
//! Displays the PrintfDbg
- void outputDbgBuffer(const device::PrintfInfo& info,//!< printf info
- const uint32_t* workitemData, //!< The PrintfDbg dump buffer
- size_t& i //!< index to the data in the buffer
+ void outputDbgBuffer(const device::PrintfInfo& info, //!< printf info
+ const uint32_t* workitemData, //!< The PrintfDbg dump buffer
+ size_t& i //!< index to the data in the buffer
) const;
private:
@@ -127,7 +127,7 @@ class PrintfDbg : public amd::HeapObject {
uint32_t* mapWorkitem(VirtualGPU& gpu, //!< Virtual GPU object
size_t idx, //!< Workitem global index
bool* realloc //!< Returns TRUE if workitem reached the buffer limit
- );
+ );
//! Unamp the staged buffer
void unmapWorkitem(VirtualGPU& gpu, //!< Virtual GPU object
@@ -145,13 +145,13 @@ class PrintfDbgHSA : public PrintfDbg {
//! Initializes the debug buffer before kernel's execution
bool init(VirtualGPU& gpu, //!< Virtual GPU object
bool printfEnabled //!< checks for printf
- );
+ );
//! Prints the kernel's debug informaiton from the buffer
- bool output(VirtualGPU& gpu, //!< Virtual GPU object
- bool printfEnabled, //!< checks for printf
+ bool output(VirtualGPU& gpu, //!< Virtual GPU object
+ bool printfEnabled, //!< checks for printf
const std::vector& printfInfo //!< printf info
- );
+ );
private:
//! Disable copy constructor
@@ -161,4 +161,5 @@ class PrintfDbgHSA : public PrintfDbg {
PrintfDbgHSA& operator=(const PrintfDbgHSA&);
};
-/*@}*/} // namespace pal
+/*@}*/ // namespace pal
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
index ed788bda56..85d404e897 100644
--- a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
@@ -65,10 +65,10 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t
align = amd::alignUp(align, sizeof(uint32_t));
amd::Memory* amd_mem_obj = new (prog.dev().context())
- amd::Buffer(prog.dev().context(), 0, amd::alignUp(size, align),
- // HIP requires SVM allocation for segment code due to possible global variable access and
- // global variables are a part of code segment with the latest loader
- amd::IS_HIP ? reinterpret_cast(1) : nullptr);
+ amd::Buffer(prog.dev().context(), 0, amd::alignUp(size, align),
+ // HIP requires SVM allocation for segment code due to possible global variable
+ // access and global variables are a part of code segment with the latest loader
+ amd::IS_HIP ? reinterpret_cast(1) : nullptr);
if (amd_mem_obj == nullptr) {
LogError("[OCL] failed to create a mem object!");
@@ -103,9 +103,9 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t
if (zero && !prog.isInternal()) {
uint64_t pattern = 0;
- size_t patternSize = ((size % sizeof(pattern)) == 0) ? sizeof(pattern) : 1;
- prog.dev().xferMgr().fillBuffer(*gpuAccess_, &pattern, patternSize,
- amd::Coord3D(0), amd::Coord3D(size));
+ size_t patternSize = ((size % sizeof(pattern)) == 0) ? sizeof(pattern) : 1;
+ prog.dev().xferMgr().fillBuffer(*gpuAccess_, &pattern, patternSize, amd::Coord3D(0),
+ amd::Coord3D(size));
}
switch (segment) {
@@ -237,7 +237,7 @@ inline static std::vector splitSpaceSeparatedString(char* str) {
}
bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_t binSize) {
-#if defined(WITH_COMPILER_LIB)
+#if defined(WITH_COMPILER_LIB)
// ACL_TYPE_CG stage is not performed for offline compilation
hsa_agent_t agent;
agent.handle = 1;
@@ -262,8 +262,8 @@ bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_
}
size_t kernelNamesSize = 0;
- acl_error errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES,
- nullptr, nullptr, &kernelNamesSize);
+ acl_error errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr,
+ nullptr, &kernelNamesSize);
if (errorCode != ACL_SUCCESS) {
buildLog_ += "Error: Querying of kernel names size from the binary failed.\n";
return false;
@@ -274,11 +274,11 @@ bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_
&kernelNamesSize);
if (errorCode != ACL_SUCCESS) {
buildLog_ += "Error: Querying of kernel names from the binary failed.\n";
- delete [] kernelNames;
+ delete[] kernelNames;
return false;
}
std::vector vKernels = splitSpaceSeparatedString(kernelNames);
- delete [] kernelNames;
+ delete[] kernelNames;
bool dynamicParallelism = false;
for (const auto& it : vKernels) {
std::string kernelName(it);
@@ -338,12 +338,10 @@ bool HSAILProgram::allocKernelTable() {
return true;
}
-void HSAILProgram::fillResListWithKernels(VirtualGPU& gpu) const {
- gpu.addVmMemory(&codeSegGpu());
-}
+void HSAILProgram::fillResListWithKernels(VirtualGPU& gpu) const { gpu.addVmMemory(&codeSegGpu()); }
const aclTargetInfo& HSAILProgram::info(const char* str) {
-#if defined(WITH_COMPILER_LIB)
+#if defined(WITH_COMPILER_LIB)
acl_error err;
std::string arch = "hsail";
if (dev().settings().use64BitPtr_) {
@@ -359,7 +357,7 @@ const aclTargetInfo& HSAILProgram::info(const char* str) {
}
bool HSAILProgram::saveBinaryAndSetType(type_t type) {
-#if defined(WITH_COMPILER_LIB)
+#if defined(WITH_COMPILER_LIB)
// Write binary to memory
if (rawBinary_ != nullptr) {
// Free memory containing rawBinary
@@ -378,8 +376,8 @@ bool HSAILProgram::saveBinaryAndSetType(type_t type) {
return true;
}
-bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_pptr,
- size_t* bytes, const char* global_name) const {
+bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_pptr, size_t* bytes,
+ const char* global_name) const {
uint32_t length = 0;
size_t offset = 0;
uint32_t flags = 0;
@@ -456,7 +454,7 @@ bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_p
}
/* Retrieve the Offset from global pal::Memory created @ segment::alloc */
- if(!codeSegment_->gpuAddressOffset(reinterpret_cast(*device_pptr), &offset)) {
+ if (!codeSegment_->gpuAddressOffset(reinterpret_cast(*device_pptr), &offset)) {
buildLog_ += "Error: Cannot Retrieve the Address Offset";
buildLog_ += "\n";
return false;
@@ -484,13 +482,12 @@ bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_p
hsa_isa_t PALHSALoaderContext::IsaFromName(const char* name) {
hsa_isa_t isa = {0};
- uint32_t gfxip = 0;
+ uint32_t gfxip = 0;
std::string gfx_target(name);
if (gfx_target.find("amdgcn-") == 0) {
std::string gfxip_version_str = gfx_target.substr(gfx_target.find("gfx") + 3);
gfxip = std::atoi(gfxip_version_str.c_str());
- }
- else {
+ } else {
// FIXME: Old way. To be remove.
uint32_t shift = 1;
size_t last = gfx_target.length();
@@ -508,9 +505,9 @@ hsa_isa_t PALHSALoaderContext::IsaFromName(const char* name) {
}
bool PALHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) {
- uint32_t gfxipVersion = program_->dev().settings().useLightning_ ?
- program_->dev().hwInfo()->gfxipVersionLC_ :
- program_->dev().hwInfo()->gfxipVersion_;
+ uint32_t gfxipVersion = program_->dev().settings().useLightning_
+ ? program_->dev().hwInfo()->gfxipVersionLC_
+ : program_->dev().hwInfo()->gfxipVersion_;
uint32_t majorSrc = gfxipVersion / 10;
uint32_t minorSrc = gfxipVersion % 10;
@@ -519,11 +516,9 @@ bool PALHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa)
if (majorSrc != majorTrg) {
return false;
- }
- else if (minorTrg == minorSrc) {
+ } else if (minorTrg == minorSrc) {
return true;
- }
- else if (minorTrg < minorSrc) {
+ } else if (minorTrg < minorSrc) {
LogWarning("ISA downgrade for execution!");
return true;
}
@@ -708,7 +703,7 @@ static hsa_status_t GetKernelNamesCallback(hsa_executable_t hExec, hsa_executabl
return HSA_STATUS_SUCCESS;
}
-#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
+#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
bool LightningProgram::createBinary(amd::option::Options* options) {
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
@@ -716,7 +711,7 @@ bool LightningProgram::createBinary(amd::option::Options* options) {
LogError("Failed to create ELF binary image!");
return false;
}
-#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
+#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
return true;
}
@@ -752,10 +747,10 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
}
#if defined(USE_COMGR_LIBRARY)
- for (const auto &kernelMeta : kernelMetadataMap_) {
+ for (const auto& kernelMeta : kernelMetadataMap_) {
auto kernelName = kernelMeta.first;
- auto kernel = new LightningKernel(kernelName, this,
- options->origOptionStr + ProcessOptions(options));
+ auto kernel =
+ new LightningKernel(kernelName, this, options->origOptionStr + ProcessOptions(options));
kernels()[kernelName] = kernel;
if (!kernel->init()) {
@@ -804,9 +799,9 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
maxScratchRegs_ =
std::max(static_cast(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
}
-#endif // defined(USE_COMGR_LIBRARY)
+#endif // defined(USE_COMGR_LIBRARY)
DestroySegmentCpuAccess();
-#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
+#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
return true;
}
diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.hpp b/projects/clr/rocclr/runtime/device/pal/palprogram.hpp
index 32e98aab6f..ddc41c0c1d 100644
--- a/projects/clr/rocclr/runtime/device/pal/palprogram.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palprogram.hpp
@@ -9,15 +9,15 @@
namespace amd {
namespace option {
class Options;
-} // option
+} // namespace option
namespace hsa {
namespace loader {
class Loader;
class Executable;
class Context;
-} // loader
-} // hsa
-} // amd
+} // namespace loader
+} // namespace hsa
+} // namespace amd
//! \namespace pal PAL Device Implementation
namespace pal {
@@ -50,15 +50,16 @@ class Segment : public amd::HeapObject {
bool gpuAddressOffset(uint64_t offAddr, size_t* offset);
//! Returns address for CPU access in the segment
- void* cpuAddress(size_t offset) const
- { return ((cpuAccess_ != nullptr) ? cpuAccess_->data() : cpuMem_) + offset; }
+ void* cpuAddress(size_t offset) const {
+ return ((cpuAccess_ != nullptr) ? cpuAccess_->data() : cpuMem_) + offset;
+ }
void DestroyCpuAccess();
private:
- Memory* gpuAccess_; //!< GPU memory for segment access
- Memory* cpuAccess_; //!< CPU memory for segment (backing store)
- address cpuMem_; //!< CPU memory for segment without GPU direct access (backing store)
+ Memory* gpuAccess_; //!< GPU memory for segment access
+ Memory* cpuAccess_; //!< CPU memory for segment (backing store)
+ address cpuMem_; //!< CPU memory for segment without GPU direct access (backing store)
};
class PALHSALoaderContext final : public Context {
@@ -166,7 +167,7 @@ class HSAILProgram : public device::Program {
}
//! Get symbol by name
- amd::hsa::loader::Symbol* GetSymbol(const char* symbol_name, const hsa_agent_t *agent) const {
+ amd::hsa::loader::Symbol* GetSymbol(const char* symbol_name, const hsa_agent_t* agent) const {
return executable_->GetSymbol(symbol_name, agent);
}
@@ -180,11 +181,14 @@ class HSAILProgram : public device::Program {
virtual bool setKernels(amd::option::Options* options, void* binary, size_t binSize) override;
//! Destroys CPU allocations in the code segment
- void DestroySegmentCpuAccess() const
- { if (codeSegment_ != nullptr) { codeSegment_->DestroyCpuAccess(); } }
+ void DestroySegmentCpuAccess() const {
+ if (codeSegment_ != nullptr) {
+ codeSegment_->DestroyCpuAccess();
+ }
+ }
- virtual bool createGlobalVarObj(amd::Memory** amd_mem_obj, void** dptr,
- size_t* bytes, const char* globalName) const;
+ virtual bool createGlobalVarObj(amd::Memory** amd_mem_obj, void** dptr, size_t* bytes,
+ const char* globalName) const;
private:
//! Disable default copy constructor
@@ -201,7 +205,7 @@ class HSAILProgram : public device::Program {
std::vector globalStores_; //!< Global memory for the program
Memory* kernels_; //!< Table with kernel object pointers
Memory* codeSegGpu_; //!< GPU memory with code objects
- Segment* codeSegment_; //!< Pointer to the code segment for this program
+ Segment* codeSegment_; //!< Pointer to the code segment for this program
uint
maxScratchRegs_; //!< Maximum number of scratch regs used in the program by individual kernel
std::list staticSamplers_; //!< List od internal static samplers
@@ -214,19 +218,17 @@ class HSAILProgram : public device::Program {
//! \class Lightning Compiler Program
class LightningProgram : public HSAILProgram {
public:
- LightningProgram(NullDevice& device)
- : HSAILProgram(device) {
- isLC_ = true;
- xnackEnabled_ = dev().hwInfo()->xnackEnabled_;
- machineTarget_ = dev().hwInfo()->machineTargetLC_;
- }
+ LightningProgram(NullDevice& device) : HSAILProgram(device) {
+ isLC_ = true;
+ xnackEnabled_ = dev().hwInfo()->xnackEnabled_;
+ machineTarget_ = dev().hwInfo()->machineTargetLC_;
+ }
- LightningProgram(Device& device)
- : HSAILProgram(device) {
- isLC_ = true;
- xnackEnabled_ = dev().hwInfo()->xnackEnabled_;
- machineTarget_ = dev().hwInfo()->machineTargetLC_;
- }
+ LightningProgram(Device& device) : HSAILProgram(device) {
+ isLC_ = true;
+ xnackEnabled_ = dev().hwInfo()->xnackEnabled_;
+ machineTarget_ = dev().hwInfo()->machineTargetLC_;
+ }
virtual ~LightningProgram() {}
protected:
@@ -235,4 +237,5 @@ class LightningProgram : public HSAILProgram {
virtual bool createBinary(amd::option::Options* options) override;
};
-/*@}*/} // namespace pal
+/*@}*/ // namespace pal
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.cpp b/projects/clr/rocclr/runtime/device/pal/palresource.cpp
index e3a719cc38..088978846b 100644
--- a/projects/clr/rocclr/runtime/device/pal/palresource.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palresource.cpp
@@ -41,8 +41,8 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
if (memRef != nullptr) {
result = dev.iDev()->CreateGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_);
if ((result != Pal::Result::Success) &&
- // Free cache if PAL failed allocation
- dev.resourceCache().free()) {
+ // Free cache if PAL failed allocation
+ dev.resourceCache().free()) {
// If cache was freed, then try to allocate again
result = dev.iDev()->CreateGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_);
}
@@ -154,8 +154,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
// ================================================================================================
GpuMemoryReference::GpuMemoryReference(const Device& dev)
- : gpuMem_(nullptr), cpuAddress_(nullptr), device_(dev), gpu_(nullptr)
-{}
+ : gpuMem_(nullptr), cpuAddress_(nullptr), device_(dev), gpu_(nullptr) {}
// ================================================================================================
GpuMemoryReference::~GpuMemoryReference() {
@@ -181,8 +180,7 @@ GpuMemoryReference::~GpuMemoryReference() {
iMem()->Unmap();
}
if (0 != iMem()) {
- if (!(iMem()->Desc().flags.isShared ||
- iMem()->Desc().flags.isExternal ||
+ if (!(iMem()->Desc().flags.isShared || iMem()->Desc().flags.isExternal ||
iMem()->Desc().flags.isExternPhys)) {
// Update free memory size counters
device_.updateAllocedMemory(iMem()->Desc().preferredHeap, iMem()->Desc().size, true);
@@ -368,7 +366,7 @@ void Resource::memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo) {
case Persistent:
createInfo->heapCount = 2;
createInfo->heaps[0] = Pal::GpuHeapLocal;
- createInfo->heaps[1] = Pal:: GpuHeapGartUswc;
+ createInfo->heaps[1] = Pal::GpuHeapGartUswc;
#ifdef ATI_OS_LINUX
// Note: SSG in Linux requires DGMA heap
if (dev().properties().gpuMemoryProperties.busAddressableMemSize > 0) {
@@ -401,11 +399,10 @@ void Resource::memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo) {
}
// ================================================================================================
-bool Resource::CreateImage(CreateParams* params)
-{
+bool Resource::CreateImage(CreateParams* params) {
Pal::Result result;
- Pal::SubresId ImgSubresId = { Pal::ImageAspect::Color, 0, 0 };
- Pal::SubresRange ImgSubresRange = { ImgSubresId, 1, 1 };
+ Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, 0, 0};
+ Pal::SubresRange ImgSubresRange = {ImgSubresId, 1, 1};
Pal::ChannelMapping channels;
Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels);
@@ -417,8 +414,7 @@ bool Resource::CreateImage(CreateParams* params)
memRef_->retain();
desc_.cardMemory_ = viewOwner_->desc().cardMemory_;
offset_ += viewOwner_->offset_;
- }
- else {
+ } else {
Pal::GpuMemoryCreateInfo createInfo = {};
createInfo.size = desc().width_ * elementSize();
createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment);
@@ -427,8 +423,8 @@ bool Resource::CreateImage(CreateParams* params)
createInfo.priority = Pal::GpuMemPriority::Normal;
memTypeToHeap(&createInfo);
// createInfo.priority;
- memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
- createInfo.alignment, nullptr, &subOffset_);
+ memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
+ nullptr, &subOffset_);
if (nullptr == memRef_) {
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
if (nullptr == memRef_) {
@@ -477,16 +473,16 @@ bool Resource::CreateImage(CreateParams* params)
imgCreateInfo.arraySize = 1;
switch (desc_.topology_) {
- case CL_MEM_OBJECT_IMAGE3D:
- imgCreateInfo.imageType = Pal::ImageType::Tex3d;
- viewInfo.viewType = Pal::ImageViewType::Tex3d;
- break;
- case CL_MEM_OBJECT_IMAGE1D:
- case CL_MEM_OBJECT_IMAGE1D_ARRAY:
- case CL_MEM_OBJECT_IMAGE1D_BUFFER:
- imgCreateInfo.imageType = Pal::ImageType::Tex1d;
- viewInfo.viewType = Pal::ImageViewType::Tex1d;
- break;
+ case CL_MEM_OBJECT_IMAGE3D:
+ imgCreateInfo.imageType = Pal::ImageType::Tex3d;
+ viewInfo.viewType = Pal::ImageViewType::Tex3d;
+ break;
+ case CL_MEM_OBJECT_IMAGE1D:
+ case CL_MEM_OBJECT_IMAGE1D_ARRAY:
+ case CL_MEM_OBJECT_IMAGE1D_BUFFER:
+ imgCreateInfo.imageType = Pal::ImageType::Tex1d;
+ viewInfo.viewType = Pal::ImageViewType::Tex1d;
+ break;
}
if (desc_.topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
ImgSubresRange.numSlices = imgCreateInfo.arraySize = desc_.height_;
@@ -504,8 +500,7 @@ bool Resource::CreateImage(CreateParams* params)
ImgSubresRange.startSubres.arraySlice = imageView->layer_;
viewOwner_ = imageView->resource_;
image_ = viewOwner_->image_;
- }
- else if (memoryType() == ImageBuffer) {
+ } else if (memoryType() == ImageBuffer) {
ImageBufferParams* imageBuffer = reinterpret_cast(params);
viewOwner_ = imageBuffer->resource_;
}
@@ -515,11 +510,11 @@ bool Resource::CreateImage(CreateParams* params)
ImgSubresRange.numMips = desc().mipLevels_;
if ((memoryType() != ImageView) ||
- //! @todo PAL doesn't allow an SRD view creation with different pixel size
- (elementSize() != viewOwner_->elementSize())) {
+ //! @todo PAL doesn't allow an SRD view creation with different pixel size
+ (elementSize() != viewOwner_->elementSize())) {
imgCreateInfo.usageFlags.shaderRead = true;
imgCreateInfo.usageFlags.shaderWrite =
- (format == Pal::ChNumFormat::X8Y8Z8W8_Srgb) ? false : true;
+ (format == Pal::ChNumFormat::X8Y8Z8W8_Srgb) ? false : true;
imgCreateInfo.swizzledFormat.format = format;
imgCreateInfo.swizzledFormat.swizzle = channels;
imgCreateInfo.mipLevels = (desc_.mipLevels_) ? desc_.mipLevels_ : 1;
@@ -529,10 +524,9 @@ bool Resource::CreateImage(CreateParams* params)
uint32_t rowPitch = 0;
if (((memoryType() == Persistent) && dev().settings().linearPersistentImage_) ||
- (memoryType() == ImageBuffer)) {
+ (memoryType() == ImageBuffer)) {
tiling = Pal::ImageTiling::Linear;
- }
- else if (memoryType() == ImageView) {
+ } else if (memoryType() == ImageView) {
tiling = viewOwner_->image_->GetImageCreateInfo().tiling;
// Find the new pitch in pixels for the new format
rowPitch = viewOwner_->desc().pitch_ * viewOwner_->elementSize() / elementSize();
@@ -540,10 +534,9 @@ bool Resource::CreateImage(CreateParams* params)
if (memoryType() == ImageBuffer) {
if ((params->owner_ != NULL) && params->owner_->asImage() &&
- (params->owner_->asImage()->getRowPitch() != 0)) {
+ (params->owner_->asImage()->getRowPitch() != 0)) {
rowPitch = params->owner_->asImage()->getRowPitch() / elementSize();
- }
- else {
+ } else {
rowPitch = desc().width_;
}
}
@@ -579,8 +572,8 @@ bool Resource::CreateImage(CreateParams* params)
createInfo.priority = Pal::GpuMemPriority::Normal;
memTypeToHeap(&createInfo);
- memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
- createInfo.alignment, nullptr, &subOffset_);
+ memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
+ nullptr, &subOffset_);
if (nullptr == memRef_) {
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
if (nullptr == memRef_) {
@@ -589,8 +582,7 @@ bool Resource::CreateImage(CreateParams* params)
}
}
offset_ += static_cast(subOffset_);
- }
- else {
+ } else {
memRef_ = viewOwner_->memRef_;
memRef_->retain();
desc_.cardMemory_ = viewOwner_->desc().cardMemory_;
@@ -627,11 +619,10 @@ bool Resource::CreateImage(CreateParams* params)
}
// ================================================================================================
-bool Resource::CreateInterop(CreateParams* params)
-{
+bool Resource::CreateInterop(CreateParams* params) {
Pal::Result result;
- Pal::SubresId ImgSubresId = { Pal::ImageAspect::Color, 0, 0 };
- Pal::SubresRange ImgSubresRange = { ImgSubresId, 1, 1 };
+ Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, 0, 0};
+ Pal::SubresRange ImgSubresRange = {ImgSubresId, 1, 1};
Pal::ChannelMapping channels;
Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels);
Pal::ExternalGpuMemoryOpenInfo gpuMemOpenInfo = {};
@@ -645,21 +636,21 @@ bool Resource::CreateInterop(CreateParams* params)
OGLInteropParams* oglRes = reinterpret_cast(params);
assert(oglRes->glPlatformContext_ && "We don't have OGL context!");
switch (oglRes->type_) {
- case InteropVertexBuffer:
- glType_ = GL_RESOURCE_ATTACH_VERTEXBUFFER_AMD;
- break;
- case InteropRenderBuffer:
- glType_ = GL_RESOURCE_ATTACH_RENDERBUFFER_AMD;
- break;
- case InteropTexture:
- case InteropTextureViewLevel:
- case InteropTextureViewCube:
- glType_ = GL_RESOURCE_ATTACH_TEXTURE_AMD;
- break;
- default:
- LogError("Unknown OGL interop type!");
- return false;
- break;
+ case InteropVertexBuffer:
+ glType_ = GL_RESOURCE_ATTACH_VERTEXBUFFER_AMD;
+ break;
+ case InteropRenderBuffer:
+ glType_ = GL_RESOURCE_ATTACH_RENDERBUFFER_AMD;
+ break;
+ case InteropTexture:
+ case InteropTextureViewLevel:
+ case InteropTextureViewCube:
+ glType_ = GL_RESOURCE_ATTACH_TEXTURE_AMD;
+ break;
+ default:
+ LogError("Unknown OGL interop type!");
+ return false;
+ break;
}
glPlatformContext_ = oglRes->glPlatformContext_;
layer = oglRes->layer_;
@@ -667,17 +658,18 @@ bool Resource::CreateInterop(CreateParams* params)
mipLevel = oglRes->mipLevel_;
if (!dev().resGLAssociate(oglRes->glPlatformContext_, oglRes->handle_, glType_,
- &openInfo.hExternalResource, &glInteropMbRes_, &offset_, desc_.format_
+ &openInfo.hExternalResource, &glInteropMbRes_, &offset_, desc_.format_
#ifdef ATI_OS_WIN
- , openInfo.doppDesktopInfo
+ ,
+ openInfo.doppDesktopInfo
#endif
- )) {
+ )) {
return false;
}
desc_.isDoppTexture_ = (openInfo.doppDesktopInfo.gpuVirtAddr != 0);
format = dev().getPalFormat(desc().format_, &channels);
}
-#ifdef ATI_OS_WIN
+#ifdef ATI_OS_WIN
else {
D3DInteropParams* d3dRes = reinterpret_cast(params);
openInfo.hExternalResource = d3dRes->handle_;
@@ -713,8 +705,8 @@ bool Resource::CreateInterop(CreateParams* params)
size_t gpuMemSize;
if (Pal::Result::Success !=
- dev().iDev()->GetExternalSharedImageSizes(imgOpenInfo, &imageSize, &gpuMemSize,
- &imgCreateInfo)) {
+ dev().iDev()->GetExternalSharedImageSizes(imgOpenInfo, &imageSize, &gpuMemSize,
+ &imgCreateInfo)) {
return false;
}
@@ -736,51 +728,51 @@ bool Resource::CreateInterop(CreateParams* params)
imgCreateInfo.depthPitch = desc().height_ * imgCreateInfo.rowPitch;
switch (misc) {
- case 1: // NV12 or P010 formats
- switch (layer) {
- case -1:
- case 0:
+ case 1: // NV12 or P010 formats
+ switch (layer) {
+ case -1:
+ case 0:
+ break;
+ case 1:
+ // Y - plane size to the offset
+ // NV12 format. UV is 2 times smaller plane Y
+ viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
+ imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
+ break;
+ default:
+ LogError("Unknown Interop View Type");
+ return false;
+ }
break;
- case 1:
- // Y - plane size to the offset
- // NV12 format. UV is 2 times smaller plane Y
- viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
+ case 2: // YV12 format
+ switch (layer) {
+ case -1:
+ case 0:
+ break;
+ case 1:
+ // Y - plane size to the offset
+ // YV12 format. U is 4 times smaller plane than Y
+ viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
+ imgCreateInfo.rowPitch >>= 1;
+ break;
+ case 2:
+ // Y + U plane sizes to the offest.
+ // U plane is 4 times smaller than Y and U == V
+ viewOffset = 5 * imgCreateInfo.rowPitch * desc().height_ / 2;
+ imgCreateInfo.rowPitch >>= 1;
+ break;
+ default:
+ LogError("Unknown Interop View Type");
+ return false;
+ }
+ imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
+ break;
+ case 3: // YUY2 format
imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
break;
default:
LogError("Unknown Interop View Type");
return false;
- }
- break;
- case 2: // YV12 format
- switch (layer) {
- case -1:
- case 0:
- break;
- case 1:
- // Y - plane size to the offset
- // YV12 format. U is 4 times smaller plane than Y
- viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
- imgCreateInfo.rowPitch >>= 1;
- break;
- case 2:
- // Y + U plane sizes to the offest.
- // U plane is 4 times smaller than Y and U == V
- viewOffset = 5 * imgCreateInfo.rowPitch * desc().height_ / 2;
- imgCreateInfo.rowPitch >>= 1;
- break;
- default:
- LogError("Unknown Interop View Type");
- return false;
- }
- imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
- break;
- case 3: // YUY2 format
- imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
- break;
- default:
- LogError("Unknown Interop View Type");
- return false;
}
imageSize = dev().iDev()->GetImageSize(imgCreateInfo, &result);
@@ -820,8 +812,7 @@ bool Resource::CreateInterop(CreateParams* params)
hwState_[10] = static_cast(desc().width_);
hwState_[11] = 0; // one extra reserved field in the argument
}
- }
- else if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+ } else if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
memRef_ = GpuMemoryReference::Create(dev(), gpuMemOpenInfo);
if (nullptr == memRef_) {
return false;
@@ -842,8 +833,7 @@ bool Resource::CreateInterop(CreateParams* params)
hwState_[9] = GetHSAILImageOrderType(desc().format_);
hwState_[10] = static_cast(desc().width_);
hwState_[11] = 0; // one extra reserved field in the argument
- }
- else {
+ } else {
Pal::ExternalImageOpenInfo imgOpenInfo = {};
Pal::ImageCreateInfo imgCreateInfo = {};
imgOpenInfo.resourceInfo = openInfo;
@@ -865,14 +855,14 @@ bool Resource::CreateInterop(CreateParams* params)
viewInfo.possibleLayouts.usages = Pal::LayoutShaderWrite;
viewInfo.viewType = Pal::ImageViewType::Tex2d;
switch (imgCreateInfo.imageType) {
- case Pal::ImageType::Tex3d:
- viewInfo.viewType = Pal::ImageViewType::Tex3d;
- break;
- case Pal::ImageType::Tex1d:
- viewInfo.viewType = Pal::ImageViewType::Tex1d;
- break;
- default:
- break;
+ case Pal::ImageType::Tex3d:
+ viewInfo.viewType = Pal::ImageViewType::Tex3d;
+ break;
+ case Pal::ImageType::Tex1d:
+ viewInfo.viewType = Pal::ImageViewType::Tex1d;
+ break;
+ default:
+ break;
}
viewInfo.pImage = image_;
viewInfo.swizzledFormat.format = format;
@@ -897,14 +887,13 @@ bool Resource::CreateInterop(CreateParams* params)
//! It's a workaround for D24S8 format, since PAL doesn't support this format
//! and GSL decompresses 24bit DEPTH into D24S8 for OGL compatibility
if ((desc().format_.image_channel_order == CL_DEPTH_STENCIL) &&
- (desc().format_.image_channel_data_type == CL_UNORM_INT24)) {
- if (dev().settings().gfx10Plus_) {
- hwState_[1] = (hwState_[1] & ~0x1ff00000) | 0x08d00000;
- }
- else {
- hwState_[1] &= ~0x3c000000;
- hwState_[1] = (hwState_[1] & ~0x3f00000) | 0x1400000;
- }
+ (desc().format_.image_channel_data_type == CL_UNORM_INT24)) {
+ if (dev().settings().gfx10Plus_) {
+ hwState_[1] = (hwState_[1] & ~0x1ff00000) | 0x08d00000;
+ } else {
+ hwState_[1] &= ~0x3c000000;
+ hwState_[1] = (hwState_[1] & ~0x3f00000) | 0x1400000;
+ }
}
hwState_[8] = GetHSAILImageFormatType(desc().format_);
hwState_[9] = GetHSAILImageOrderType(desc().format_);
@@ -915,8 +904,7 @@ bool Resource::CreateInterop(CreateParams* params)
}
// ================================================================================================
-bool Resource::CreatePinned(CreateParams* params)
-{
+bool Resource::CreatePinned(CreateParams* params) {
PinnedParams* pinned = reinterpret_cast(params);
size_t allocSize = pinned->size_;
const amd::HostMemoryReference* hostMemRef = pinned->hostMemRef_;
@@ -926,7 +914,7 @@ bool Resource::CreatePinned(CreateParams* params)
if (desc().topology_ == CL_MEM_OBJECT_BUFFER) {
// Allign offset to 4K boundary (Vista/Win7 limitation)
char* tmpHost = const_cast(
- amd::alignDown(reinterpret_cast(address_), PinnedMemoryAlignment));
+ amd::alignDown(reinterpret_cast(address_), PinnedMemoryAlignment));
// Find the partial size for unaligned copy
hostMemOffset = static_cast(reinterpret_cast(address_) - tmpHost);
@@ -940,18 +928,16 @@ bool Resource::CreatePinned(CreateParams* params)
}
allocSize = amd::alignUp(allocSize, PinnedMemoryAlignment);
// hostMemOffset &= ~(0xff);
- }
- else if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D) {
+ } else if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D) {
//! @todo: Width has to be aligned for 3D.
//! Need to be replaced with a compute copy
// Width aligned by 8 texels
if (((desc().width_ % 0x8) != 0) ||
- // Pitch aligned by 64 bytes
- (((desc().width_ * elementSize()) % 0x40) != 0)) {
+ // Pitch aligned by 64 bytes
+ (((desc().width_ * elementSize()) % 0x40) != 0)) {
return false;
}
- }
- else {
+ } else {
//! @todo GSL doesn't support pinning with resAlloc_
return false;
}
@@ -978,8 +964,7 @@ bool Resource::CreatePinned(CreateParams* params)
}
// ================================================================================================
-bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
-{
+bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr) {
const bool isFineGrain = (memoryType() == RemoteUSWC) || (memoryType() == Remote);
size_t allocSize = amd::alignUp(desc().width_ * elementSize_,
dev().properties().gpuMemoryProperties.fragmentSize);
@@ -991,20 +976,18 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
if (svmPtr != 0) {
createInfo.flags.useReservedGpuVa = true;
createInfo.pReservedGpuVaOwner = params->svmBase_->iMem();
- }
- else {
+ } else {
createInfo.flags.useReservedGpuVa = false;
createInfo.pReservedGpuVaOwner = nullptr;
}
if (!dev().settings().svmFineGrainSystem_) {
- memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
- createInfo.alignment, createInfo.pReservedGpuVaOwner, &subOffset_);
+ memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
+ createInfo.pReservedGpuVaOwner, &subOffset_);
}
if (memRef_ == nullptr) {
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
}
- }
- else {
+ } else {
Pal::GpuMemoryCreateInfo createInfo = {};
createInfo.size = allocSize;
createInfo.alignment = MaxGpuAlignment;
@@ -1015,8 +998,8 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
createInfo.pReservedGpuVaOwner = params->svmBase_->iMem();
}
memTypeToHeap(&createInfo);
- memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
- createInfo.alignment, createInfo.pReservedGpuVaOwner, &subOffset_);
+ memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
+ createInfo.pReservedGpuVaOwner, &subOffset_);
if (memRef_ == nullptr) {
createInfo.alignment = dev().properties().gpuMemoryProperties.fragmentSize;
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
@@ -1028,9 +1011,9 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
}
desc_.cardMemory_ = false;
if ((nullptr != params) && (nullptr != params->owner_) &&
- (nullptr != params->owner_->getSvmPtr())) {
+ (nullptr != params->owner_->getSvmPtr())) {
params->owner_->setSvmPtr(
- reinterpret_cast(memRef_->iMem()->Desc().gpuVirtAddr + subOffset_));
+ reinterpret_cast(memRef_->iMem()->Desc().gpuVirtAddr + subOffset_));
offset_ += static_cast(subOffset_);
}
return true;
@@ -1126,18 +1109,18 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
Pal::gpusize svmPtr = 0;
if ((nullptr != params) && (nullptr != params->owner_) &&
(nullptr != params->owner_->getSvmPtr())) {
- svmPtr = reinterpret_cast(params->owner_->getSvmPtr());
- desc_.SVMRes_ = true;
- svmPtr = (svmPtr == 1) ? 0 : svmPtr;
+ svmPtr = reinterpret_cast(params->owner_->getSvmPtr());
+ desc_.SVMRes_ = true;
+ svmPtr = (svmPtr == 1) ? 0 : svmPtr;
}
if (desc_.SVMRes_) {
- return CreateSvm(params, svmPtr);
+ return CreateSvm(params, svmPtr);
}
Pal::GpuMemoryCreateInfo createInfo = {};
createInfo.size = desc().width_ * elementSize_;
createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment);
- createInfo.alignment = desc().scratch_ ? 64*Ki : MaxGpuAlignment;
+ createInfo.alignment = desc().scratch_ ? 64 * Ki : MaxGpuAlignment;
createInfo.vaRange = Pal::VaRange::Default;
createInfo.priority = Pal::GpuMemPriority::Normal;
@@ -1152,8 +1135,8 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
memTypeToHeap(&createInfo);
// createInfo.priority;
- memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
- createInfo.alignment, nullptr, &subOffset_);
+ memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
+ nullptr, &subOffset_);
if (nullptr == memRef_) {
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
if (nullptr == memRef_) {
@@ -1172,14 +1155,13 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
}
// ================================================================================================
-void Resource::free()
-{
+void Resource::free() {
if (memRef_ == nullptr) {
return;
}
const bool wait =
- (memoryType() != ImageView) && (memoryType() != ImageBuffer) && (memoryType() != View);
+ (memoryType() != ImageView) && (memoryType() != ImageBuffer) && (memoryType() != View);
// OCL has to wait, even if resource is placed in the cache, since reallocation can occur
// and resource can be reused on another async queue without a wait on a busy operation
@@ -1190,8 +1172,7 @@ void Resource::free()
for (uint idx = 1; idx < dev().vgpus().size(); ++idx) {
dev().vgpus()[idx]->waitForEvent(&events_[idx]);
}
- }
- else {
+ } else {
amd::ScopedLock l(memRef_->gpu_->execution());
memRef_->gpu_->waitForEvent(&events_[memRef_->gpu_->index()]);
}
@@ -1232,8 +1213,7 @@ void Resource::free()
// ================================================================================================
void Resource::writeRawData(VirtualGPU& gpu, size_t offset, size_t size, const void* data,
- bool waitForEvent) const
-{
+ bool waitForEvent) const {
GpuEvent event;
// Write data size bytes to surface
@@ -1242,7 +1222,7 @@ void Resource::writeRawData(VirtualGPU& gpu, size_t offset, size_t size, const v
gpu.eventBegin(MainEngine);
gpu.queue(MainEngine).addCmdMemRef(memRef());
gpu.iCmd()->CmdUpdateMemory(*iMem(), offset_ + offset, size,
- reinterpret_cast(data));
+ reinterpret_cast(data));
gpu.eventEnd(MainEngine, event);
if (waitForEvent) {
@@ -1259,8 +1239,7 @@ void Resource::writeRawData(VirtualGPU& gpu, size_t offset, size_t size, const v
}
// ================================================================================================
-static const Pal::ChNumFormat ChannelFmt(uint bytesPerElement)
-{
+static const Pal::ChNumFormat ChannelFmt(uint bytesPerElement) {
if (bytesPerElement == 16) {
return Pal::ChNumFormat::X32Y32Z32W32_Uint;
} else if (bytesPerElement == 8) {
@@ -1292,8 +1271,7 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
if (desc().buffer_ && !dstResource.desc().buffer_) {
imageOffsetx = dstOrigin[0] % dstResource.elementSize();
gpuMemoryOffset = srcOrigin[0] + offset();
- gpuMemoryRowPitch =
- (srcOrigin[1]) ? srcOrigin[1] : size[0] * dstResource.elementSize();
+ gpuMemoryRowPitch = (srcOrigin[1]) ? srcOrigin[1] : size[0] * dstResource.elementSize();
img1Darray = (dstResource.desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY);
img2Darray = (dstResource.desc().topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY);
} else if (!desc().buffer_ && dstResource.desc().buffer_) {
@@ -1374,7 +1352,8 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
}
copyRegion.gpuMemoryOffset = gpuMemoryOffset;
copyRegion.gpuMemoryRowPitch = gpuMemoryRowPitch;
- copyRegion.gpuMemoryDepthPitch = (dstOrigin[2]) ? dstOrigin[2]
+ copyRegion.gpuMemoryDepthPitch = (dstOrigin[2])
+ ? dstOrigin[2]
: copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height;
gpu.iCmd()->CmdCopyImageToMemory(*image_, imgLayout, *dstResource.iMem(), 1, &copyRegion);
} else {
@@ -1819,17 +1798,14 @@ void Resource::unmap(VirtualGPU* gpu) {
}
// ================================================================================================
-void Resource::unmapLayers(VirtualGPU* gpu) {
- Unimplemented();
-}
+void Resource::unmapLayers(VirtualGPU* gpu) { Unimplemented(); }
// ================================================================================================
bool MemorySubAllocator::InitAllocator(GpuMemoryReference* mem_ref) {
- MemBuddyAllocator* allocator = new MemBuddyAllocator(
- device_, device_->settings().subAllocationChunkSize_,
- device_->settings().subAllocationMinSize_);
- if (!((allocator != nullptr) &&
- (allocator->Init() == Pal::Result::Success) &&
+ MemBuddyAllocator* allocator =
+ new MemBuddyAllocator(device_, device_->settings().subAllocationChunkSize_,
+ device_->settings().subAllocationMinSize_);
+ if (!((allocator != nullptr) && (allocator->Init() == Pal::Result::Success) &&
heaps_.insert({mem_ref, allocator}).second)) {
mem_ref->release();
delete allocator;
@@ -1890,8 +1866,7 @@ bool FineMemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) {
}
// ================================================================================================
-MemorySubAllocator::~MemorySubAllocator()
-{
+MemorySubAllocator::~MemorySubAllocator() {
// Release memory heap for suballocations
for (const auto& it : heaps_) {
it.first->release();
@@ -1901,8 +1876,8 @@ MemorySubAllocator::~MemorySubAllocator()
// ================================================================================================
GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize alignment,
- const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset)
-{
+ const Pal::IGpuMemory* reserved_va,
+ Pal::gpusize* offset) {
GpuMemoryReference* mem_ref = nullptr;
MemBuddyAllocator* allocator = nullptr;
// Check if the resource size and alignment are allowed for suballocation
@@ -1927,7 +1902,7 @@ GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize
}
// We didn't find a valid chunk, so create a new one
if (!CreateChunk(reserved_va)) {
- return nullptr;
+ return nullptr;
}
i++;
} while (i < 2);
@@ -1936,8 +1911,7 @@ GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize
}
// ================================================================================================
-bool MemorySubAllocator::Free(amd::Monitor* monitor, GpuMemoryReference* ref, Pal::gpusize offset)
-{
+bool MemorySubAllocator::Free(amd::Monitor* monitor, GpuMemoryReference* ref, Pal::gpusize offset) {
bool release_mem = false;
{
amd::ScopedLock l(monitor);
@@ -1966,9 +1940,8 @@ ResourceCache::~ResourceCache() { free(); }
// ================================================================================================
//! \note the cache works in FILO mode
-bool ResourceCache::addGpuMemory(Resource::Descriptor* desc,
- GpuMemoryReference* ref, Pal::gpusize offset)
-{
+bool ResourceCache::addGpuMemory(Resource::Descriptor* desc, GpuMemoryReference* ref,
+ Pal::gpusize offset) {
bool result = false;
size_t size = ref->iMem()->Desc().size;
@@ -2017,7 +1990,9 @@ bool ResourceCache::addGpuMemory(Resource::Descriptor* desc,
// ================================================================================================
GpuMemoryReference* ResourceCache::findGpuMemory(Resource::Descriptor* desc, Pal::gpusize size,
- Pal::gpusize alignment, const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset) {
+ Pal::gpusize alignment,
+ const Pal::IGpuMemory* reserved_va,
+ Pal::gpusize* offset) {
amd::ScopedLock l(&lockCacheOps_);
GpuMemoryReference* ref = nullptr;
@@ -2051,7 +2026,7 @@ GpuMemoryReference* ResourceCache::findGpuMemory(Resource::Descriptor* desc, Pal
ref = it.second;
cacheSize_ -= sizeRes;
if (entry->type_ == Resource::Local) {
- lclCacheSize_ -= sizeRes;
+ lclCacheSize_ -= sizeRes;
}
delete it.first;
// Remove the found etry from the cache
@@ -2078,8 +2053,7 @@ bool ResourceCache::free(size_t minCacheEntries) {
}
// ================================================================================================
-void ResourceCache::removeLast()
-{
+void ResourceCache::removeLast() {
std::pair entry;
{
// Protect access to the global data
diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.hpp b/projects/clr/rocclr/runtime/device/pal/palresource.hpp
index 9b4c63f24a..c2fb0bcad0 100644
--- a/projects/clr/rocclr/runtime/device/pal/palresource.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palresource.hpp
@@ -41,11 +41,11 @@ class GpuMemoryReference : public amd::ReferenceCountedObject {
//! Get PAL memory object
Pal::IGpuMemory* iMem() const { return gpuMem_; }
- Pal::IGpuMemory* gpuMem_; //!< PAL GPU memory object
- void* cpuAddress_; //!< CPU address of this memory
- const Device& device_; //!< GPU device
+ Pal::IGpuMemory* gpuMem_; //!< PAL GPU memory object
+ void* cpuAddress_; //!< CPU address of this memory
+ const Device& device_; //!< GPU device
//! @note: This field is necessary for the thread safe release only
- VirtualGPU* gpu_; //!< Resource will be used only on this queue
+ VirtualGPU* gpu_; //!< Resource will be used only on this queue
protected:
//! Default destructor
@@ -186,7 +186,7 @@ class Resource : public amd::HeapObject {
//! Constructor of 1D Resource object
Resource(const Device& gpuDev, //!< GPU device object
size_t size //!< Resource size
- );
+ );
//! Constructor of Image Resource object
Resource(const Device& gpuDev, //!< GPU device object
@@ -196,7 +196,7 @@ class Resource : public amd::HeapObject {
cl_image_format format, //!< resource format
cl_mem_object_type imageType, //!< CL image type
uint mipLevels = 1 //!< Number of mip levels
- );
+ );
//! Destructor of the resource
virtual ~Resource();
@@ -207,7 +207,7 @@ class Resource : public amd::HeapObject {
*/
virtual bool create(MemoryType memType, //!< memory type
CreateParams* params = 0 //!< special parameters for resource allocation
- );
+ );
/*! \brief Copies a subregion of memory from one resource to another
*
@@ -253,14 +253,13 @@ class Resource : public amd::HeapObject {
Pal::IGpuMemory* iMem() const { return memRef_->iMem(); }
//! Returns a pointer to the memory reference
- GpuMemoryReference* memRef() const {return memRef_; }
+ GpuMemoryReference* memRef() const { return memRef_; }
//! Returns global memory offset
uint64_t vmAddress() const { return iMem()->Desc().gpuVirtAddr + offset_; }
//! Returns global memory offset
- uint64_t vmSize() const
- { return desc_.width_ * desc_.height_ * desc_.depth_ * elementSize(); }
+ uint64_t vmSize() const { return desc_.width_ * desc_.height_ * desc_.depth_ * elementSize(); }
//! Returns global memory offset
bool mipMapped() const { return (desc().mipLevels_ > 1) ? true : false; }
@@ -279,11 +278,11 @@ class Resource : public amd::HeapObject {
// Optimization for multilayer map/unmap
uint startLayer = 0, //!< Start layer for multilayer map
uint numLayers = 0 //!< End layer for multilayer map
- );
+ );
//! Unlocks the resource if it was locked
void unmap(VirtualGPU* gpu //!< Virtual GPU device object
- );
+ );
//! Marks the resource as busy
void setBusy(VirtualGPU& gpu, //!< Virtual GPU device object
@@ -303,7 +302,7 @@ class Resource : public amd::HeapObject {
uint flags = 0, //!< Map flags
size_t rowPitch = 0, //!< Raw data row pitch
size_t slicePitch = 0 //!< Raw data slice pitch
- );
+ );
//! Performs host read from the resource GPU memory
bool hostRead(VirtualGPU* gpu, //!< Virtual GPU device object
@@ -312,7 +311,7 @@ class Resource : public amd::HeapObject {
const amd::Coord3D& size, //!< The number of bytes to write
size_t rowPitch = 0, //!< Raw data row pitch
size_t slicePitch = 0 //!< Raw data slice pitch
- );
+ );
//! Gets the resource element size
uint elementSize() const { return elementSize_; }
@@ -377,7 +376,7 @@ class Resource : public amd::HeapObject {
memRef_ = viewOwner_->memRef_;
memRef_->retain();
desc_.width_ = amd::alignUp(size, Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint)) /
- Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
+ Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
setBusy(*memRef()->gpu_, GpuEvent::InvalidID);
}
}
@@ -390,33 +389,32 @@ class Resource : public amd::HeapObject {
protected:
/*! \brief Creates a PAL iamge object, associated with the resource
- *
- * \return True if we succesfully created a PAL resource
- */
- bool CreateImage(CreateParams* params //!< special parameters for resource allocation
- );
+ *
+ * \return True if we successfully created a PAL resource
+ */
+ bool CreateImage(CreateParams* params //!< special parameters for resource allocation
+ );
/*! \brief Creates a PAL interop object, associated with the resource
- *
- * \return True if we succesfully created a PAL interop resource
- */
- bool CreateInterop(CreateParams* params //!< special parameters for resource allocation
- );
+ *
+ * \return True if we successfully created a PAL interop resource
+ */
+ bool CreateInterop(CreateParams* params //!< special parameters for resource allocation
+ );
/*! \brief Creates a PAL pinned object, associated with the resource
- *
- * \return True if we succesfully created a PAL pinned resource
- */
- bool CreatePinned(CreateParams* params //!< special parameters for resource allocation
- );
+ *
+ * \return True if we successfully created a PAL pinned resource
+ */
+ bool CreatePinned(CreateParams* params //!< special parameters for resource allocation
+ );
/*! \brief Creates a PAL SVM object, associated with the resource
- *
- * \return True if we succesfully created a PAL SVM resource
- */
+ *
+ * \return True if we successfully created a PAL SVM resource
+ */
bool CreateSvm(CreateParams* params, //!< special parameters for resource allocation
- Pal::gpusize svmPtr
- );
+ Pal::gpusize svmPtr);
uint elementSize_; //!< Size of a single element in bytes
@@ -433,11 +431,11 @@ class Resource : public amd::HeapObject {
*/
void* mapLayers(VirtualGPU* gpu, //!< Virtual GPU device object
uint flags = 0 //!< flags for the map operation
- );
+ );
//! Unlocks the resource with layers if it was locked
void unmapLayers(VirtualGPU* gpu //!< Virtual GPU device object
- );
+ );
//! Calls PAL to map a resource
void* gpuMemoryMap(size_t* pitch, //!< Pitch value for the image
@@ -454,7 +452,7 @@ class Resource : public amd::HeapObject {
//! Converts Resource memory type to the PAL heaps
void memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo //!< Memory create info
- );
+ );
const Device& gpuDevice_; //!< GPU device
Descriptor desc_; //!< Descriptor for this resource
@@ -462,7 +460,7 @@ class Resource : public amd::HeapObject {
void* address_; //!< Physical address of this resource
size_t offset_; //!< Resource offset
GpuMemoryReference* memRef_; //!< PAL resource reference
- Pal::gpusize subOffset_; //!< GPU memory offset in the oririnal resource
+ Pal::gpusize subOffset_; //!< GPU memory offset in the original resource
const Resource* viewOwner_; //!< GPU resource, which owns this view
void* glInteropMbRes_; //!< Mb Res handle
uint32_t glType_; //!< GL interop type
@@ -485,41 +483,35 @@ class Resource : public amd::HeapObject {
typedef Util::BuddyAllocator MemBuddyAllocator;
class MemorySubAllocator : public amd::HeapObject {
-public:
+ public:
MemorySubAllocator(Device* device) : device_(device) {}
~MemorySubAllocator();
//! Create suballocation
- GpuMemoryReference* Allocate(Pal::gpusize size,
- Pal::gpusize alignment,
- const Pal::IGpuMemory* reserved_va,
- Pal::gpusize* offset
- );
+ GpuMemoryReference* Allocate(Pal::gpusize size, Pal::gpusize alignment,
+ const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset);
//! Free suballocation
- bool Free(amd::Monitor* monitor,
- GpuMemoryReference* mem_ref,
- Pal::gpusize offset
- );
+ bool Free(amd::Monitor* monitor, GpuMemoryReference* mem_ref, Pal::gpusize offset);
-protected:
+ protected:
//! Allocate new chunk of memory
virtual bool CreateChunk(const Pal::IGpuMemory* reserved_va);
bool InitAllocator(GpuMemoryReference* mem_ref);
Device* device_;
- std::unordered_map heaps_;
+ std::unordered_map heaps_;
};
class CoarseMemorySubAllocator : public MemorySubAllocator {
-public:
+ public:
CoarseMemorySubAllocator(Device* device) : MemorySubAllocator(device) {}
bool CreateChunk(const Pal::IGpuMemory* reservedVa) override;
};
class FineMemorySubAllocator : public MemorySubAllocator {
-public:
+ public:
FineMemorySubAllocator(Device* device) : MemorySubAllocator(device) {}
bool CreateChunk(const Pal::IGpuMemory* reserved_va) override;
@@ -529,29 +521,28 @@ class ResourceCache : public amd::HeapObject {
public:
//! Default constructor
ResourceCache(Device* device, size_t cacheSizeLimit)
- : lockCacheOps_("PAL resource cache", true)
- , cacheSize_(0)
- , lclCacheSize_(0)
- , cacheSizeLimit_(cacheSizeLimit)
- , mem_sub_alloc_local_(device)
- , mem_sub_alloc_coarse_ (device)
- , mem_sub_alloc_fine_ (device) {}
+ : lockCacheOps_("PAL resource cache", true),
+ cacheSize_(0),
+ lclCacheSize_(0),
+ cacheSizeLimit_(cacheSizeLimit),
+ mem_sub_alloc_local_(device),
+ mem_sub_alloc_coarse_(device),
+ mem_sub_alloc_fine_(device) {}
//! Default destructor
~ResourceCache();
//! Adds a PAL resource to the cache
- bool addGpuMemory(Resource::Descriptor* desc, //!< Resource descriptor - cache key
- GpuMemoryReference* ref, //!< Resource reference
- Pal::gpusize offset //!< Original resource offset
- );
+ bool addGpuMemory(Resource::Descriptor* desc, //!< Resource descriptor - cache key
+ GpuMemoryReference* ref, //!< Resource reference
+ Pal::gpusize offset //!< Original resource offset
+ );
//! Finds a PAL resource from the cache
GpuMemoryReference* findGpuMemory(
Resource::Descriptor* desc, //!< Resource descriptor - cache key
- Pal::gpusize size,
- Pal::gpusize alignment,
- const Pal::IGpuMemory* reserved_va, //!< Reserved VA for SVM suballocations
+ Pal::gpusize size, Pal::gpusize alignment,
+ const Pal::IGpuMemory* reserved_va, //!< Reserved VA for SVM suballocations
Pal::gpusize* offset);
//! Destroys cache
@@ -576,16 +567,17 @@ class ResourceCache : public amd::HeapObject {
amd::Monitor lockCacheOps_; //!< Lock to serialise cache access
- size_t cacheSize_; //!< Current cache size in bytes
- size_t lclCacheSize_; //!< Local memory stored in the cache
- const size_t cacheSizeLimit_; //!< Cache size limit in bytes
+ size_t cacheSize_; //!< Current cache size in bytes
+ size_t lclCacheSize_; //!< Local memory stored in the cache
+ const size_t cacheSizeLimit_; //!< Cache size limit in bytes
//! PAL resource cache
std::list > resCache_;
- MemorySubAllocator mem_sub_alloc_local_; //!< Allocator for suballocations in Local
- CoarseMemorySubAllocator mem_sub_alloc_coarse_; //!< Allocator for suballocations in Coarse SVM
- FineMemorySubAllocator mem_sub_alloc_fine_; //!< Allocator for suballocations in Fine SVM
+ MemorySubAllocator mem_sub_alloc_local_; //!< Allocator for suballocations in Local
+ CoarseMemorySubAllocator mem_sub_alloc_coarse_; //!< Allocator for suballocations in Coarse SVM
+ FineMemorySubAllocator mem_sub_alloc_fine_; //!< Allocator for suballocations in Fine SVM
};
-/*@}*/} // namespace pal
+/*@}*/ // namespace pal
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palsettings.cpp b/projects/clr/rocclr/runtime/device/pal/palsettings.cpp
index c663831670..131bb4afed 100644
--- a/projects/clr/rocclr/runtime/device/pal/palsettings.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palsettings.cpp
@@ -136,7 +136,7 @@ Settings::Settings() {
subAllocationMinSize_ = 4 * Ki;
subAllocationChunkSize_ = 64 * Mi;
subAllocationMaxSize_ =
- std::min(static_cast(GPU_MAX_SUBALLOC_SIZE) * Ki, subAllocationChunkSize_);
+ std::min(static_cast(GPU_MAX_SUBALLOC_SIZE) * Ki, subAllocationChunkSize_);
maxCmdBuffers_ = 12;
useLightning_ = GPU_ENABLE_LC;
@@ -148,8 +148,7 @@ Settings::Settings() {
bool Settings::create(const Pal::DeviceProperties& palProp,
const Pal::GpuMemoryHeapProperties* heaps, const Pal::WorkStationCaps& wscaps,
- bool reportAsOCL12Device)
-{
+ bool reportAsOCL12Device) {
uint32_t osVer = 0x0;
// Disable thread trace by default for all devices
@@ -198,8 +197,9 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
case Pal::AsicRevision::Navi10Lite:
gfx10Plus_ = true;
useLightning_ = (!flagIsDefault(GPU_ENABLE_LC)) ? GPU_ENABLE_LC : true;
- hsailExplicitXnack_ = static_cast(palProp.gpuMemoryProperties.flags.pageMigrationEnabled
- || palProp.gpuMemoryProperties.flags.iommuv2Support);
+ hsailExplicitXnack_ =
+ static_cast(palProp.gpuMemoryProperties.flags.pageMigrationEnabled ||
+ palProp.gpuMemoryProperties.flags.iommuv2Support);
enableWgpMode_ = GPU_ENABLE_WGP_MODE;
if (useLightning_) {
enableWave32Mode_ = true;
@@ -346,7 +346,7 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) {
splitSizeForWin7_ = true; // Update flag of DMA flush split size for Win 7
if (modifyMaxWorkload.time > 0) {
- maxWorkloadTime_ = modifyMaxWorkload.time; // Update max workload time
+ maxWorkloadTime_ = modifyMaxWorkload.time; // Update max workload time
}
}
#endif // defined(_WIN32)
diff --git a/projects/clr/rocclr/runtime/device/pal/palsettings.hpp b/projects/clr/rocclr/runtime/device/pal/palsettings.hpp
index b6e1d95441..6b8ee86768 100644
--- a/projects/clr/rocclr/runtime/device/pal/palsettings.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palsettings.hpp
@@ -39,63 +39,63 @@ class Settings : public device::Settings {
union {
struct {
- uint remoteAlloc_ : 1; //!< Allocate remote memory for the heap
- uint stagedXferRead_ : 1; //!< Uses a staged buffer read
- uint stagedXferWrite_ : 1; //!< Uses a staged buffer write
- uint disablePersistent_ : 1; //!< Disables using persistent memory for staging
- uint imageSupport_ : 1; //!< Report images support
- uint doublePrecision_ : 1; //!< Enables double precision support
- uint use64BitPtr_ : 1; //!< Use 64bit pointers on GPU
- uint force32BitOcl20_ : 1; //!< Force 32bit apps to take CLANG/HSAIL path on GPU
- uint imageDMA_ : 1; //!< Enable direct image DMA transfers
- uint viPlus_ : 1; //!< VI and post VI features
- uint aiPlus_ : 1; //!< AI and post AI features
- uint gfx10Plus_ : 1; //!< gfx10 and post gfx10 features
- uint threadTraceEnable_ : 1; //!< Thread trace enable
- uint linearPersistentImage_ : 1; //!< Allocates linear images in persistent
- uint useSingleScratch_ : 1; //!< Allocates single scratch per device
- uint svmAtomics_ : 1; //!< SVM device atomics
- uint svmFineGrainSystem_ : 1; //!< SVM fine grain system support
- uint useDeviceQueue_ : 1; //!< Submit to separate device queue
- uint sdamPageFaultWar_ : 1; //!< SDMA page fault workaround
- uint rgpSqttWaitIdle_: 1; //!< Wait for idle after SQTT trace
- uint rgpSqttForceDisable_: 1; //!< Disables SQTT
- uint splitSizeForWin7_: 1; //!< DMA flush split size for Win 7
+ uint remoteAlloc_ : 1; //!< Allocate remote memory for the heap
+ uint stagedXferRead_ : 1; //!< Uses a staged buffer read
+ uint stagedXferWrite_ : 1; //!< Uses a staged buffer write
+ uint disablePersistent_ : 1; //!< Disables using persistent memory for staging
+ uint imageSupport_ : 1; //!< Report images support
+ uint doublePrecision_ : 1; //!< Enables double precision support
+ uint use64BitPtr_ : 1; //!< Use 64bit pointers on GPU
+ uint force32BitOcl20_ : 1; //!< Force 32bit apps to take CLANG/HSAIL path on GPU
+ uint imageDMA_ : 1; //!< Enable direct image DMA transfers
+ uint viPlus_ : 1; //!< VI and post VI features
+ uint aiPlus_ : 1; //!< AI and post AI features
+ uint gfx10Plus_ : 1; //!< gfx10 and post gfx10 features
+ uint threadTraceEnable_ : 1; //!< Thread trace enable
+ uint linearPersistentImage_ : 1; //!< Allocates linear images in persistent
+ uint useSingleScratch_ : 1; //!< Allocates single scratch per device
+ uint svmAtomics_ : 1; //!< SVM device atomics
+ uint svmFineGrainSystem_ : 1; //!< SVM fine grain system support
+ uint useDeviceQueue_ : 1; //!< Submit to separate device queue
+ uint sdamPageFaultWar_ : 1; //!< SDMA page fault workaround
+ uint rgpSqttWaitIdle_ : 1; //!< Wait for idle after SQTT trace
+ uint rgpSqttForceDisable_ : 1; //!< Disables SQTT
+ uint splitSizeForWin7_ : 1; //!< DMA flush split size for Win 7
uint reserved_ : 11;
};
uint value_;
};
- uint oclVersion_; //!< Reported OpenCL version support
- uint debugFlags_; //!< Debug GPU flags
- uint hwLDSSize_; //!< HW local data store size
- uint maxWorkGroupSize_; //!< Requested workgroup size for this device
- uint preferredWorkGroupSize_;//!< Requested preferred workgroup size for this device
- uint workloadSplitSize_; //!< Workload split size
- uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms
- uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms
- uint blitEngine_; //!< Blit engine type
- uint cacheLineSize_; //!< Cache line size in bytes
- uint cacheSize_; //!< L1 cache size in bytes
- uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings
- uint numDeviceEvents_; //!< The number of device events
- uint numWaitEvents_; //!< The number of wait events for device enqueue
- uint hostMemDirectAccess_; //!< Enables direct access to the host memory
- uint numScratchWavesPerCu_; //!< Maximum number of waves when scratch is enabled
- size_t xferBufSize_; //!< Transfer buffer size for image copy optimization
- size_t stagedXferSize_; //!< Staged buffer size
- size_t pinnedXferSize_; //!< Pinned buffer size for transfer
- size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
- size_t resourceCacheSize_; //!< Resource cache size in MB
- size_t numMemDependencies_; //!< The array size for memory dependencies tracking
- uint64_t maxAllocSize_; //!< Maximum single allocation size
- uint rgpSqttDispCount_; //!< The number of dispatches captured in SQTT
- uint maxCmdBuffers_; //!< Maximum number of command buffers allocated per queue
+ uint oclVersion_; //!< Reported OpenCL version support
+ uint debugFlags_; //!< Debug GPU flags
+ uint hwLDSSize_; //!< HW local data store size
+ uint maxWorkGroupSize_; //!< Requested workgroup size for this device
+ uint preferredWorkGroupSize_; //!< Requested preferred workgroup size for this device
+ uint workloadSplitSize_; //!< Workload split size
+ uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms
+ uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms
+ uint blitEngine_; //!< Blit engine type
+ uint cacheLineSize_; //!< Cache line size in bytes
+ uint cacheSize_; //!< L1 cache size in bytes
+ uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings
+ uint numDeviceEvents_; //!< The number of device events
+ uint numWaitEvents_; //!< The number of wait events for device enqueue
+ uint hostMemDirectAccess_; //!< Enables direct access to the host memory
+ uint numScratchWavesPerCu_; //!< Maximum number of waves when scratch is enabled
+ size_t xferBufSize_; //!< Transfer buffer size for image copy optimization
+ size_t stagedXferSize_; //!< Staged buffer size
+ size_t pinnedXferSize_; //!< Pinned buffer size for transfer
+ size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
+ size_t resourceCacheSize_; //!< Resource cache size in MB
+ size_t numMemDependencies_; //!< The array size for memory dependencies tracking
+ uint64_t maxAllocSize_; //!< Maximum single allocation size
+ uint rgpSqttDispCount_; //!< The number of dispatches captured in SQTT
+ uint maxCmdBuffers_; //!< Maximum number of command buffers allocated per queue
+
+ uint64_t subAllocationMinSize_; //!< Minimum size allowed for suballocations
+ uint64_t subAllocationMaxSize_; //!< Maximum size allowed with suballocations
+ uint64_t subAllocationChunkSize_; //!< Chunk size for suballocations
- uint64_t subAllocationMinSize_; //!< Minimum size allowed for suballocations
- uint64_t subAllocationMaxSize_; //!< Maximum size allowed with suballocations
- uint64_t subAllocationChunkSize_; //!< Chunk size for suballocaitons
-
amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler
//! Default constructor
@@ -106,7 +106,7 @@ class Settings : public device::Settings {
const Pal::GpuMemoryHeapProperties* heaps, //!< PAL heap settings
const Pal::WorkStationCaps& wscaps, //!< PAL workstation settings
bool reportAsOCL12Device = false //!< Report As OpenCL1.2 Device
- );
+ );
private:
//! Disable copy constructor
@@ -119,4 +119,5 @@ class Settings : public device::Settings {
void override();
};
-/*@}*/} // namespace pal
+/*@}*/ // namespace pal
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp b/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp
index 97b2d5e5ca..9691fa71a2 100644
--- a/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp
@@ -40,7 +40,7 @@ class TimeStamp : public amd::HeapObject {
Pal::IGpuMemory* iMem, //!< Buffer with the timer values
uint memOffset, //!< Offset in the buffer for the current TS
address cpuAddr //!< CPU pointer for the values in memory
- );
+ );
//! Default destructor
~TimeStamp();
@@ -114,4 +114,5 @@ class TimeStampCache : public amd::HeapObject {
uint tsOffset_; //!< Active offset in the current mem object
};
-/*@}*/} // namespace pal
+/*@}*/ // namespace pal
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index 82e7372933..ff8bffefae 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -70,8 +70,7 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueTy
if (qCreateInfo.engineType == Pal::EngineTypeExclusiveCompute) {
if (it != gpu.dev().exclusiveComputeEnginesId().end()) {
qCreateInfo.engineIndex = it->second;
- }
- else {
+ } else {
return nullptr;
}
}
@@ -97,8 +96,8 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueTy
}
size_t allocSize = qSize + max_command_buffers * (cmdSize + fSize);
- VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(gpu, palDev,
- residency_limit, max_command_buffers);
+ VirtualGPU::Queue* queue =
+ new (allocSize) VirtualGPU::Queue(gpu, palDev, residency_limit, max_command_buffers);
if (queue != nullptr) {
address addrQ = reinterpret_cast(&queue[1]);
// Create PAL queue object
@@ -163,16 +162,16 @@ VirtualGPU::Queue::~Queue() {
}
}
-Pal::Result VirtualGPU::Queue::UpdateAppPowerProfile()
-{
- std::wstring wsAppPathAndFileName = Device::appProfile()->wsAppPathAndFileName();
+Pal::Result VirtualGPU::Queue::UpdateAppPowerProfile() {
+ std::wstring wsAppPathAndFileName = Device::appProfile()->wsAppPathAndFileName();
- const wchar_t* wAppPathAndName = wsAppPathAndFileName.c_str();
- // Find the last occurance of the '\\' character and extract the name of the application as wide char.
- const wchar_t* wAppNamePtr = wcsrchr(wAppPathAndName, '\\');
- const wchar_t* wAppName = wAppNamePtr ? wAppNamePtr + 1 : wAppPathAndName;
+ const wchar_t* wAppPathAndName = wsAppPathAndFileName.c_str();
+ // Find the last occurrence of the '\\' character and extract the name of the application as wide
+ // char.
+ const wchar_t* wAppNamePtr = wcsrchr(wAppPathAndName, '\\');
+ const wchar_t* wAppName = wAppNamePtr ? wAppNamePtr + 1 : wAppPathAndName;
- return iQueue_->UpdateAppPowerProfile(wAppName, wAppPathAndName);
+ return iQueue_->UpdateAppPowerProfile(wAppName, wAppPathAndName);
}
void VirtualGPU::Queue::addCmdMemRef(GpuMemoryReference* mem) {
@@ -188,8 +187,7 @@ void VirtualGPU::Queue::addCmdMemRef(GpuMemoryReference* mem) {
memRef.pGpuMemory = iMem;
palMemRefs_.push_back(memRef);
// Check SDI memory object
- if (iMem->Desc().flags.isExternPhys &&
- (sdiReferences_.find(iMem) == sdiReferences_.end())) {
+ if (iMem->Desc().flags.isExternPhys && (sdiReferences_.find(iMem) == sdiReferences_.end())) {
sdiReferences_.insert(iMem);
palSdiRefs_.push_back(iMem);
}
@@ -268,8 +266,7 @@ bool VirtualGPU::Queue::flush() {
// Submit command buffer to OS
Pal::Result result;
if (gpu_.rgpCaptureEna()) {
- result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit(
- iQueue_, cmdBufIdCurrent_, submitInfo);
+ result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit(iQueue_, cmdBufIdCurrent_, submitInfo);
} else {
result = iQueue_->Submit(submitInfo);
}
@@ -383,28 +380,28 @@ void VirtualGPU::Queue::DumpMemoryReferences() const {
if (dump.is_open()) {
dump << start << " Queue: ";
switch (iQueue_->Type()) {
- case Pal::QueueTypeCompute:
- dump << "Compute";
- break;
- case Pal::QueueTypeDma:
- dump << "SDMA";
- break;
- default:
- dump << "unknown";
- break;
+ case Pal::QueueTypeCompute:
+ dump << "Compute";
+ break;
+ case Pal::QueueTypeDma:
+ dump << "SDMA";
+ break;
+ default:
+ dump << "unknown";
+ break;
}
dump << "\n"
- << "Resident memory resources:\n";
+ << "Resident memory resources:\n";
uint idx = 0;
for (auto it : memReferences_) {
dump << " " << idx << "\t[";
dump.setf(std::ios::hex, std::ios::basefield);
dump.setf(std::ios::showbase);
dump << (it.first)->iMem()->Desc().gpuVirtAddr << ", "
- << (it.first)->iMem()->Desc().gpuVirtAddr + (it.first)->iMem()->Desc().size;
+ << (it.first)->iMem()->Desc().gpuVirtAddr + (it.first)->iMem()->Desc().size;
dump.setf(std::ios::dec);
- dump << "] CbId:" << it.second <<
- ", Heap: " << (it.first)->iMem()->Desc().preferredHeap << "\n";
+ dump << "] CbId:" << it.second << ", Heap: " << (it.first)->iMem()->Desc().preferredHeap
+ << "\n";
idx++;
}
@@ -414,8 +411,7 @@ void VirtualGPU::Queue::DumpMemoryReferences() const {
for (size_t i = 0; i < signature.numParameters(); ++i) {
const amd::KernelParameterDescriptor& desc = signature.at(i);
// Find if the current argument is a memory object
- if ((desc.type_ == T_POINTER) &&
- (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) {
+ if ((desc.type_ == T_POINTER) && (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) {
dump << " " << desc.name_ << ": " << std::endl;
}
}
@@ -519,7 +515,7 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
// note: The array growth shouldn't occur under the normal conditions,
// but in a case when SVM path sends the amount of SVM ptrs over
// the max size of kernel arguments
- MemoryState* ptr = new MemoryState[maxMemObjectsInQueue_ << 1];
+ MemoryState* ptr = new MemoryState[maxMemObjectsInQueue_ << 1];
if (nullptr == ptr) {
numMemObjectsInQueue_ = 0;
return;
@@ -527,7 +523,7 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
maxMemObjectsInQueue_ <<= 1;
memcpy(ptr, memObjectsInQueue_, sizeof(MemoryState) * numMemObjectsInQueue_);
delete[] memObjectsInQueue_;
- memObjectsInQueue_= ptr;
+ memObjectsInQueue_ = ptr;
}
// Adjust the number of active objects
@@ -748,7 +744,6 @@ VirtualGPU::VirtualGPU(Device& device)
maskGroups_(1),
hsaQueueMem_(nullptr),
cmdAllocator_(nullptr) {
-
// Note: Virtual GPU device creation must be a thread safe operation
index_ = gpuDevice_.numOfVgpus_++;
gpuDevice_.vgpus_.resize(gpuDevice_.numOfVgpus());
@@ -780,8 +775,8 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
createInfo.flags.autoMemoryReuse = false;
createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
createInfo.allocInfo[Pal::CommandDataAlloc].allocSize =
- createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize =
- VirtualGPU::Queue::MaxCommands * (320 + ((profiling) ? 96 : 0));
+ createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize =
+ VirtualGPU::Queue::MaxCommands * (320 + ((profiling) ? 96 : 0));
createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocSize = 64 * Ki;
@@ -803,8 +798,9 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
const uint firstQueue = (dev().numComputeEngines() > 2) ? 1 : 0;
uint idx = index() % (dev().numComputeEngines() - firstQueue);
- uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs ? 0 :
- (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
+ uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs
+ ? 0
+ : (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
uint max_cmd_buffers = dev().settings().maxCmdBuffers_;
if (dev().numComputeEngines()) {
@@ -815,9 +811,9 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
// hwRing_ should be set 0 if forced to have single scratch buffer
hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx;
- queues_[MainEngine] = Queue::Create(*this, Pal::QueueTypeCompute, idx + firstQueue,
- cmdAllocator_, rtCUs, priority,
- residency_limit, max_cmd_buffers);
+ queues_[MainEngine] =
+ Queue::Create(*this, Pal::QueueTypeCompute, idx + firstQueue, cmdAllocator_, rtCUs,
+ priority, residency_limit, max_cmd_buffers);
if (nullptr == queues_[MainEngine]) {
return false;
}
@@ -832,20 +828,19 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
sdma = 1;
}
- queues_[SdmaEngine] =
- Queue::Create(*this, Pal::QueueTypeDma, sdma, cmdAllocator_,
- amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal,
- residency_limit, max_cmd_buffers);
+ queues_[SdmaEngine] = Queue::Create(
+ *this, Pal::QueueTypeDma, sdma, cmdAllocator_, amd::CommandQueue::RealTimeDisabled,
+ amd::CommandQueue::Priority::Normal, residency_limit, max_cmd_buffers);
if (nullptr == queues_[SdmaEngine]) {
return false;
}
} else {
- queues_[SdmaEngine] = Queue::Create(*this, Pal::QueueTypeCompute,
- idx, cmdAllocator_, rtCUs, amd::CommandQueue::Priority::Normal,
- residency_limit, max_cmd_buffers);
- if (nullptr == queues_[SdmaEngine]) {
- return false;
- }
+ queues_[SdmaEngine] =
+ Queue::Create(*this, Pal::QueueTypeCompute, idx, cmdAllocator_, rtCUs,
+ amd::CommandQueue::Priority::Normal, residency_limit, max_cmd_buffers);
+ if (nullptr == queues_[SdmaEngine]) {
+ return false;
+ }
}
} else {
Unimplemented();
@@ -921,7 +916,8 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
bool dbg_vmid = false;
state_.rgpCaptureEnabled_ = true;
dev().rgpCaptureMgr()->RegisterTimedQueue(2 * index(), queue(MainEngine).iQueue_, &dbg_vmid);
- dev().rgpCaptureMgr()->RegisterTimedQueue(2 * index() + 1, queue(SdmaEngine).iQueue_, &dbg_vmid);
+ dev().rgpCaptureMgr()->RegisterTimedQueue(2 * index() + 1, queue(SdmaEngine).iQueue_,
+ &dbg_vmid);
}
return true;
@@ -1511,99 +1507,99 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd) {
void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) {
bool unmapMip = false;
amd::Image* amdImage;
-{
- // Make sure VirtualGPU has an exclusive access to the resources
- amd::ScopedLock lock(execution());
+ {
+ // Make sure VirtualGPU has an exclusive access to the resources
+ amd::ScopedLock lock(execution());
- pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());
- amd::Memory* owner = memory->owner();
- const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
- if (nullptr == writeMapInfo) {
- LogError("Unmap without map call");
- return;
- }
- profilingBegin(vcmd, true);
-
- // Check if image is a mipmap and assign a saved view
- amdImage = owner->asImage();
- if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) &&
- (writeMapInfo->baseMip_ != nullptr)) {
- // Assign mip level view
- amdImage = writeMapInfo->baseMip_;
- // Clear unmap flags from the parent image
- memory->clearUnmapInfo(vcmd.mapPtr());
- memory = dev().getGpuMemory(amdImage);
- unmapMip = true;
- writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
- }
-
- // We used host memory
- if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) {
- if (writeMapInfo->isUnmapWrite()) {
- // Target is the backing store, so sync
- owner->signalWrite(nullptr);
- memory->syncCacheFromHost(*this);
+ pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());
+ amd::Memory* owner = memory->owner();
+ const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
+ if (nullptr == writeMapInfo) {
+ LogError("Unmap without map call");
+ return;
}
- // Remove memory from VA cache
- dev().removeVACache(memory);
- }
- // data check was added for persistent memory that failed to get aperture
- // and therefore are treated like a remote resource
- else if (memory->isPersistentDirectMap() && (memory->data() != nullptr)) {
- memory->unmap(this);
- } else if (memory->mapMemory() != nullptr) {
- if (writeMapInfo->isUnmapWrite()) {
- amd::Coord3D srcOrigin(0, 0, 0);
- // Target is a remote resource, so copy
- assert(memory->mapMemory() != nullptr);
- if (memory->desc().buffer_) {
- if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_,
- writeMapInfo->origin_, writeMapInfo->region_,
- writeMapInfo->isEntire())) {
- LogError("submitUnmapMemory() - copy failed");
- vcmd.setStatus(CL_OUT_OF_RESOURCES);
- }
- } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
- Memory* memoryBuf = memory;
- amd::Coord3D origin(writeMapInfo->origin_[0]);
- amd::Coord3D size(writeMapInfo->region_[0]);
- size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize();
- origin.c[0] *= elemSize;
- size.c[0] *= elemSize;
+ profilingBegin(vcmd, true);
- amd::Memory* bufferFromImage = createBufferFromImage(vcmd.memory());
- if (nullptr == bufferFromImage) {
- LogError("We should not fail buffer creation from image_buffer!");
+ // Check if image is a mipmap and assign a saved view
+ amdImage = owner->asImage();
+ if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) &&
+ (writeMapInfo->baseMip_ != nullptr)) {
+ // Assign mip level view
+ amdImage = writeMapInfo->baseMip_;
+ // Clear unmap flags from the parent image
+ memory->clearUnmapInfo(vcmd.mapPtr());
+ memory = dev().getGpuMemory(amdImage);
+ unmapMip = true;
+ writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
+ }
+
+ // We used host memory
+ if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) {
+ if (writeMapInfo->isUnmapWrite()) {
+ // Target is the backing store, so sync
+ owner->signalWrite(nullptr);
+ memory->syncCacheFromHost(*this);
+ }
+ // Remove memory from VA cache
+ dev().removeVACache(memory);
+ }
+ // data check was added for persistent memory that failed to get aperture
+ // and therefore are treated like a remote resource
+ else if (memory->isPersistentDirectMap() && (memory->data() != nullptr)) {
+ memory->unmap(this);
+ } else if (memory->mapMemory() != nullptr) {
+ if (writeMapInfo->isUnmapWrite()) {
+ amd::Coord3D srcOrigin(0, 0, 0);
+ // Target is a remote resource, so copy
+ assert(memory->mapMemory() != nullptr);
+ if (memory->desc().buffer_) {
+ if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_,
+ writeMapInfo->origin_, writeMapInfo->region_,
+ writeMapInfo->isEntire())) {
+ LogError("submitUnmapMemory() - copy failed");
+ vcmd.setStatus(CL_OUT_OF_RESOURCES);
+ }
+ } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
+ Memory* memoryBuf = memory;
+ amd::Coord3D origin(writeMapInfo->origin_[0]);
+ amd::Coord3D size(writeMapInfo->region_[0]);
+ size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize();
+ origin.c[0] *= elemSize;
+ size.c[0] *= elemSize;
+
+ amd::Memory* bufferFromImage = createBufferFromImage(vcmd.memory());
+ if (nullptr == bufferFromImage) {
+ LogError("We should not fail buffer creation from image_buffer!");
+ } else {
+ memoryBuf = dev().getGpuMemory(bufferFromImage);
+ }
+ if (!blitMgr().copyBuffer(*memory->mapMemory(), *memoryBuf, srcOrigin, origin, size,
+ writeMapInfo->isEntire())) {
+ LogError("submitUnmapMemory() - copy failed");
+ vcmd.setStatus(CL_OUT_OF_RESOURCES);
+ }
+ if (nullptr != bufferFromImage) {
+ bufferFromImage->release();
+ }
} else {
- memoryBuf = dev().getGpuMemory(bufferFromImage);
- }
- if (!blitMgr().copyBuffer(*memory->mapMemory(), *memoryBuf, srcOrigin, origin, size,
- writeMapInfo->isEntire())) {
- LogError("submitUnmapMemory() - copy failed");
- vcmd.setStatus(CL_OUT_OF_RESOURCES);
- }
- if (nullptr != bufferFromImage) {
- bufferFromImage->release();
- }
- } else {
- if (!blitMgr().copyBufferToImage(*memory->mapMemory(), *memory, srcOrigin,
- writeMapInfo->origin_, writeMapInfo->region_,
- writeMapInfo->isEntire())) {
- LogError("submitUnmapMemory() - copy failed");
- vcmd.setStatus(CL_OUT_OF_RESOURCES);
+ if (!blitMgr().copyBufferToImage(*memory->mapMemory(), *memory, srcOrigin,
+ writeMapInfo->origin_, writeMapInfo->region_,
+ writeMapInfo->isEntire())) {
+ LogError("submitUnmapMemory() - copy failed");
+ vcmd.setStatus(CL_OUT_OF_RESOURCES);
+ }
}
}
+ } else {
+ LogError("Unhandled unmap!");
+ vcmd.setStatus(CL_INVALID_VALUE);
}
- } else {
- LogError("Unhandled unmap!");
- vcmd.setStatus(CL_INVALID_VALUE);
+
+ // Clear unmap flags
+ memory->clearUnmapInfo(vcmd.mapPtr());
+
+ profilingEnd(vcmd);
}
-
- // Clear unmap flags
- memory->clearUnmapInfo(vcmd.mapPtr());
-
- profilingEnd(vcmd);
-}
// Release a view for a mipmap map
if (unmapMip) {
// Memory release should be outside of the execution lock,
@@ -1700,9 +1696,9 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
profilingBegin(cmd);
Memory* srcDevMem = static_cast(
- cmd.source().getDeviceMemory(*cmd.source().getContext().devices()[0]));
+ cmd.source().getDeviceMemory(*cmd.source().getContext().devices()[0]));
Memory* dstDevMem = static_cast(
- cmd.destination().getDeviceMemory(*cmd.destination().getContext().devices()[0]));
+ cmd.destination().getDeviceMemory(*cmd.destination().getContext().devices()[0]));
bool p2pAllowed = false;
#if 0
@@ -1728,16 +1724,15 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
amd::Coord3D dstOrigin(cmd.dstOrigin()[0]);
if (p2pAllowed) {
- result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin,
- size, cmd.isEntireMemory());
- }
- else {
+ result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size,
+ cmd.isEntireMemory());
+ } else {
amd::ScopedLock lock(dev().P2PStageOps());
Memory* dstStgMem = static_cast(
- dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0]));
+ dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0]));
Memory* srcStgMem = static_cast(
- dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0]));
-
+ dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0]));
+
size_t copy_size = Device::kP2PStagingSize;
size_t left_size = size[0];
amd::Coord3D stageOffset(0);
@@ -1750,11 +1745,11 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
amd::Coord3D cpSize(copy_size);
// Perform 2 step transfer with staging buffer
- result &= dev().xferMgr().copyBuffer(
- *srcDevMem, *dstStgMem, srcOrigin, stageOffset, cpSize);
+ result &=
+ dev().xferMgr().copyBuffer(*srcDevMem, *dstStgMem, srcOrigin, stageOffset, cpSize);
srcOrigin.c[0] += copy_size;
- result &= dstDevMem->dev().xferMgr().copyBuffer(
- *srcStgMem, *dstDevMem, stageOffset, dstOrigin, cpSize);
+ result &= dstDevMem->dev().xferMgr().copyBuffer(*srcStgMem, *dstDevMem, stageOffset,
+ dstOrigin, cpSize);
dstOrigin.c[0] += copy_size;
} while (left_size > 0);
}
@@ -1940,10 +1935,8 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) {
}
// ================================================================================================
-void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQueue)
-{
- AmdAqlWrap* wraps =
- (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]);
+void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQueue) {
+ AmdAqlWrap* wraps = (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]);
uint p = 0;
for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) {
if (wraps[i].state != 0) {
@@ -1963,11 +1956,9 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
print << "\twait_list: " << wraps[i].wait_list << "\n";
print << "\twait_num: " << wraps[i].wait_num << "\n";
uint offsEvents = wraps[i].wait_list - gpuDefQueue->virtualQueue_->vmAddress();
- size_t* events =
- reinterpret_cast(gpuDefQueue->virtualQueue_->data() + offsEvents);
+ size_t* events = reinterpret_cast(gpuDefQueue->virtualQueue_->data() + offsEvents);
for (j = 0; j < wraps[i].wait_num; ++j) {
- uint offs =
- static_cast(events[j]) - gpuDefQueue->virtualQueue_->vmAddress();
+ uint offs = static_cast(events[j]) - gpuDefQueue->virtualQueue_->vmAddress();
AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs);
print << "Wait Event#: " << j << "\n";
print << "\tState: " << eventD->state << "; Counter: " << eventD->counter << "\n";
@@ -1980,8 +1971,8 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
print << wraps[i].aql.grid_size_z << "]\n";
HSAILKernel* child = nullptr;
- for (auto it = hsaKernel.prog().kernels().begin();
- it != hsaKernel.prog().kernels().end(); ++it) {
+ for (auto it = hsaKernel.prog().kernels().begin(); it != hsaKernel.prog().kernels().end();
+ ++it) {
if (wraps[i].aql.kernel_object == static_cast(it->second)->gpuAqlCode()) {
child = static_cast(it->second);
}
@@ -1995,7 +1986,7 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
uint offsArg = kernarg_address - gpuDefQueue->virtualQueue_->vmAddress();
address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
print << "Kernel: " << child->name() << "\n";
- const amd::KernelSignature& signature = child->signature();
+ const amd::KernelSignature& signature = child->signature();
// Check if runtime has to setup hidden arguments
for (const auto it : signature.parameters()) {
@@ -2033,7 +2024,7 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
continue;
}
print << "\t" << it.name_ << ": ";
- for (int s = it.size_- 1; s >= 0; --s) {
+ for (int s = it.size_ - 1; s >= 0; --s) {
print.width(2);
print.fill('0');
print << static_cast(argum[s]);
@@ -2047,26 +2038,20 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
}
// ================================================================================================
-bool VirtualGPU::PreDeviceEnqueue(
- const amd::Kernel& kernel,
- const HSAILKernel& hsaKernel,
- VirtualGPU** gpuDefQueue,
- uint64_t* vmDefQueue)
-{
+bool VirtualGPU::PreDeviceEnqueue(const amd::Kernel& kernel, const HSAILKernel& hsaKernel,
+ VirtualGPU** gpuDefQueue, uint64_t* vmDefQueue) {
amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());
if (nullptr == defQueue) {
LogError("Default device queue wasn't allocated");
return false;
- }
- else {
+ } else {
if (dev().settings().useDeviceQueue_) {
*gpuDefQueue = static_cast(defQueue->vDev());
if ((*gpuDefQueue)->hwRing() == hwRing()) {
LogError("Can't submit the child kernels to the same HW ring as the host queue!");
return false;
}
- }
- else {
+ } else {
createVirtualQueue(defQueue->size());
*gpuDefQueue = this;
}
@@ -2086,15 +2071,10 @@ bool VirtualGPU::PreDeviceEnqueue(
}
// ================================================================================================
-void VirtualGPU::PostDeviceEnqueue(
- const amd::Kernel& kernel,
- const HSAILKernel& hsaKernel,
- VirtualGPU* gpuDefQueue,
- uint64_t vmDefQueue,
- uint64_t vmParentWrap,
- GpuEvent* gpuEvent)
-{
- uint32_t id = gpuEvent->id_;
+void VirtualGPU::PostDeviceEnqueue(const amd::Kernel& kernel, const HSAILKernel& hsaKernel,
+ VirtualGPU* gpuDefQueue, uint64_t vmDefQueue,
+ uint64_t vmParentWrap, GpuEvent* gpuEvent) {
+ uint32_t id = gpuEvent->id_;
amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());
// Make sure exculsive access to the device queue
@@ -2110,16 +2090,16 @@ void VirtualGPU::PostDeviceEnqueue(
// Add the termination handshake to the host queue
eventBegin(MainEngine);
iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
- vmParentWrap + offsetof(AmdAqlWrap, child_counter), 0,
- dev().settings().useDeviceQueue_);
+ vmParentWrap + offsetof(AmdAqlWrap, child_counter), 0,
+ dev().settings().useDeviceQueue_);
eventEnd(MainEngine, *gpuEvent);
}
// Get the global loop start before the scheduler
Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
static_cast(gpuDefQueue->blitMgr())
- .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0,
- gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
+ .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0,
+ gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
const static bool FlushL2 = true;
gpuDefQueue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue, FlushL2);
@@ -2127,8 +2107,7 @@ void VirtualGPU::PostDeviceEnqueue(
//! @note DMA flush must not occur between patch and the scheduler
Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
// Program parameters for the scheduler
- SchedulerParam* param = reinterpret_cast(
- gpuDefQueue->schedParams_->data());
+ SchedulerParam* param = reinterpret_cast(gpuDefQueue->schedParams_->data());
param->signal = 1;
// Scale clock to 1024 to avoid 64 bit div in the scheduler
param->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_;
@@ -2147,8 +2126,7 @@ void VirtualGPU::PostDeviceEnqueue(
param->numMaxWaves = 32 * dev().info().maxComputeUnits_;
param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_;
addVmMemory(scratchBuf);
- }
- else {
+ } else {
param->numMaxWaves = 0;
param->scratchSize = 0;
param->scratch = 0;
@@ -2162,8 +2140,8 @@ void VirtualGPU::PostDeviceEnqueue(
Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress();
gpuDefQueue->eventBegin(MainEngine);
gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd(
- signalAddr, loopStart,
- gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
+ signalAddr, loopStart,
+ gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
// Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call.
// Thus TS command for profiling has to follow in the next CB.
constexpr bool ForceSubmitFirst = true;
@@ -2173,10 +2151,10 @@ void VirtualGPU::PostDeviceEnqueue(
// Add the termination handshake to the host queue
eventBegin(MainEngine);
iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
- vmParentWrap + offsetof(AmdAqlWrap, child_counter),
- signalAddr, dev().settings().useDeviceQueue_);
+ vmParentWrap + offsetof(AmdAqlWrap, child_counter), signalAddr,
+ dev().settings().useDeviceQueue_);
if (id != gpuEvent->id_) {
- LogError("Something is wrong. ID mismatch!\n");
+ LogError("Something is wrong. ID mismatch!\n");
}
eventEnd(MainEngine, *gpuEvent);
}
@@ -2193,7 +2171,8 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
profilingBegin(vcmd);
// Submit kernel to HW
- if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(), vcmd.sharedMemBytes())) {
+ if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(),
+ vcmd.sharedMemBytes())) {
vcmd.setStatus(CL_INVALID_OPERATION);
}
@@ -2203,10 +2182,9 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
// ================================================================================================
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
const_address parameters, bool nativeMem,
- amd::Event* enqueueEvent, uint32_t sharedMemBytes)
-{
- size_t newOffset[3] = { 0, 0, 0 };
- size_t newGlobalSize[3] = { 0, 0, 0 };
+ amd::Event* enqueueEvent, uint32_t sharedMemBytes) {
+ size_t newOffset[3] = {0, 0, 0};
+ size_t newGlobalSize[3] = {0, 0, 0};
int dim = -1;
int iteration = 1;
@@ -2221,17 +2199,17 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
// If RGP capturing is enabled, then start SQTT trace
if (rgpCaptureEna()) {
- size_t newLocalSize[3] = { 1, 1, 1 };
+ size_t newLocalSize[3] = {1, 1, 1};
for (uint i = 0; i < sizes.dimensions(); i++) {
if (sizes.local()[i] != 0) {
newLocalSize[i] = sizes.local()[i];
}
}
- dev().rgpCaptureMgr()->PreDispatch(this, hsaKernel,
- // Report global size in workgroups, since that's the RGP trace semantics
- newGlobalSize[0] / newLocalSize[0],
- newGlobalSize[1] / newLocalSize[1],
- newGlobalSize[2] / newLocalSize[2]);
+ dev().rgpCaptureMgr()->PreDispatch(
+ this, hsaKernel,
+ // Report global size in workgroups, since that's the RGP trace semantics
+ newGlobalSize[0] / newLocalSize[0], newGlobalSize[1] / newLocalSize[1],
+ newGlobalSize[2] / newLocalSize[2]);
}
bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? true : false;
@@ -2257,8 +2235,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
// Check memory dependency and SVM objects
if (!processMemObjectsHSA(kernel, parameters, nativeMem, ldsSize)) {
- LogError("Wrong memory objects!");
- return false;
+ LogError("Wrong memory objects!");
+ return false;
}
bool needFlush = false;
// Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd
@@ -2305,15 +2283,14 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
// an extra loop is required.
const amd::KernelParameters& kernelParams = kernel.parameters();
amd::Memory* const* memories =
- reinterpret_cast(parameters + kernelParams.memoryObjOffset());
+ reinterpret_cast(parameters + kernelParams.memoryObjOffset());
for (uint32_t i = 0; i < kernel.signature().numMemories(); ++i) {
if (nativeMem) {
Memory* gpuMem = reinterpret_cast(memories)[i];
if (gpuMem != nullptr) {
gpuMem->setBusy(*this, gpuEvent);
}
- }
- else {
+ } else {
amd::Memory* mem = memories[i];
if (mem != nullptr) {
dev().getGpuMemory(mem)->setBusy(*this, gpuEvent);
@@ -2325,7 +2302,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
uint64_t vmParentWrap = 0;
// Program the kernel arguments for the GPU execution
hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
- *this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap);
+ *this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap);
if (nullptr == aqlPkt) {
LogError("Couldn't load kernel arguments");
return false;
@@ -2348,8 +2325,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
}
dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
- dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ?
- enqueueEvent->profilingInfo().waves_ : 0;
+ dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0;
dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
dispatchParam.workitemPrivateSegmentSize = hsaKernel.spillSegSize();
dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
@@ -2660,7 +2636,6 @@ void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) {
eventEnd(MainEngine, gpuEvent);
} else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) {
-
EngineType activeEngineID = engineID_;
engineID_ = static_cast(pGpuMemory->getGpuEvent(*this)->engineId_);
@@ -2669,8 +2644,8 @@ void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) {
addBarrier(RgpSqqtBarrierReason::SignalSubmit, FlushL2);
// Workarounds: We had systems where an extra delay was necessary.
{
- // Flush CB associated with the DGMA buffer
- isDone(pGpuMemory->getGpuEvent(*this));
+ // Flush CB associated with the DGMA buffer
+ isDone(pGpuMemory->getGpuEvent(*this));
}
eventBegin(engineID_);
@@ -2711,10 +2686,11 @@ void VirtualGPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd
pGpuMems[i] = pGpuMemory->iMem();
}
- dev().iDev()->AddGpuMemoryReferences(numObjects, pGpuMemRef, queues_[MainEngine]->iQueue_, Pal::GpuMemoryRefCantTrim);
+ dev().iDev()->AddGpuMemoryReferences(numObjects, pGpuMemRef, queues_[MainEngine]->iQueue_,
+ Pal::GpuMemoryRefCantTrim);
dev().iDev()->InitBusAddressableGpuMemory(queues_[MainEngine]->iQueue_, numObjects, pGpuMems);
if (numObjects != 0) {
- dev().iDev()->RemoveGpuMemoryReferences(numObjects, &pGpuMems[0], queues_[MainEngine]->iQueue_);
+ dev().iDev()->RemoveGpuMemoryReferences(numObjects, &pGpuMems[0], queues_[MainEngine]->iQueue_);
}
for (uint i = 0; i < numObjects; i++) {
@@ -3104,8 +3080,8 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
break;
}
// get svm non arugment information
- void* const* svmPtrArray = reinterpret_cast(
- params + kernelParams.getExecInfoOffset());
+ void* const* svmPtrArray =
+ reinterpret_cast(params + kernelParams.getExecInfoOffset());
for (size_t i = 0; i < count; i++) {
amd::Memory* memory = amd::MemObjMap::FindMemObj(svmPtrArray[i]);
if (nullptr == memory) {
@@ -3149,8 +3125,7 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
bool srdResource = false;
amd::Memory* const* memories =
reinterpret_cast(params + kernelParams.memoryObjOffset());
- const HSAILKernel& hsaKernel =
- static_cast(*(kernel.getDeviceKernel(dev())));
+ const HSAILKernel& hsaKernel = static_cast(*(kernel.getDeviceKernel(dev())));
const amd::KernelSignature& signature = kernel.signature();
ldsAddress = hsaKernel.ldsSize();
@@ -3225,10 +3200,10 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
addVmMemory(gpuMem);
const void* globalAddress = *reinterpret_cast(params + desc.offset_);
LogPrintfInfo("!\targ%d: %s %s = ptr:%p obj:[%p-%p] threadId : %zx\n", index,
- desc.typeName_.c_str(), desc.name_.c_str(),
- globalAddress, reinterpret_cast(gpuMem->vmAddress()),
- reinterpret_cast(gpuMem->vmAddress() + gpuMem->size()),
- std::this_thread::get_id());
+ desc.typeName_.c_str(), desc.name_.c_str(), globalAddress,
+ reinterpret_cast(gpuMem->vmAddress()),
+ reinterpret_cast(gpuMem->vmAddress() + gpuMem->size()),
+ std::this_thread::get_id());
//! Check if compiler expects read/write.
//! Note: SVM with subbuffers has an issue with tracking.
@@ -3255,30 +3230,28 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
}
if (gpuMem->desc().isDoppTexture_) {
addDoppRef(gpuMem, kernel.parameters().getExecNewVcop(),
- kernel.parameters().getExecPfpaVcop());
+ kernel.parameters().getExecPfpaVcop());
}
}
}
}
- }
- else if (desc.type_ == T_VOID) {
+ } else if (desc.type_ == T_VOID) {
if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
// Copy the current structure into CB1
- size_t gpuPtr = static_cast(cb(1)->UploadDataToHw(
- params + desc.offset_, desc.size_));
+ size_t gpuPtr =
+ static_cast(cb(1)->UploadDataToHw(params + desc.offset_, desc.size_));
// Then use a pointer in aqlArgBuffer to CB1
const auto it = hsaKernel.patch().find(desc.offset_);
// Patch the GPU VA address in the original arguments
WriteAqlArgAt(const_cast(params), &gpuPtr, sizeof(size_t), it->second);
addVmMemory(cb(1)->ActiveMemory());
}
- }
- else if (desc.type_ == T_SAMPLER) {
+ } else if (desc.type_ == T_SAMPLER) {
srdResource = true;
} else if (desc.type_ == T_QUEUE) {
uint32_t index = desc.info_.arrayIndex_;
- const amd::DeviceQueue* queue = reinterpret_cast(
- params + kernelParams.queueObjOffset())[index];
+ const amd::DeviceQueue* queue =
+ reinterpret_cast(params + kernelParams.queueObjOffset())[index];
VirtualGPU* gpuQueue = static_cast(queue->vDev());
uint64_t vmQueue;
if (dev().settings().useDeviceQueue_) {
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
index 13c83b3796..9e557e1f03 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
@@ -51,17 +51,18 @@ class VirtualGPU : public device::VirtualDevice {
Queue(const Queue&) = delete;
Queue& operator=(const Queue&) = delete;
- static Queue* Create(const VirtualGPU& gpu, //!< OCL virtual GPU object
- Pal::QueueType queueType, //!< PAL queue type
- uint engineIdx, //!< Select particular engine index
- Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator
- uint rtCU, //!< The number of reserved CUs
- amd::CommandQueue::Priority priority, //!< Queue priority
- uint64_t residency_limit, //!< Enables residency limit
- uint max_command_buffers //!< Number of allocated command buffers
- );
+ static Queue* Create(const VirtualGPU& gpu, //!< OCL virtual GPU object
+ Pal::QueueType queueType, //!< PAL queue type
+ uint engineIdx, //!< Select particular engine index
+ Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator
+ uint rtCU, //!< The number of reserved CUs
+ amd::CommandQueue::Priority priority, //!< Queue priority
+ uint64_t residency_limit, //!< Enables residency limit
+ uint max_command_buffers //!< Number of allocated command buffers
+ );
- Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, uint max_command_buffers)
+ Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit,
+ uint max_command_buffers)
: iQueue_(nullptr),
iCmdBuffs_(max_command_buffers, nullptr),
iCmdFences_(max_command_buffers, nullptr),
@@ -75,8 +76,7 @@ class VirtualGPU : public device::VirtualDevice {
vlAlloc_(64 * Ki),
residency_size_(0),
residency_limit_(residency_limit),
- max_command_buffers_(max_command_buffers)
- {
+ max_command_buffers_(max_command_buffers) {
vlAlloc_.Init();
}
@@ -100,8 +100,7 @@ class VirtualGPU : public device::VirtualDevice {
Pal::Result UpdateAppPowerProfile();
// ibReuse forces event wait without polling, to make sure event occured
- template
- bool waifForFence(uint cbId) const {
+ template bool waifForFence(uint cbId) const {
Pal::Result result = Pal::Result::Success;
uint64_t start;
uint64_t end;
@@ -138,8 +137,7 @@ class VirtualGPU : public device::VirtualDevice {
//! Flushes the current command buffer to HW
//! Returns ID associated with the submission
- template
- uint submit(bool forceFlush);
+ template uint submit(bool forceFlush);
bool flush();
@@ -151,28 +149,28 @@ class VirtualGPU : public device::VirtualDevice {
uint cmdBufId() const { return cmdBufIdCurrent_; }
- Pal::IQueue* iQueue_; //!< PAL queue object
- std::vector iCmdBuffs_; //!< PAL command buffers
- std::vector iCmdFences_; //!< PAL fences, associated with CMD
- const amd::Kernel* last_kernel_; //!< Last submitted kernel
+ Pal::IQueue* iQueue_; //!< PAL queue object
+ std::vector iCmdBuffs_; //!< PAL command buffers
+ std::vector iCmdFences_; //!< PAL fences, associated with CMD
+ const amd::Kernel* last_kernel_; //!< Last submitted kernel
- private:
+ private:
void DumpMemoryReferences() const;
- const VirtualGPU& gpu_; //!< OCL virtual GPU object
- Pal::IDevice* iDev_; //!< PAL device
- uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions
- uint cmdBufIdCurrent_; //!< Current global command buffer ID
- uint cmbBufIdRetired_; //!< The last retired command buffer ID
- uint cmdCnt_; //!< Counter of commands
+ const VirtualGPU& gpu_; //!< OCL virtual GPU object
+ Pal::IDevice* iDev_; //!< PAL device
+ uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions
+ uint cmdBufIdCurrent_; //!< Current global command buffer ID
+ uint cmbBufIdRetired_; //!< The last retired command buffer ID
+ uint cmdCnt_; //!< Counter of commands
std::unordered_map memReferences_;
- Util::VirtualLinearAllocator vlAlloc_;
- std::vector palMemRefs_;
- std::vector palMems_;
- std::vector palDoppRefs_;
- std::set sdiReferences_;
- std::vector palSdiRefs_;
- uint64_t residency_size_; //!< Resource residency size
- uint64_t residency_limit_; //!< Enables residency limit
+ Util::VirtualLinearAllocator vlAlloc_;
+ std::vector palMemRefs_;
+ std::vector palMems_;
+ std::vector palDoppRefs_;
+ std::set sdiReferences_;
+ std::vector palSdiRefs_;
+ uint64_t residency_size_; //!< Resource residency size
+ uint64_t residency_limit_; //!< Enables residency limit
uint max_command_buffers_;
};
@@ -185,14 +183,14 @@ class VirtualGPU : public device::VirtualDevice {
CommandBatch(amd::Command* head, //!< Command batch head
const GpuEvent* events, //!< HW events on all engines
TimeStamp* lastTS //!< Last TS in command batch
- ) {
+ ) {
init(head, events, lastTS);
}
void init(amd::Command* head, //!< Command batch head
const GpuEvent* events, //!< HW events on all engines
TimeStamp* lastTS //!< Last TS in command batch
- ) {
+ ) {
head_ = head;
lastTS_ = lastTS;
memcpy(&events_, events, AllEngines * sizeof(GpuEvent));
@@ -202,11 +200,11 @@ class VirtualGPU : public device::VirtualDevice {
//! The virtual GPU states
union State {
struct {
- uint profiling_ : 1; //!< Profiling is enabled
- uint forceWait_ : 1; //!< Forces wait in flush()
- uint profileEnabled_ : 1; //!< Profiling is enabled for WaveLimiter
- uint perfCounterEnabled_ : 1; //!< PerfCounter is enabled
- uint rgpCaptureEnabled_ : 1; //!< RGP capture is enabled in the runtime
+ uint profiling_ : 1; //!< Profiling is enabled
+ uint forceWait_ : 1; //!< Forces wait in flush()
+ uint profileEnabled_ : 1; //!< Profiling is enabled for WaveLimiter
+ uint perfCounterEnabled_ : 1; //!< PerfCounter is enabled
+ uint rgpCaptureEnabled_ : 1; //!< RGP capture is enabled in the runtime
};
uint value_;
State() : value_(0) {}
@@ -259,13 +257,13 @@ class VirtualGPU : public device::VirtualDevice {
void findSplitSize(const Device& dev, //!< GPU device object
uint64_t threads, //!< Total number of execution threads
uint instructions //!< Number of ALU instructions
- );
+ );
// Returns TRUE if DMA command buffer is ready for a flush
bool isCbReady(VirtualGPU& gpu, //!< Virtual GPU object
uint64_t threads, //!< Total number of execution threads
uint instructions //!< Number of ALU instructions
- );
+ );
// Returns dispatch split size
uint dispatchSplitSize() const { return dispatchSplitSize_; }
@@ -301,7 +299,7 @@ class VirtualGPU : public device::VirtualDevice {
bool nativeMem = true, //!< Native memory objects
amd::Event* enqueueEvent = nullptr, //!< Event provided in the enqueue kernel command
uint32_t sharedMemBytes = 0 //!< Shared memory size
- );
+ );
void submitNativeFn(amd::NativeFnCommand& vcmd);
void submitFillMemory(amd::FillMemoryCommand& vcmd);
void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
@@ -331,20 +329,20 @@ class VirtualGPU : public device::VirtualDevice {
//! Set the last known GPU event
void setGpuEvent(GpuEvent gpuEvent, //!< GPU event for tracking
bool flush = false //!< TRUE if flush is required
- );
+ );
//! Flush DMA buffer on the specified engine
void flushDMA(uint engineID //!< Engine ID for DMA flush
- );
+ );
//! Wait for all engines on this Virtual GPU
//! Returns TRUE if CPU didn't wait for GPU
bool waitAllEngines(CommandBatch* cb = nullptr //!< Command batch
- );
+ );
//! Waits for the latest GPU event with a lock to prevent multiple entries
void waitEventLock(CommandBatch* cb //!< Command batch
- );
+ );
//! Returns a resource associated with the constant buffer
const ConstantBuffer* cb(uint idx) const { return constBufs_[idx]; }
@@ -355,7 +353,7 @@ class VirtualGPU : public device::VirtualDevice {
//! Start the command profiling
void profilingBegin(amd::Command& command, //!< Command queue object
bool drmProfiling = false //!< Measure DRM time
- );
+ );
//! End the command profiling
void profilingEnd(amd::Command& command);
@@ -363,11 +361,11 @@ class VirtualGPU : public device::VirtualDevice {
//! Collect the profiling results
bool profilingCollectResults(CommandBatch* cb, //!< Command batch
const amd::Event* waitingEvent //!< Waiting event
- );
+ );
//! Adds a memory handle into the GSL memory array for Virtual Heap
inline void addVmMemory(const Memory* memory //!< GPU memory object
- );
+ );
//! Adds the last submitted kernel to the queue for tracking a possible hang
inline void AddKernel(const amd::Kernel& kernel //!< AMD kernel object
@@ -377,7 +375,7 @@ class VirtualGPU : public device::VirtualDevice {
void addDoppRef(const Memory* memory, //!< GPU memory object
bool lastDoopCmd, //!< is the last submission for the pre-present primary
bool pfpaDoppCmd //!< is a submission for the pre-present primary
- );
+ );
//! Return xfer buffer for staging operations
XferBuffer& xferWrite() { return writeBuffer_; }
@@ -429,7 +427,7 @@ class VirtualGPU : public device::VirtualDevice {
//! Returns TRUE if virtual queue was successfully allocatted
bool createVirtualQueue(uint deviceQueueSize //!< Device queue size
- );
+ );
EngineType engineID_; //!< Engine ID for this VirtualGPU
@@ -447,7 +445,8 @@ class VirtualGPU : public device::VirtualDevice {
//! Returns queue, associated with VirtualGPU
Queue& queue(EngineType id) const { return *queues_[id]; }
- void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::Unknown, bool flushL2 = false) const {
+ void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::Unknown,
+ bool flushL2 = false) const {
Pal::BarrierInfo barrier = {};
barrier.pipePointWaitCount = 1;
Pal::HwPipePoint point = Pal::HwPipePostCs;
@@ -508,7 +507,7 @@ class VirtualGPU : public device::VirtualDevice {
//! Returns TRUE if SDMA requires overlap synchronizaiton
bool validateSdmaOverlap(const Resource& src, //!< Source resource for SDMA transfer
const Resource& dst //!< Destination resource for SDMA transfer
- );
+ );
//! Checks if RGP capture is enabled
bool rgpCaptureEna() const { return state_.rgpCaptureEnabled_; }
@@ -519,7 +518,7 @@ class VirtualGPU : public device::VirtualDevice {
//! Creates buffer object from image
amd::Memory* createBufferFromImage(
amd::Memory& amdImage //! The parent image object(untiled images only)
- );
+ );
private:
struct MemoryRange {
@@ -537,14 +536,14 @@ class VirtualGPU : public device::VirtualDevice {
//! Awaits a command batch with a waiting event
bool awaitCompletion(CommandBatch* cb, //!< Command batch for to wait
const amd::Event* waitingEvent = nullptr //!< A waiting event
- );
+ );
//! Detects memory dependency for HSAIL kernels and flushes caches
bool processMemObjectsHSA(const amd::Kernel& kernel, //!< AMD kernel object for execution
const_address params, //!< Pointer to the param's store
bool nativeMem, //!< Native memory objects
- size_t& ldsAddess //!< Returns LDS size, used in the kernel
- );
+ size_t& ldsAddess //!< Returns LDS size, used in the kernel
+ );
//! Common function for fill memory used by both svm Fill and non-svm fill
bool fillMemory(cl_command_type type, //!< the command type
@@ -553,7 +552,7 @@ class VirtualGPU : public device::VirtualDevice {
size_t patternSize, //!< pattern size
const amd::Coord3D& origin, //!< memory origin
const amd::Coord3D& size //!< memory size for filling
- );
+ );
bool copyMemory(cl_command_type type, //!< the command type
amd::Memory& srcMem, //!< source memory object
@@ -564,35 +563,36 @@ class VirtualGPU : public device::VirtualDevice {
const amd::Coord3D& size, //!< copy size
const amd::BufferRect& srcRect, //!< region of source for copy
const amd::BufferRect& dstRect //!< region of destination for copy
- );
+ );
void buildKernelInfo(const HSAILKernel& hsaKernel, //!< hsa kernel
hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch
HwDbgKernelInfo& kernelInfo, //!< kernel info for the dispatch
amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command
- );
+ );
void assignDebugTrapHandler(const DebugToolInfo& dbgSetting, //!< debug settings
HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch
- );
+ );
void PrintChildren(const HSAILKernel& hsaKernel, //!< The parent HSAIL kernel
VirtualGPU* gpuDefQueue //!< Device queue for children execution
- );
+ );
- bool PreDeviceEnqueue(const amd::Kernel& kernel, //!< Parent amd kernel object
- const HSAILKernel& hsaKernel, //!< Parent HSAIL object
- VirtualGPU** gpuDefQueue, //!< [Return] GPU default queue
- uint64_t* vmDefQueue //!< [Return] VM handle to the virtual queue
- );
+ bool PreDeviceEnqueue(const amd::Kernel& kernel, //!< Parent amd kernel object
+ const HSAILKernel& hsaKernel, //!< Parent HSAIL object
+ VirtualGPU** gpuDefQueue, //!< [Return] GPU default queue
+ uint64_t* vmDefQueue //!< [Return] VM handle to the virtual queue
+ );
- void PostDeviceEnqueue(const amd::Kernel& kernel, //!< Parent amd kernel object
- const HSAILKernel& hsaKernel, //!< Parent HSAIL object
- VirtualGPU* gpuDefQueue, //!< GPU default queue
- uint64_t vmDefQueue, //!< VM handle to the virtual queue
- uint64_t vmParentWrap, //!< VM handle to the wrapped AQL packet location
- GpuEvent* gpuEvent //!< [Return] GPU event associated with the device enqueue
- );
+ void PostDeviceEnqueue(
+ const amd::Kernel& kernel, //!< Parent amd kernel object
+ const HSAILKernel& hsaKernel, //!< Parent HSAIL object
+ VirtualGPU* gpuDefQueue, //!< GPU default queue
+ uint64_t vmDefQueue, //!< VM handle to the virtual queue
+ uint64_t vmParentWrap, //!< VM handle to the wrapped AQL packet location
+ GpuEvent* gpuEvent //!< [Return] GPU event associated with the device enqueue
+ );
Device& gpuDevice_; //!< physical GPU device
amd::Monitor execution_; //!< Lock to serialise access to all device objects
@@ -605,11 +605,11 @@ class VirtualGPU : public device::VirtualDevice {
DmaFlushMgmt dmaFlushMgmt_; //!< DMA flush management
- std::vector pinnedMems_; //!< Pinned memory list
+ std::vector pinnedMems_; //!< Pinned memory list
- ManagedBuffer managedBuffer_; //!< Managed write buffer
- constbufs_t constBufs_; //!< constant buffers
- XferBuffer writeBuffer_; //!< Transfer/staging buffer for uploads
+ ManagedBuffer managedBuffer_; //!< Managed write buffer
+ constbufs_t constBufs_; //!< constant buffers
+ XferBuffer writeBuffer_; //!< Transfer/staging buffer for uploads
typedef std::queue CommandBatchQueue;
CommandBatchQueue cbQueue_; //!< Queue of command batches
@@ -617,12 +617,12 @@ class VirtualGPU : public device::VirtualDevice {
uint hwRing_; //!< HW ring used on this virtual device
- State state_; //!< virtual GPU current state
+ State state_; //!< virtual GPU current state
GpuEvent events_[AllEngines]; //!< Last known GPU events
- uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps
- TimeStamp* lastTS_; //!< Last timestamp executed on Virtual GPU
- TimeStamp* profileTs_; //!< current profiling timestamp for command
+ uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps
+ TimeStamp* lastTS_; //!< Last timestamp executed on Virtual GPU
+ TimeStamp* profileTs_; //!< current profiling timestamp for command
AmdVQueueHeader* vqHeader_; //!< Sysmem copy for virtual queue header
Memory* virtualQueue_; //!< Virtual device queue
@@ -645,8 +645,7 @@ inline void VirtualGPU::AddKernel(const amd::Kernel& kernel) const {
queues_[MainEngine]->last_kernel_ = &kernel;
}
-template
-uint VirtualGPU::Queue::submit(bool forceFlush) {
+template uint VirtualGPU::Queue::submit(bool forceFlush) {
cmdCnt_++;
uint id = cmdBufIdCurrent_;
bool flushCmd = ((cmdCnt_ > MaxCommands) || forceFlush) && !avoidBarrierSubmit;
@@ -659,32 +658,30 @@ uint VirtualGPU::Queue::submit(bool forceFlush) {
}
template
-inline void WriteAqlArgAt(
- unsigned char* dst, //!< The write pointer to the buffer
- const T* src, //!< The source pointer
- uint size, //!< The size in bytes to copy
- size_t offset //!< The alignment to follow while writing to the buffer
+inline void WriteAqlArgAt(unsigned char* dst, //!< The write pointer to the buffer
+ const T* src, //!< The source pointer
+ uint size, //!< The size in bytes to copy
+ size_t offset //!< The alignment to follow while writing to the buffer
) {
memcpy(dst + offset, src, size);
}
template <>
-inline void WriteAqlArgAt(
- unsigned char* dst, //!< The write pointer to the buffer
- const uint32_t* src, //!< The source pointer
- uint size, //!< The size in bytes to copy
- size_t offset //!< The alignment to follow while writing to the buffer
+inline void WriteAqlArgAt(unsigned char* dst, //!< The write pointer to the buffer
+ const uint32_t* src, //!< The source pointer
+ uint size, //!< The size in bytes to copy
+ size_t offset //!< The alignment to follow while writing to the buffer
) {
*(reinterpret_cast(dst + offset)) = *src;
}
template <>
-inline void WriteAqlArgAt(
- unsigned char* dst, //!< The write pointer to the buffer
- const uint64_t* src, //!< The source pointer
- uint size, //!< The size in bytes to copy
- size_t offset //!< The alignment to follow while writing to the buffer
+inline void WriteAqlArgAt(unsigned char* dst, //!< The write pointer to the buffer
+ const uint64_t* src, //!< The source pointer
+ uint size, //!< The size in bytes to copy
+ size_t offset //!< The alignment to follow while writing to the buffer
) {
*(reinterpret_cast(dst + offset)) = *src;
}
-/*@}*/} // namespace pal
+/*@}*/ // namespace pal
+} // namespace pal