P4 to Git Change 1780358 by gandryey@gera-win10 on 2019/05/08 18:46:22
SWDEV-79445 - OCL generic changes and code clean-up
- Run Google autoformat over the PAL backend. This will allow autoformat to be enabled in VS for future changes.
- No functional changes
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palappprofile.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palappprofile.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#29 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.hpp#8 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.cpp#12 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.hpp#10 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcounters.cpp#20 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcounters.hpp#10 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugger.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugmanager.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldefs.hpp#52 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#133 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#37 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d10.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d11.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d9.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevicegl.cpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#78 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#28 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.hpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprintf.hpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#93 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#38 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#73 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#79 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#22 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltimestamp.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#132 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#60 edit
[ROCm/clr commit: 699a12bfa2]
Этот коммит содержится в:
@@ -11,8 +11,9 @@ namespace pal {
|
||||
|
||||
AppProfile::AppProfile()
|
||||
: amd::AppProfile(), enableHighPerformanceState_(true), reportAsOCL12Device_(false) {
|
||||
propertyDataMap_.insert({"HighPerfState", PropertyData(DataType_Boolean, &enableHighPerformanceState_)});
|
||||
propertyDataMap_.insert(
|
||||
{"HighPerfState", PropertyData(DataType_Boolean, &enableHighPerformanceState_)});
|
||||
|
||||
propertyDataMap_.insert({"OCL12Device", PropertyData(DataType_Boolean, &reportAsOCL12Device_)});
|
||||
}
|
||||
}
|
||||
} // namespace pal
|
||||
|
||||
@@ -20,4 +20,4 @@ class AppProfile : public amd::AppProfile {
|
||||
bool enableHighPerformanceState_;
|
||||
bool reportAsOCL12Device_;
|
||||
};
|
||||
}
|
||||
} // namespace pal
|
||||
|
||||
@@ -280,8 +280,8 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
|
||||
amd::Coord3D copySize(tmpSize, 0, 0);
|
||||
|
||||
// Copy data into the temporary buffer, using CPU
|
||||
if (!xferBuf.hostWrite(&gpu(), reinterpret_cast<const char*>(srcHost) + offset,
|
||||
src, copySize, flags)) {
|
||||
if (!xferBuf.hostWrite(&gpu(), reinterpret_cast<const char*>(srcHost) + offset, src, copySize,
|
||||
flags)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -296,7 +296,7 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
|
||||
srcOffset += tmpSize;
|
||||
if ((srcOffset + tmpSize) > gpu().xferWrite().MaxSize()) {
|
||||
srcOffset = 0;
|
||||
flags = 0;
|
||||
flags = 0;
|
||||
} else {
|
||||
flags = Resource::NoWait;
|
||||
}
|
||||
@@ -310,7 +310,7 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
|
||||
// Use host copy if memory has direct access or it's persistent
|
||||
if (setup_.disableWriteBuffer_ ||
|
||||
(gpuMem(dstMemory).isHostMemDirectAccess() &&
|
||||
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
|
||||
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
|
||||
gpuMem(dstMemory).isPersistentDirectMap()) {
|
||||
return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
|
||||
} else {
|
||||
@@ -335,7 +335,7 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
|
||||
// Copy memory, using pinning
|
||||
while (dstSize > 0) {
|
||||
size_t tmpSize;
|
||||
// If it's the first iteration, then readjust the copy size
|
||||
// If it's the first iteration, then readjust the copy size
|
||||
// to include alignment
|
||||
if (first) {
|
||||
pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment);
|
||||
@@ -398,7 +398,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
|
||||
// Use host copy if memory has direct access or it's persistent
|
||||
if (setup_.disableWriteBufferRect_ ||
|
||||
(dstMemory.isHostMemDirectAccess() &&
|
||||
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
|
||||
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
|
||||
gpuMem(dstMemory).isPersistentDirectMap()) {
|
||||
return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire);
|
||||
} else {
|
||||
@@ -586,8 +586,8 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
|
||||
entire, rowPitch, slicePitch);
|
||||
} else {
|
||||
// Use PAL path for a transfer
|
||||
result = gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin,
|
||||
size, gpuMem(dstMemory));
|
||||
result =
|
||||
gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory));
|
||||
|
||||
// Check if a HostBlit transfer is required
|
||||
if (completeOperation_ && !result) {
|
||||
@@ -947,8 +947,8 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
|
||||
|
||||
void* param = kernel->parameters().values() + desc.offset_;
|
||||
assert((desc.type_ == T_POINTER || value != NULL ||
|
||||
(desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL)) &&
|
||||
"not a valid local mem arg");
|
||||
(desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL)) &&
|
||||
"not a valid local mem arg");
|
||||
|
||||
uint32_t uint32_value = 0;
|
||||
uint64_t uint64_value = 0;
|
||||
@@ -957,14 +957,15 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
|
||||
if (desc.type_ == T_POINTER && (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) {
|
||||
if ((value == NULL) || (static_cast<const cl_mem*>(value) == NULL)) {
|
||||
reinterpret_cast<Memory**>(kernel->parameters().values() +
|
||||
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr;
|
||||
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] =
|
||||
nullptr;
|
||||
} else {
|
||||
// convert cl_mem to amd::Memory*, return false if invalid.
|
||||
LP64_SWITCH(uint32_value, uint64_value) = static_cast<uintptr_t>((
|
||||
*static_cast<Memory* const*>(value))->virtualAddress());
|
||||
LP64_SWITCH(uint32_value, uint64_value) =
|
||||
static_cast<uintptr_t>((*static_cast<Memory* const*>(value))->virtualAddress());
|
||||
reinterpret_cast<Memory**>(kernel->parameters().values() +
|
||||
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] =
|
||||
*static_cast<Memory* const*>(value);
|
||||
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] =
|
||||
*static_cast<Memory* const*>(value);
|
||||
// Note: Special case for image SRD, which is 64 bit always
|
||||
if (LP64_SWITCH(true, false) &&
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject)) {
|
||||
@@ -1018,8 +1019,8 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory,
|
||||
bool releaseView = false;
|
||||
bool result = false;
|
||||
amd::Image::Format newFormat(gpuMem(dstMemory).desc().format_);
|
||||
bool swapLayer = (dstView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
|
||||
dev().settings().gfx10Plus_;
|
||||
bool swapLayer =
|
||||
(dstView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && dev().settings().gfx10Plus_;
|
||||
|
||||
// Find unsupported formats
|
||||
for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
|
||||
@@ -1078,10 +1079,10 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory,
|
||||
// Swap the Y and Z components, apparently gfx10 HW expects
|
||||
// layer in Z
|
||||
if (swapLayer) {
|
||||
globalWorkSize[2] = globalWorkSize[1];
|
||||
globalWorkSize[1] = 1;
|
||||
localWorkSize[2] = localWorkSize[1];
|
||||
localWorkSize[1] = 1;
|
||||
globalWorkSize[2] = globalWorkSize[1];
|
||||
globalWorkSize[1] = 1;
|
||||
localWorkSize[2] = localWorkSize[1];
|
||||
localWorkSize[1] = 1;
|
||||
}
|
||||
} else {
|
||||
globalWorkSize[0] = amd::alignUp(size[0], 8);
|
||||
@@ -1114,10 +1115,10 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory,
|
||||
cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0};
|
||||
|
||||
if (swapLayer) {
|
||||
dstOrg[2] = dstOrg[1];
|
||||
dstOrg[1] = 0;
|
||||
copySize[2] = copySize[1];
|
||||
copySize[1] = 1;
|
||||
dstOrg[2] = dstOrg[1];
|
||||
dstOrg[1] = 0;
|
||||
copySize[2] = copySize[1];
|
||||
copySize[1] = 1;
|
||||
}
|
||||
|
||||
setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg);
|
||||
@@ -1338,8 +1339,8 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory,
|
||||
bool releaseView = false;
|
||||
bool result = false;
|
||||
amd::Image::Format newFormat(gpuMem(srcMemory).desc().format_);
|
||||
bool swapLayer = (srcView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
|
||||
dev().settings().gfx10Plus_;
|
||||
bool swapLayer =
|
||||
(srcView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && dev().settings().gfx10Plus_;
|
||||
|
||||
// Find unsupported formats
|
||||
for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
|
||||
@@ -1398,10 +1399,10 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory,
|
||||
// Swap the Y and Z components, apparently gfx10 HW expects
|
||||
// layer in Z
|
||||
if (swapLayer) {
|
||||
globalWorkSize[2] = globalWorkSize[1];
|
||||
globalWorkSize[1] = 1;
|
||||
localWorkSize[2] = localWorkSize[1];
|
||||
localWorkSize[1] = 1;
|
||||
globalWorkSize[2] = globalWorkSize[1];
|
||||
globalWorkSize[1] = 1;
|
||||
localWorkSize[2] = localWorkSize[1];
|
||||
localWorkSize[1] = 1;
|
||||
}
|
||||
} else {
|
||||
globalWorkSize[0] = amd::alignUp(size[0], 8);
|
||||
@@ -1426,10 +1427,10 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory,
|
||||
cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1], (cl_int)srcOrigin[2], 0};
|
||||
cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0};
|
||||
if (swapLayer) {
|
||||
srcOrg[2] = srcOrg[1];
|
||||
srcOrg[1] = 0;
|
||||
copySize[2] = copySize[1];
|
||||
copySize[1] = 1;
|
||||
srcOrg[2] = srcOrg[1];
|
||||
srcOrg[1] = 0;
|
||||
copySize[2] = copySize[1];
|
||||
copySize[1] = 1;
|
||||
}
|
||||
setArgument(kernels_[blitType], 4, sizeof(srcOrg), srcOrg);
|
||||
uint32_t memFmtSize = gpuMem(srcMemory).elementSize();
|
||||
@@ -1570,7 +1571,7 @@ bool KernelBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dst
|
||||
// Program source origin
|
||||
cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1], (cl_int)srcOrigin[2], 0};
|
||||
if ((gpuMem(srcMemory).desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
|
||||
dev().settings().gfx10Plus_) {
|
||||
dev().settings().gfx10Plus_) {
|
||||
srcOrg[3] = 1;
|
||||
}
|
||||
setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg);
|
||||
@@ -1578,7 +1579,7 @@ bool KernelBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dst
|
||||
// Program destination origin
|
||||
cl_int dstOrg[4] = {(cl_int)dstOrigin[0], (cl_int)dstOrigin[1], (cl_int)dstOrigin[2], 0};
|
||||
if ((gpuMem(dstMemory).desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
|
||||
dev().settings().gfx10Plus_) {
|
||||
dev().settings().gfx10Plus_) {
|
||||
dstOrg[3] = 1;
|
||||
}
|
||||
setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg);
|
||||
@@ -1700,16 +1701,15 @@ bool KernelBlitManager::writeImage(const void* srcHost, device::Memory& dstMemor
|
||||
amdMemory = pinHostMemory(srcHost, pinSize, partial);
|
||||
if (amdMemory == nullptr) {
|
||||
// Force SW copy
|
||||
result = HostBlitManager::writeImage(srcHost, dstMemory,
|
||||
origin, size, rowPitch, slicePitch, entire);
|
||||
result = HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch,
|
||||
entire);
|
||||
synchronize();
|
||||
return result;
|
||||
}
|
||||
// Get device memory for this virtual device
|
||||
srcMemory = dev().getGpuMemory(amdMemory);
|
||||
pinned = true;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
srcMemory = &gpu().xferWrite().Acquire(pinSize);
|
||||
srcMemory->hostWrite(&gpu(), srcHost, 0, pinSize, Resource::NoWait);
|
||||
pinned = false;
|
||||
@@ -1951,7 +1951,7 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
|
||||
// Use host copy if memory has direct access or it's persistent
|
||||
if (setup_.disableWriteBuffer_ ||
|
||||
(gpuMem(dstMemory).isHostMemDirectAccess() &&
|
||||
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
|
||||
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
|
||||
(gpuMem(dstMemory).memoryType() == Resource::Persistent)) {
|
||||
result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
|
||||
synchronize();
|
||||
@@ -2002,7 +2002,7 @@ bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dst
|
||||
// Use host copy if memory has direct access or it's persistent
|
||||
if (setup_.disableWriteBufferRect_ ||
|
||||
(gpuMem(dstMemory).isHostMemDirectAccess() &&
|
||||
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
|
||||
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
|
||||
gpuMem(dstMemory).isPersistentDirectMap()) {
|
||||
result = HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire);
|
||||
synchronize();
|
||||
@@ -2206,8 +2206,8 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
|
||||
size_t localWorkSize[3];
|
||||
Memory* memView = &gpuMem(memory);
|
||||
amd::Image::Format newFormat(gpuMem(memory).owner()->asImage()->getImageFormat());
|
||||
bool swapLayer = (memView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
|
||||
dev().settings().gfx10Plus_;
|
||||
bool swapLayer =
|
||||
(memView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && dev().settings().gfx10Plus_;
|
||||
|
||||
// Program the kernels workload depending on the fill dimensions
|
||||
fillType = FillImage;
|
||||
@@ -2274,10 +2274,10 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
|
||||
// Swap the Y and Z components, apparently gfx10 HW expects
|
||||
// layer in Z
|
||||
if (swapLayer) {
|
||||
globalWorkSize[2] = globalWorkSize[1];
|
||||
globalWorkSize[1] = 1;
|
||||
localWorkSize[2] = localWorkSize[1];
|
||||
localWorkSize[1] = 1;
|
||||
globalWorkSize[2] = globalWorkSize[1];
|
||||
globalWorkSize[1] = 1;
|
||||
localWorkSize[2] = localWorkSize[1];
|
||||
localWorkSize[1] = 1;
|
||||
}
|
||||
} else {
|
||||
globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 8);
|
||||
@@ -2297,10 +2297,10 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
|
||||
cl_int fillOrigin[4] = {(cl_int)origin[0], (cl_int)origin[1], (cl_int)origin[2], 0};
|
||||
cl_int fillSize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0};
|
||||
if (swapLayer) {
|
||||
fillOrigin[2] = fillOrigin[1];
|
||||
fillOrigin[1] = 0;
|
||||
fillSize[2] = fillSize[1];
|
||||
fillSize[1] = 1;
|
||||
fillOrigin[2] = fillOrigin[1];
|
||||
fillOrigin[1] = 0;
|
||||
fillSize[2] = fillSize[1];
|
||||
fillSize[1] = 1;
|
||||
}
|
||||
setArgument(kernels_[fillType], 4, sizeof(fillOrigin), fillOrigin);
|
||||
setArgument(kernels_[fillType], 5, sizeof(fillSize), fillSize);
|
||||
|
||||
@@ -27,7 +27,7 @@ class DmaBlitManager : public device::HostBlitManager {
|
||||
//! Constructor
|
||||
DmaBlitManager(VirtualGPU& gpu, //!< Virtual GPU to be used for blits
|
||||
Setup setup = Setup() //!< Specifies HW accelerated blits
|
||||
);
|
||||
);
|
||||
|
||||
//! Destructor
|
||||
virtual ~DmaBlitManager() {}
|
||||
@@ -211,7 +211,7 @@ class KernelBlitManager : public DmaBlitManager {
|
||||
//! Constructor
|
||||
KernelBlitManager(VirtualGPU& gpu, //!< Virtual GPU to be used for blits
|
||||
Setup setup = Setup() //!< Specifies HW accelerated blits
|
||||
);
|
||||
);
|
||||
|
||||
//! Destructor
|
||||
virtual ~KernelBlitManager();
|
||||
@@ -382,7 +382,7 @@ class KernelBlitManager : public DmaBlitManager {
|
||||
|
||||
//! Creates a program for all blit operations
|
||||
bool createProgram(Device& device //!< Device object
|
||||
);
|
||||
);
|
||||
|
||||
//! Creates a view memory object
|
||||
Memory* createView(const Memory& parent, //!< Parent memory object
|
||||
@@ -409,4 +409,5 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = {
|
||||
"fillImage", "scheduler",
|
||||
};
|
||||
|
||||
/*@}*/} // namespace pal
|
||||
/*@}*/ // namespace pal
|
||||
} // namespace pal
|
||||
|
||||
@@ -11,12 +11,12 @@ namespace pal {
|
||||
|
||||
// ================================================================================================
|
||||
ManagedBuffer::ManagedBuffer(VirtualGPU& gpu, uint32_t size)
|
||||
: gpu_(gpu)
|
||||
, pool_(MaxNumberOfBuffers)
|
||||
, activeBuffer_(0)
|
||||
, size_(size)
|
||||
, wrtOffset_(0)
|
||||
, wrtAddress_(nullptr) {}
|
||||
: gpu_(gpu),
|
||||
pool_(MaxNumberOfBuffers),
|
||||
activeBuffer_(0),
|
||||
size_(size),
|
||||
wrtOffset_(0),
|
||||
wrtAddress_(nullptr) {}
|
||||
|
||||
// ================================================================================================
|
||||
void ManagedBuffer::release() {
|
||||
@@ -40,8 +40,8 @@ bool ManagedBuffer::create(Resource::MemoryType type) {
|
||||
pool_[i].buf->memRef()->gpu_ = &gpu_;
|
||||
void* wrtAddress = pool_[i].buf->map(&gpu_);
|
||||
if (wrtAddress == nullptr) {
|
||||
LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_);
|
||||
return false;
|
||||
LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_);
|
||||
return false;
|
||||
}
|
||||
// Make sure OCL touches every buffer in the queue to avoid delays on the first submit
|
||||
uint dummy = 0;
|
||||
@@ -94,15 +94,10 @@ void ManagedBuffer::pinGpuEvent() {
|
||||
|
||||
// ================================================================================================
|
||||
ConstantBuffer::ConstantBuffer(ManagedBuffer& mbuf, uint32_t size)
|
||||
: mbuf_(mbuf)
|
||||
, sys_mem_copy_(nullptr)
|
||||
, size_(size)
|
||||
{}
|
||||
: mbuf_(mbuf), sys_mem_copy_(nullptr), size_(size) {}
|
||||
|
||||
// ================================================================================================
|
||||
ConstantBuffer::~ConstantBuffer() {
|
||||
amd::AlignedMemory::deallocate(sys_mem_copy_);
|
||||
}
|
||||
ConstantBuffer::~ConstantBuffer() { amd::AlignedMemory::deallocate(sys_mem_copy_); }
|
||||
|
||||
// ================================================================================================
|
||||
bool ConstantBuffer::Create() {
|
||||
@@ -118,8 +113,8 @@ bool ConstantBuffer::Create() {
|
||||
|
||||
// ================================================================================================
|
||||
uint64_t ConstantBuffer::UploadDataToHw(uint32_t size) const {
|
||||
uint64_t vm_address;
|
||||
address cpu_address = mbuf_.reserve(size, &vm_address);
|
||||
uint64_t vm_address;
|
||||
address cpu_address = mbuf_.reserve(size, &vm_address);
|
||||
// Update memory with new CB data
|
||||
memcpy(cpu_address, sys_mem_copy_, size);
|
||||
return vm_address;
|
||||
@@ -127,8 +122,8 @@ uint64_t ConstantBuffer::UploadDataToHw(uint32_t size) const {
|
||||
|
||||
// ================================================================================================
|
||||
uint64_t ConstantBuffer::UploadDataToHw(const void* sysmem, uint32_t size) const {
|
||||
uint64_t vm_address;
|
||||
address cpu_address = mbuf_.reserve(size, &vm_address);
|
||||
uint64_t vm_address;
|
||||
address cpu_address = mbuf_.reserve(size, &vm_address);
|
||||
// Update memory with new CB data
|
||||
memcpy(cpu_address, sysmem, size);
|
||||
return vm_address;
|
||||
@@ -136,9 +131,7 @@ uint64_t ConstantBuffer::UploadDataToHw(const void* sysmem, uint32_t size) const
|
||||
|
||||
// ================================================================================================
|
||||
XferBuffer::XferBuffer(const Device& device, ManagedBuffer& mbuf, uint32_t size)
|
||||
: buffer_view_(device, size)
|
||||
, mbuf_(mbuf)
|
||||
, size_(size) {
|
||||
: buffer_view_(device, size), mbuf_(mbuf), size_(size) {
|
||||
// Create a view for access
|
||||
Resource::ViewParams params = {};
|
||||
params.gpu_ = &mbuf_.gpu();
|
||||
@@ -151,9 +144,9 @@ XferBuffer::XferBuffer(const Device& device, ManagedBuffer& mbuf, uint32_t size)
|
||||
|
||||
// ================================================================================================
|
||||
Memory& XferBuffer::Acquire(uint32_t size) {
|
||||
uint64_t vm_address;
|
||||
uint64_t vm_address;
|
||||
// Reserve space in the managed buffer
|
||||
address cpu_address = mbuf_.reserve(size, &vm_address);
|
||||
address cpu_address = mbuf_.reserve(size, &vm_address);
|
||||
// Update a view for access
|
||||
buffer_view_.updateView(mbuf_.activeMemory(), vm_address - mbuf_.vmAddress(), size);
|
||||
return buffer_view_;
|
||||
|
||||
@@ -12,9 +12,9 @@ namespace pal {
|
||||
class ManagedBuffer : public amd::EmbeddedObject {
|
||||
public:
|
||||
//! Constructor for the ManagedBuffer class
|
||||
ManagedBuffer(VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
uint32_t size //!< size of the managed buffers in bytes
|
||||
);
|
||||
ManagedBuffer(VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
uint32_t size //!< size of the managed buffers in bytes
|
||||
);
|
||||
~ManagedBuffer() {}
|
||||
|
||||
//! Creates the managed buffers
|
||||
@@ -50,8 +50,8 @@ class ManagedBuffer : public amd::EmbeddedObject {
|
||||
|
||||
private:
|
||||
struct TimeStampedBuffer {
|
||||
Memory* buf;
|
||||
GpuEvent events[AllEngines];
|
||||
Memory* buf;
|
||||
GpuEvent events[AllEngines];
|
||||
};
|
||||
|
||||
//! The maximum number of the managed buffers
|
||||
@@ -63,21 +63,21 @@ class ManagedBuffer : public amd::EmbeddedObject {
|
||||
//! Disable operator=
|
||||
ManagedBuffer& operator=(const ManagedBuffer&) = delete;
|
||||
|
||||
VirtualGPU& gpu_; //!< Virtual GPU object
|
||||
std::vector<TimeStampedBuffer> pool_; //!< Buffers for management
|
||||
uint32_t activeBuffer_; //!< Current active buffer
|
||||
uint32_t size_; //!< Constant buffer size
|
||||
uint32_t wrtOffset_; //!< Current write offset
|
||||
address wrtAddress_; //!< Write address in CB
|
||||
VirtualGPU& gpu_; //!< Virtual GPU object
|
||||
std::vector<TimeStampedBuffer> pool_; //!< Buffers for management
|
||||
uint32_t activeBuffer_; //!< Current active buffer
|
||||
uint32_t size_; //!< Constant buffer size
|
||||
uint32_t wrtOffset_; //!< Current write offset
|
||||
address wrtAddress_; //!< Write address in CB
|
||||
};
|
||||
|
||||
//! Constant buffer
|
||||
class ConstantBuffer : public amd::HeapObject {
|
||||
public:
|
||||
public:
|
||||
//! Constructor for the ConstBuffer class
|
||||
ConstantBuffer(ManagedBuffer& mbuf, //!< Managed buffer
|
||||
uint32_t size //!< Max size of the constant buffer
|
||||
);
|
||||
uint32_t size //!< Max size of the constant buffer
|
||||
);
|
||||
|
||||
//! Destructor for the ConstBuffer class
|
||||
~ConstantBuffer();
|
||||
@@ -86,18 +86,18 @@ public:
|
||||
bool Create();
|
||||
|
||||
/*! \brief Uploads current constant buffer data from sysMemCopy_ to HW
|
||||
*
|
||||
* \return GPU address for the uploaded data
|
||||
*/
|
||||
*
|
||||
* \return GPU address for the uploaded data
|
||||
*/
|
||||
uint64_t UploadDataToHw(uint32_t size //!< real data size for upload
|
||||
) const;
|
||||
|
||||
/*! \brief Uploads constant buffer data from the provided system memory pointer to HW
|
||||
*
|
||||
* \return GPU address for the uploaded data
|
||||
*/
|
||||
*
|
||||
* \return GPU address for the uploaded data
|
||||
*/
|
||||
uint64_t UploadDataToHw(const void* sysmem, //!< Pointer to the data for upload
|
||||
uint32_t size //!< Real data size for upload
|
||||
uint32_t size //!< Real data size for upload
|
||||
) const;
|
||||
|
||||
//! Returns a pointer to the system memory copy for CB
|
||||
@@ -106,52 +106,55 @@ public:
|
||||
//! Returns active GPU buffer
|
||||
Memory* ActiveMemory() const { return mbuf_.activeMemory(); }
|
||||
|
||||
private:
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
ConstantBuffer(const ConstantBuffer&) = delete;
|
||||
|
||||
//! Disable operator=
|
||||
ConstantBuffer& operator=(const ConstantBuffer&) = delete;
|
||||
|
||||
ManagedBuffer& mbuf_; //!< Managed buffer on GPU
|
||||
address sys_mem_copy_; //!< System memory copy
|
||||
uint32_t size_; //!< Constant buffer size
|
||||
ManagedBuffer& mbuf_; //!< Managed buffer on GPU
|
||||
address sys_mem_copy_; //!< System memory copy
|
||||
uint32_t size_; //!< Constant buffer size
|
||||
};
|
||||
|
||||
//! Staging buffer
|
||||
class XferBuffer : public amd::EmbeddedObject {
|
||||
public:
|
||||
public:
|
||||
//! Constructor for the XferBuffer class
|
||||
XferBuffer(const Device& device, //!< Active GPU device
|
||||
XferBuffer(const Device& device, //!< Active GPU device
|
||||
ManagedBuffer& mbuf, //!< Managed buffer
|
||||
uint32_t size //!< Maximum size of the transfer buffer
|
||||
uint32_t size //!< Maximum size of the transfer buffer
|
||||
);
|
||||
|
||||
//! Destructor for the XferBuffer class
|
||||
~XferBuffer() {}
|
||||
|
||||
/*! \brief Acquires free memory from the managed buffer
|
||||
*
|
||||
* \return GPU memory object associated with free memory
|
||||
*/
|
||||
Memory& Acquire(uint32_t size //!< data size for transfers
|
||||
);
|
||||
*
|
||||
* \return GPU memory object associated with free memory
|
||||
*/
|
||||
Memory& Acquire(uint32_t size //!< data size for transfers
|
||||
);
|
||||
|
||||
//! Releases memory object used in the staging transfer
|
||||
void Release(Memory& mem //!< Memory object for release
|
||||
) { buffer_view_.updateView(nullptr, 0, 0); }
|
||||
) {
|
||||
buffer_view_.updateView(nullptr, 0, 0);
|
||||
}
|
||||
|
||||
size_t MaxSize() const { return static_cast<size_t>(size_); }
|
||||
|
||||
private:
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
XferBuffer(const XferBuffer&) = delete;
|
||||
|
||||
//! Disable operator=
|
||||
XferBuffer& operator=(const XferBuffer&) = delete;
|
||||
|
||||
Memory buffer_view_; //!< Buffer view returned in the acquire
|
||||
ManagedBuffer& mbuf_; //!< Managed buffer on GPU
|
||||
uint32_t size_; //!< Max staging buffer size
|
||||
Memory buffer_view_; //!< Buffer view returned in the acquire
|
||||
ManagedBuffer& mbuf_; //!< Managed buffer on GPU
|
||||
uint32_t size_; //!< Max staging buffer size
|
||||
};
|
||||
/*@}*/} // namespace pal
|
||||
/*@}*/ // namespace pal
|
||||
} // namespace pal
|
||||
|
||||
@@ -676,12 +676,12 @@ void PerfCounter::convertInfo() {
|
||||
break;
|
||||
case Pal::GfxIpLevel::GfxIp10:
|
||||
case Pal::GfxIpLevel::GfxIp10_1:
|
||||
if (info_.blockIndex_ < gfx10BlockIdPal.size()) {
|
||||
auto p = gfx10BlockIdPal[info_.blockIndex_];
|
||||
info_.blockIndex_ = std::get<0>(p);
|
||||
info_.counterIndex_ = std::get<1>(p);
|
||||
}
|
||||
break;
|
||||
if (info_.blockIndex_ < gfx10BlockIdPal.size()) {
|
||||
auto p = gfx10BlockIdPal[info_.blockIndex_];
|
||||
info_.blockIndex_ = std::get<0>(p);
|
||||
info_.counterIndex_ = std::get<1>(p);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
Unimplemented();
|
||||
break;
|
||||
|
||||
@@ -84,8 +84,7 @@ class PerfCounter : public device::PerfCounter {
|
||||
cl_uint blockIndex, //!< HW block index
|
||||
cl_uint counterIndex, //!< Counter index within the block
|
||||
cl_uint eventIndex) //!< Event index for profiling
|
||||
: gpuDevice_(device),
|
||||
palRef_(palRef) {
|
||||
: gpuDevice_(device), palRef_(palRef) {
|
||||
info_.blockIndex_ = blockIndex;
|
||||
info_.counterIndex_ = counterIndex;
|
||||
info_.eventIndex_ = eventIndex;
|
||||
|
||||
@@ -98,10 +98,10 @@ struct HwDebugWaveAddr {
|
||||
};
|
||||
|
||||
/*! \brief Kernel code information
|
||||
*
|
||||
* This structure contains the pointer of mapped kernel code for host access
|
||||
* and its size (in bytes)
|
||||
*/
|
||||
*
|
||||
* This structure contains the pointer of mapped kernel code for host access
|
||||
* and its size (in bytes)
|
||||
*/
|
||||
struct AqlCodeInfo {
|
||||
amd_kernel_code_t* aqlCode_; //! pointer of AQL code to allow host access
|
||||
uint32_t aqlCodeSize_; //! size of AQL code
|
||||
|
||||
@@ -143,7 +143,7 @@ void GpuDebugManager::unregisterDebugger() {
|
||||
|
||||
void GpuDebugManager::flushCache(uint32_t mask) {
|
||||
HwDbgGpuCacheMask cacheMask(mask);
|
||||
//device()->xferQueue()->flushCuCaches(cacheMask);
|
||||
// device()->xferQueue()->flushCuCaches(cacheMask);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -47,9 +47,9 @@ struct GpuEvent {
|
||||
static constexpr uint32_t InvalidID = ((1 << 30) - 1);
|
||||
|
||||
struct {
|
||||
uint32_t id_ : 30; ///< Actual event id
|
||||
uint32_t modified_ : 1; ///< Resource associated with the event was modified
|
||||
uint32_t engineId_ : 1; ///< Type of the id
|
||||
uint32_t id_ : 30; ///< Actual event id
|
||||
uint32_t modified_ : 1; ///< Resource associated with the event was modified
|
||||
uint32_t engineId_ : 1; ///< Type of the id
|
||||
};
|
||||
//! GPU event default constructor
|
||||
GpuEvent() : id_(InvalidID), modified_(false), engineId_(MainEngine) {}
|
||||
@@ -63,8 +63,11 @@ struct GpuEvent {
|
||||
void invalidate() { id_ = InvalidID; }
|
||||
|
||||
// Overwrite default assign operator to preserve modified_ field
|
||||
GpuEvent& operator=(const GpuEvent& evt)
|
||||
{ id_ = evt.id_; engineId_ = evt.engineId_; return *this; }
|
||||
GpuEvent& operator=(const GpuEvent& evt) {
|
||||
id_ = evt.id_;
|
||||
engineId_ = evt.engineId_;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
/*! \addtogroup PAL
|
||||
@@ -113,87 +116,110 @@ const static uint HsaSamplerObjectAlignment = 16;
|
||||
const static uint DeviceQueueMaskSize = 32;
|
||||
|
||||
struct AMDDeviceInfo {
|
||||
const char* targetName_; //!< Target name
|
||||
const char* machineTarget_; //!< Machine target
|
||||
const char* machineTargetLC_;//!< Machine target for LC
|
||||
uint simdPerCU_; //!< Number of SIMDs per CU
|
||||
uint simdWidth_; //!< Number of workitems processed per SIMD
|
||||
uint simdInstructionWidth_; //!< Number of instructions processed per SIMD
|
||||
uint memChannelBankWidth_; //!< Memory channel bank width
|
||||
uint localMemSizePerCU_; //!< Local memory size per CU
|
||||
uint localMemBanks_; //!< Number of banks of local memory
|
||||
uint gfxipVersionLC_; //!< The core engine GFXIP version for LC
|
||||
uint gfxipVersion_; //!< The core engine GFXIP version
|
||||
bool xnackEnabled_; //!< Enable XNACK feature
|
||||
const char* targetName_; //!< Target name
|
||||
const char* machineTarget_; //!< Machine target
|
||||
const char* machineTargetLC_; //!< Machine target for LC
|
||||
uint simdPerCU_; //!< Number of SIMDs per CU
|
||||
uint simdWidth_; //!< Number of workitems processed per SIMD
|
||||
uint simdInstructionWidth_; //!< Number of instructions processed per SIMD
|
||||
uint memChannelBankWidth_; //!< Memory channel bank width
|
||||
uint localMemSizePerCU_; //!< Local memory size per CU
|
||||
uint localMemBanks_; //!< Number of banks of local memory
|
||||
uint gfxipVersionLC_; //!< The core engine GFXIP version for LC
|
||||
uint gfxipVersion_; //!< The core engine GFXIP version
|
||||
bool xnackEnabled_; //!< Enable XNACK feature
|
||||
};
|
||||
|
||||
static const AMDDeviceInfo DeviceInfo[] = {
|
||||
/* Unknown */ {"", "unknown", "", 4, 16, 1, 256, 64 * Ki, 32, 0, 0, false},
|
||||
/* Tahiti */ {"", "tahiti", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
|
||||
/* Pitcairn */ {"", "pitcairn", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
|
||||
/* Capeverde */ {"", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false},
|
||||
/* Oland */ {"", "oland", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
|
||||
/* Hainan */ {"", "hainan", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
|
||||
/* Unknown */ {"", "unknown", "", 4, 16, 1, 256, 64 * Ki, 32, 0, 0, false},
|
||||
/* Tahiti */ {"", "tahiti", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
|
||||
/* Pitcairn */ {"", "pitcairn", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
|
||||
/* Capeverde */ {"", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false},
|
||||
/* Oland */ {"", "oland", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
|
||||
/* Hainan */ {"", "hainan", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
|
||||
|
||||
/* Bonaire */ {"Bonaire", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false},
|
||||
/* Hawaii */ {"Hawaii", "hawaii", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
|
||||
/* Hawaii */ {"", "grenada", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
|
||||
/* Hawaii */ {"", "maui", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
|
||||
/* Bonaire */ {"Bonaire", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false},
|
||||
/* Hawaii */ {"Hawaii", "hawaii", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
|
||||
/* Hawaii */ {"", "grenada", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
|
||||
/* Hawaii */ {"", "maui", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
|
||||
|
||||
/* Kalindi */ {"Kalindi", "kalindi", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false},
|
||||
/* Godavari */ {"Mullins", "mullins", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false},
|
||||
/* Spectre */ {"Spectre", "spectre", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
|
||||
/* Spooky */ {"Spooky", "spooky", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
|
||||
/* Kalindi */ {"Kalindi", "kalindi", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false},
|
||||
/* Godavari */ {"Mullins", "mullins", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false},
|
||||
/* Spectre */ {"Spectre", "spectre", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
|
||||
/* Spooky */ {"Spooky", "spooky", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
|
||||
|
||||
/* Carrizo */ {"Carrizo", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801,false},
|
||||
/* Bristol */ {"Bristol Ridge", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801, false},
|
||||
/* Stoney */ {"Stoney", "stoney", "", 4, 16, 1, 256, 64 * Ki, 32, 810, 810, false},
|
||||
/* Carrizo */ {"Carrizo", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801, false},
|
||||
/* Bristol */ {"Bristol Ridge", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801, false},
|
||||
/* Stoney */ {"Stoney", "stoney", "", 4, 16, 1, 256, 64 * Ki, 32, 810, 810, false},
|
||||
|
||||
/* Iceland */ {"Iceland", "iceland", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false},
|
||||
/* Tonga */ {"Tonga", "tonga", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false},
|
||||
/* Fiji */ {"Fiji", "fiji", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
|
||||
/* Ellesmere */ {"Ellesmere", "ellesmere", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
|
||||
/* Baffin */ {"Baffin", "baffin", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
|
||||
/* Lexa */ {"gfx804", "gfx804", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
|
||||
/* Iceland */ {"Iceland", "iceland", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false},
|
||||
/* Tonga */ {"Tonga", "tonga", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false},
|
||||
/* Fiji */ {"Fiji", "fiji", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
|
||||
/* Ellesmere */
|
||||
{"Ellesmere", "ellesmere", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
|
||||
/* Baffin */ {"Baffin", "baffin", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
|
||||
/* Lexa */ {"gfx804", "gfx804", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
|
||||
};
|
||||
|
||||
// Ordering as per AsicRevision# in //depot/stg/pal/inc/core/palDevice.h and
|
||||
// http://confluence.amd.com/pages/viewpage.action?spaceKey=ASLC&title=AMDGPU+Target+Names
|
||||
static const AMDDeviceInfo Gfx9PlusSubDeviceInfo[] = {
|
||||
/* Vega10 */{"gfx900", "gfx900", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 900, 900, false},
|
||||
/* Vega10 XNACK */{ LIGHTNING_SWITCH("gfx900","gfx901"), "gfx901", "gfx900",
|
||||
4, 16, 1, 256, 64 * Ki, 32, 900, 901, true},
|
||||
/* Vega12 */{"gfx904", "gfx904", "gfx904", 4, 16, 1, 256, 64 * Ki, 32, 904, 904, false},
|
||||
/* Vega12 XNACK */{ LIGHTNING_SWITCH("gfx904","gfx905"), "gfx905", "gfx904",
|
||||
4, 16, 1, 256, 64 * Ki, 32, 904, 905, true},
|
||||
/* Vega20 */{"gfx906", "gfx906", "gfx906", 4, 16, 1, 256, 64 * Ki, 32, 906, 906, false},
|
||||
/* Vega20 XNACK */{ LIGHTNING_SWITCH("gfx906","gfx907"), "gfx907", "gfx906",
|
||||
4, 16, 1, 256, 64 * Ki, 32, 906, 907, true},
|
||||
/* Raven */{"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
|
||||
/* Raven XNACK */{ LIGHTNING_SWITCH("gfx902","gfx903"), "gfx903", "gfx902",
|
||||
4, 16, 1, 256, 64 * Ki, 32, 902, 903, true},
|
||||
/* Raven2 */{"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
|
||||
/* Raven2 XNACK */{ LIGHTNING_SWITCH("gfx902","gfx903"), "gfx903", "gfx902",
|
||||
4, 16, 1, 256, 64 * Ki, 32, 902, 903, true},
|
||||
/* Renoir */{"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
|
||||
/* Renoir XNACK */{ LIGHTNING_SWITCH("gfx902","gfx903"), "gfx903", "gfx902",
|
||||
4, 16, 1, 256, 64 * Ki, 32, 902, 903, true},
|
||||
/* Navi10_A0 */{ "gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, false },
|
||||
/* Navi10_A0 XNACK */{ "gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, true },
|
||||
/* Navi10 */{"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, false},
|
||||
/* Navi10 XNACK */{"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, true},
|
||||
/* Navi10Lite */{"gfx1000", "gfx1000","gfx1000", 2, 32, 1, 256, 64 * Ki, 32, 1000, 1000, false},
|
||||
/* Navi10Lite XNACK */{"gfx1000", "gfx1000", "gfx1000", 2, 32, 1, 256, 64 * Ki, 32, 1000, 1000, true},
|
||||
/* Navi12 */{ "gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, false },
|
||||
/* Navi12 XNACK */{ "gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, true },
|
||||
/* Navi12Lite */{ "gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, false },
|
||||
/* Navi12Lite XNACK */{ "gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, true },
|
||||
/* Navi14 */{ "gfx1012", "gfx1012", "gfx1012", 2, 32, 1, 256, 64 * Ki, 32, 1012, 1012, false },
|
||||
/* Navi14 XNACK */{ "gfx1012", "gfx1012", "gfx1012", 2, 32, 1, 256, 64 * Ki, 32, 1012, 1012, true },
|
||||
/* UnknownDevice3 */{ "gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, false },
|
||||
/* UnknownDevice3 XNACK */{ "gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, true },
|
||||
/* UnknownDevice2 */{ "gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, false },
|
||||
/* UnknownDevice2 XNACK */{ "gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, true },
|
||||
/* Vega10 */ {"gfx900", "gfx900", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 900, 900, false},
|
||||
/* Vega10 XNACK */
|
||||
{LIGHTNING_SWITCH("gfx900", "gfx901"), "gfx901", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 900, 901,
|
||||
true},
|
||||
/* Vega12 */ {"gfx904", "gfx904", "gfx904", 4, 16, 1, 256, 64 * Ki, 32, 904, 904, false},
|
||||
/* Vega12 XNACK */
|
||||
{LIGHTNING_SWITCH("gfx904", "gfx905"), "gfx905", "gfx904", 4, 16, 1, 256, 64 * Ki, 32, 904, 905,
|
||||
true},
|
||||
/* Vega20 */ {"gfx906", "gfx906", "gfx906", 4, 16, 1, 256, 64 * Ki, 32, 906, 906, false},
|
||||
/* Vega20 XNACK */
|
||||
{LIGHTNING_SWITCH("gfx906", "gfx907"), "gfx907", "gfx906", 4, 16, 1, 256, 64 * Ki, 32, 906, 907,
|
||||
true},
|
||||
/* Raven */ {"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
|
||||
/* Raven XNACK */
|
||||
{LIGHTNING_SWITCH("gfx902", "gfx903"), "gfx903", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 903,
|
||||
true},
|
||||
/* Raven2 */ {"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
|
||||
/* Raven2 XNACK */
|
||||
{LIGHTNING_SWITCH("gfx902", "gfx903"), "gfx903", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 903,
|
||||
true},
|
||||
/* Renoir */ {"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
|
||||
/* Renoir XNACK */
|
||||
{LIGHTNING_SWITCH("gfx902", "gfx903"), "gfx903", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 903,
|
||||
true},
|
||||
/* Navi10_A0 */
|
||||
{"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, false},
|
||||
/* Navi10_A0 XNACK */
|
||||
{"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, true},
|
||||
/* Navi10 */
|
||||
{"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, false},
|
||||
/* Navi10 XNACK */
|
||||
{"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, true},
|
||||
/* Navi10Lite */
|
||||
{"gfx1000", "gfx1000", "gfx1000", 2, 32, 1, 256, 64 * Ki, 32, 1000, 1000, false},
|
||||
/* Navi10Lite XNACK */
|
||||
{"gfx1000", "gfx1000", "gfx1000", 2, 32, 1, 256, 64 * Ki, 32, 1000, 1000, true},
|
||||
/* Navi12 */
|
||||
{"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, false},
|
||||
/* Navi12 XNACK */
|
||||
{"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, true},
|
||||
/* Navi12Lite */
|
||||
{"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, false},
|
||||
/* Navi12Lite XNACK */
|
||||
{"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, true},
|
||||
/* Navi14 */
|
||||
{"gfx1012", "gfx1012", "gfx1012", 2, 32, 1, 256, 64 * Ki, 32, 1012, 1012, false},
|
||||
/* Navi14 XNACK */
|
||||
{"gfx1012", "gfx1012", "gfx1012", 2, 32, 1, 256, 64 * Ki, 32, 1012, 1012, true},
|
||||
/* UnknownDevice3 */
|
||||
{"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, false},
|
||||
/* UnknownDevice3 XNACK */
|
||||
{"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, true},
|
||||
/* UnknownDevice2 */
|
||||
{"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, false},
|
||||
/* UnknownDevice2 XNACK */
|
||||
{"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, true},
|
||||
|
||||
};
|
||||
|
||||
|
||||
@@ -53,15 +53,14 @@ void PalDeviceUnload() { pal::Device::tearDown(); }
|
||||
|
||||
namespace pal {
|
||||
|
||||
Util::GenericAllocator NullDevice::allocator_;
|
||||
Util::GenericAllocator NullDevice::allocator_;
|
||||
char* Device::platformObj_;
|
||||
Pal::IPlatform* Device::platform_;
|
||||
Pal::IPlatform* Device::platform_;
|
||||
|
||||
NullDevice::Compiler* NullDevice::compiler_;
|
||||
AppProfile Device::appProfile_;
|
||||
|
||||
NullDevice::NullDevice()
|
||||
: amd::Device(), ipLevel_(Pal::GfxIpLevel::None), hwInfo_(nullptr) {}
|
||||
NullDevice::NullDevice() : amd::Device(), ipLevel_(Pal::GfxIpLevel::None), hwInfo_(nullptr) {}
|
||||
|
||||
bool NullDevice::init() {
|
||||
std::vector<Device*> devices;
|
||||
@@ -89,8 +88,8 @@ bool NullDevice::init() {
|
||||
driverVersion = static_cast<amd::Device*>(devices[i])->info().driverVersion_;
|
||||
if (driverVersion.find("PAL") != std::string::npos) {
|
||||
if (static_cast<NullDevice*>(devices[i])->asicRevision() == revision) {
|
||||
foundActive = true;
|
||||
break;
|
||||
foundActive = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -109,132 +108,130 @@ bool NullDevice::init() {
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // defined(WITH_COMPILER_LIB)
|
||||
#endif // defined(WITH_COMPILER_LIB)
|
||||
|
||||
// Loop through all supported devices and create each of them
|
||||
for (uint id = 0;
|
||||
id < sizeof(Gfx9PlusSubDeviceInfo)/sizeof(AMDDeviceInfo); ++id) {
|
||||
bool foundActive = false;
|
||||
bool foundDuplicate = false;
|
||||
uint gfxipVersion = IS_LIGHTNING ? pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_ :
|
||||
pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_;
|
||||
for (uint id = 0; id < sizeof(Gfx9PlusSubDeviceInfo) / sizeof(AMDDeviceInfo); ++id) {
|
||||
bool foundActive = false;
|
||||
bool foundDuplicate = false;
|
||||
uint gfxipVersion = IS_LIGHTNING ? pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_
|
||||
: pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_;
|
||||
|
||||
if (pal::Gfx9PlusSubDeviceInfo[id].targetName_[0] == '\0') {
|
||||
continue;
|
||||
}
|
||||
if (pal::Gfx9PlusSubDeviceInfo[id].targetName_[0] == '\0') {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Loop through all active PAL devices and see if we match one
|
||||
for (uint i = 0; i < devices.size(); ++i) {
|
||||
driverVersion = static_cast<amd::Device*>(devices[i])->info().driverVersion_;
|
||||
if (driverVersion.find("PAL") != std::string::npos) {
|
||||
gfxipVersion = devices[i]->settings().useLightning_ ?
|
||||
pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_ :
|
||||
pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_;
|
||||
uint gfxIpCurrent = devices[i]->settings().useLightning_ ?
|
||||
static_cast<NullDevice*>(devices[i])->hwInfo()->gfxipVersionLC_ :
|
||||
static_cast<NullDevice*>(devices[i])->hwInfo()->gfxipVersion_;
|
||||
if (gfxIpCurrent == gfxipVersion) {
|
||||
foundActive = true;
|
||||
break;
|
||||
}
|
||||
// Loop through all active PAL devices and see if we match one
|
||||
for (uint i = 0; i < devices.size(); ++i) {
|
||||
driverVersion = static_cast<amd::Device*>(devices[i])->info().driverVersion_;
|
||||
if (driverVersion.find("PAL") != std::string::npos) {
|
||||
gfxipVersion = devices[i]->settings().useLightning_
|
||||
? pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_
|
||||
: pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_;
|
||||
uint gfxIpCurrent = devices[i]->settings().useLightning_
|
||||
? static_cast<NullDevice*>(devices[i])->hwInfo()->gfxipVersionLC_
|
||||
: static_cast<NullDevice*>(devices[i])->hwInfo()->gfxipVersion_;
|
||||
if (gfxIpCurrent == gfxipVersion) {
|
||||
foundActive = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Don't report an offline device if it's active
|
||||
if (foundActive) {
|
||||
continue;
|
||||
// Don't report an offline device if it's active
|
||||
if (foundActive) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Loop through all previous devices in the Gfx9PlusSubDeviceInfo list
|
||||
// and compare them with the current entry to see if the current entry
|
||||
// was listed previously in the Gfx9PlusSubDeviceInfo, if so, then it
|
||||
// means the current entry already has been added in the offline device list
|
||||
for (uint j = 0; j < id; ++j) {
|
||||
if (pal::Gfx9PlusSubDeviceInfo[j].targetName_[0] == '\0') {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Loop through all previous devices in the Gfx9PlusSubDeviceInfo list
|
||||
// and compare them with the current entry to see if the current entry
|
||||
// was listed previously in the Gfx9PlusSubDeviceInfo, if so, then it
|
||||
// means the current entry already has been added in the offline device list
|
||||
for (uint j = 0; j < id; ++j) {
|
||||
if (pal::Gfx9PlusSubDeviceInfo[j].targetName_[0] == '\0') {
|
||||
continue;
|
||||
}
|
||||
if (strcmp(pal::Gfx9PlusSubDeviceInfo[j].targetName_,
|
||||
pal::Gfx9PlusSubDeviceInfo[id].targetName_) == 0) {
|
||||
foundDuplicate = true;
|
||||
break;
|
||||
}
|
||||
if (strcmp(pal::Gfx9PlusSubDeviceInfo[j].targetName_,
|
||||
pal::Gfx9PlusSubDeviceInfo[id].targetName_) == 0) {
|
||||
foundDuplicate = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Don't report an offline device twice
|
||||
if (foundDuplicate) {
|
||||
continue;
|
||||
}
|
||||
// Don't report an offline device twice
|
||||
if (foundDuplicate) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Pal::GfxIpLevel ipLevel = Pal::GfxIpLevel::_None;
|
||||
uint ipLevelMajor = round(gfxipVersion / 100);
|
||||
uint ipLevelMinor = round(gfxipVersion / 10 % 10);
|
||||
switch (ipLevelMajor) {
|
||||
Pal::GfxIpLevel ipLevel = Pal::GfxIpLevel::_None;
|
||||
uint ipLevelMajor = round(gfxipVersion / 100);
|
||||
uint ipLevelMinor = round(gfxipVersion / 10 % 10);
|
||||
switch (ipLevelMajor) {
|
||||
case 9:
|
||||
ipLevel = Pal::GfxIpLevel::GfxIp9;
|
||||
break;
|
||||
ipLevel = Pal::GfxIpLevel::GfxIp9;
|
||||
break;
|
||||
case 10:
|
||||
switch (ipLevelMinor) {
|
||||
case 0:
|
||||
ipLevel = Pal::GfxIpLevel::GfxIp10;
|
||||
break;
|
||||
case 1:
|
||||
ipLevel = Pal::GfxIpLevel::GfxIp10_1;
|
||||
break;
|
||||
case 2:
|
||||
ipLevel = Pal::GfxIpLevel::GfxIp10_2;
|
||||
break;
|
||||
case 3:
|
||||
ipLevel = Pal::GfxIpLevel::GfxIp10_3;
|
||||
break;
|
||||
case 0:
|
||||
ipLevel = Pal::GfxIpLevel::GfxIp10;
|
||||
break;
|
||||
case 1:
|
||||
ipLevel = Pal::GfxIpLevel::GfxIp10_1;
|
||||
break;
|
||||
case 2:
|
||||
ipLevel = Pal::GfxIpLevel::GfxIp10_2;
|
||||
break;
|
||||
case 3:
|
||||
ipLevel = Pal::GfxIpLevel::GfxIp10_3;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Pal::AsicRevision revision = Pal::AsicRevision::Unknown;
|
||||
uint xNACKSupported = pal::Gfx9PlusSubDeviceInfo[id].xnackEnabled_ ? 1 : 0;
|
||||
Pal::AsicRevision revision = Pal::AsicRevision::Unknown;
|
||||
uint xNACKSupported = pal::Gfx9PlusSubDeviceInfo[id].xnackEnabled_ ? 1 : 0;
|
||||
|
||||
switch (gfxipVersion) {
|
||||
switch (gfxipVersion) {
|
||||
case 901:
|
||||
case 900:
|
||||
revision = Pal::AsicRevision::Vega10;
|
||||
break;
|
||||
revision = Pal::AsicRevision::Vega10;
|
||||
break;
|
||||
case 903:
|
||||
case 902:
|
||||
revision = Pal::AsicRevision::Raven;
|
||||
break;
|
||||
revision = Pal::AsicRevision::Raven;
|
||||
break;
|
||||
case 905:
|
||||
case 904:
|
||||
revision = Pal::AsicRevision::Vega12;
|
||||
break;
|
||||
revision = Pal::AsicRevision::Vega12;
|
||||
break;
|
||||
case 907:
|
||||
case 906:
|
||||
revision = Pal::AsicRevision::Vega20;
|
||||
break;
|
||||
revision = Pal::AsicRevision::Vega20;
|
||||
break;
|
||||
case 1000:
|
||||
revision = Pal::AsicRevision::Navi10Lite;
|
||||
break;
|
||||
revision = Pal::AsicRevision::Navi10Lite;
|
||||
break;
|
||||
case 1010:
|
||||
revision = Pal::AsicRevision::Navi10;
|
||||
break;
|
||||
revision = Pal::AsicRevision::Navi10;
|
||||
break;
|
||||
case 1011:
|
||||
revision = Pal::AsicRevision::Navi12;
|
||||
break;
|
||||
revision = Pal::AsicRevision::Navi12;
|
||||
break;
|
||||
case 1012:
|
||||
revision = Pal::AsicRevision::Navi14;
|
||||
break;
|
||||
revision = Pal::AsicRevision::Navi14;
|
||||
break;
|
||||
case 1030:
|
||||
ShouldNotReachHere();
|
||||
break;
|
||||
}
|
||||
ShouldNotReachHere();
|
||||
break;
|
||||
}
|
||||
|
||||
NullDevice* dev = new NullDevice();
|
||||
if (nullptr != dev) {
|
||||
if (!dev->create(revision, ipLevel, xNACKSupported)) {
|
||||
delete dev;
|
||||
}
|
||||
else {
|
||||
dev->registerDevice();
|
||||
}
|
||||
NullDevice* dev = new NullDevice();
|
||||
if (nullptr != dev) {
|
||||
if (!dev->create(revision, ipLevel, xNACKSupported)) {
|
||||
delete dev;
|
||||
} else {
|
||||
dev->registerDevice();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -257,10 +254,10 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel,
|
||||
if ((GPU_ENABLE_PAL == 1) && (ipLevel == Pal::GfxIpLevel::_None)) {
|
||||
hwInfo_ = &DeviceInfo[static_cast<uint>(asicRevision)];
|
||||
} else if (ipLevel >= Pal::GfxIpLevel::GfxIp9) {
|
||||
subtarget = (static_cast<uint>(asicRevision_) %
|
||||
static_cast<uint>(Pal::AsicRevision::Vega10))
|
||||
<< 1 | xNACKSupported;
|
||||
hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget];
|
||||
subtarget = (static_cast<uint>(asicRevision_) % static_cast<uint>(Pal::AsicRevision::Vega10))
|
||||
<< 1 |
|
||||
xNACKSupported;
|
||||
hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget];
|
||||
|
||||
} else {
|
||||
return false;
|
||||
@@ -271,8 +268,7 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel,
|
||||
|
||||
// Report 512MB for all offline devices
|
||||
Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount];
|
||||
heaps[Pal::GpuHeapLocal].heapSize =
|
||||
heaps[Pal::GpuHeapLocal].physicalHeapSize = 512 * Mi;
|
||||
heaps[Pal::GpuHeapLocal].heapSize = heaps[Pal::GpuHeapLocal].physicalHeapSize = 512 * Mi;
|
||||
|
||||
Pal::WorkStationCaps wscaps = {};
|
||||
|
||||
@@ -295,7 +291,7 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel,
|
||||
info_.wavefrontWidth_ = settings().enableWave32Mode_ ? 32 : 64;
|
||||
|
||||
if (settings().useLightning_) {
|
||||
#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
|
||||
#if defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY)
|
||||
// create compilation object with cache support
|
||||
int gfxipMajor = hwInfo_->gfxipVersionLC_ / 100;
|
||||
int gfxipMinor = hwInfo_->gfxipVersionLC_ / 10 % 10;
|
||||
@@ -323,16 +319,16 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel,
|
||||
cacheCompilation_.reset(compObj);
|
||||
#endif
|
||||
} else {
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
const char* library = getenv("HSA_COMPILER_LIBRARY");
|
||||
aclCompilerOptions opts = { sizeof(aclCompilerOptions_0_8),
|
||||
library,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
AMD_OCL_SC_LIB };
|
||||
aclCompilerOptions opts = {sizeof(aclCompilerOptions_0_8),
|
||||
library,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
AMD_OCL_SC_LIB};
|
||||
// Initialize the compiler handle
|
||||
acl_error error;
|
||||
compiler_ = aclCompilerInit(&opts, &error);
|
||||
@@ -370,9 +366,9 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
|
||||
|
||||
info_.maxWorkItemDimensions_ = 3;
|
||||
|
||||
info_.maxComputeUnits_ = settings().enableWgpMode_ ?
|
||||
palProp.gfxipProperties.shaderCore.numAvailableCus / 2 :
|
||||
palProp.gfxipProperties.shaderCore.numAvailableCus;
|
||||
info_.maxComputeUnits_ = settings().enableWgpMode_
|
||||
? palProp.gfxipProperties.shaderCore.numAvailableCus / 2
|
||||
: palProp.gfxipProperties.shaderCore.numAvailableCus;
|
||||
|
||||
info_.numberOfShaderEngines = palProp.gfxipProperties.shaderCore.numShaderEngines;
|
||||
|
||||
@@ -427,7 +423,8 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
|
||||
if (GPU_ADD_HBCC_SIZE) {
|
||||
localRAM = heaps[Pal::GpuHeapLocal].heapSize + heaps[Pal::GpuHeapInvisible].heapSize;
|
||||
} else {
|
||||
localRAM = heaps[Pal::GpuHeapLocal].physicalHeapSize + heaps[Pal::GpuHeapInvisible].physicalHeapSize;
|
||||
localRAM =
|
||||
heaps[Pal::GpuHeapLocal].physicalHeapSize + heaps[Pal::GpuHeapInvisible].physicalHeapSize;
|
||||
}
|
||||
|
||||
info_.globalMemSize_ = (static_cast<cl_ulong>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
|
||||
@@ -445,10 +442,10 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
|
||||
// Find the largest heap form FB memory
|
||||
if (GPU_ADD_HBCC_SIZE) {
|
||||
info_.maxMemAllocSize_ = std::max(cl_ulong(heaps[Pal::GpuHeapLocal].heapSize),
|
||||
cl_ulong(heaps[Pal::GpuHeapInvisible].heapSize));
|
||||
cl_ulong(heaps[Pal::GpuHeapInvisible].heapSize));
|
||||
} else {
|
||||
info_.maxMemAllocSize_ = std::max(cl_ulong(heaps[Pal::GpuHeapLocal].physicalHeapSize),
|
||||
cl_ulong(heaps[Pal::GpuHeapInvisible].physicalHeapSize));
|
||||
cl_ulong(heaps[Pal::GpuHeapInvisible].physicalHeapSize));
|
||||
}
|
||||
|
||||
#if defined(ATI_OS_WIN)
|
||||
@@ -561,7 +558,7 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
|
||||
|
||||
::strcpy(info_.vendor_, "Advanced Micro Devices, Inc.");
|
||||
::snprintf(info_.driverVersion_, sizeof(info_.driverVersion_) - 1, AMD_BUILD_STRING " (PAL%s)",
|
||||
settings().useLightning_ ? ",LC" : ",HSAIL");
|
||||
settings().useLightning_ ? ",LC" : ",HSAIL");
|
||||
|
||||
info_.profile_ = "FULL_PROFILE";
|
||||
if (settings().oclVersion_ >= OpenCL20) {
|
||||
@@ -640,15 +637,16 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
|
||||
info_.cuPerShaderArray_ = palProp.gfxipProperties.shaderCore.numCusPerShaderArray;
|
||||
info_.simdWidth_ = hwInfo()->simdWidth_;
|
||||
info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_;
|
||||
info_.wavefrontWidth_ = settings().enableWave32Mode_ ? 32:
|
||||
palProp.gfxipProperties.shaderCore.nativeWavefrontSize;
|
||||
info_.wavefrontWidth_ =
|
||||
settings().enableWave32Mode_ ? 32 : palProp.gfxipProperties.shaderCore.nativeWavefrontSize;
|
||||
info_.availableSGPRs_ = palProp.gfxipProperties.shaderCore.numAvailableSgprs;
|
||||
|
||||
info_.globalMemChannelBanks_ = 4;
|
||||
info_.globalMemChannelBankWidth_ = hwInfo()->memChannelBankWidth_;
|
||||
info_.localMemSizePerCU_ = hwInfo()->localMemSizePerCU_;
|
||||
info_.localMemBanks_ = hwInfo()->localMemBanks_;
|
||||
info_.gfxipVersion_ = settings().useLightning_ ? hwInfo()->gfxipVersionLC_ : hwInfo()->gfxipVersion_;
|
||||
info_.gfxipVersion_ =
|
||||
settings().useLightning_ ? hwInfo()->gfxipVersionLC_ : hwInfo()->gfxipVersion_;
|
||||
|
||||
info_.timeStampFrequency_ = 1000000;
|
||||
info_.numAsyncQueues_ = numComputeRings;
|
||||
@@ -661,7 +659,7 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
|
||||
info_.pcieDeviceId_ = palProp.deviceId;
|
||||
info_.pcieRevisionId_ = palProp.revisionId;
|
||||
info_.maxThreadsPerCU_ = info_.wavefrontWidth_ * hwInfo()->simdPerCU_ *
|
||||
palProp.gfxipProperties.shaderCore.numWavefrontsPerSimd;
|
||||
palProp.gfxipProperties.shaderCore.numWavefrontsPerSimd;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -789,8 +787,7 @@ Device::Device()
|
||||
globalScratchBuf_(nullptr),
|
||||
srdManager_(nullptr),
|
||||
resourceList_(nullptr),
|
||||
rgpCaptureMgr_(nullptr)
|
||||
{}
|
||||
rgpCaptureMgr_(nullptr) {}
|
||||
|
||||
Device::~Device() {
|
||||
// remove the HW debug manager
|
||||
@@ -803,8 +800,8 @@ Device::~Device() {
|
||||
}
|
||||
|
||||
if (glb_ctx_ != nullptr) {
|
||||
glb_ctx_->release();
|
||||
glb_ctx_ = nullptr;
|
||||
glb_ctx_->release();
|
||||
glb_ctx_ = nullptr;
|
||||
}
|
||||
|
||||
delete srdManager_;
|
||||
@@ -878,19 +875,21 @@ bool Device::create(Pal::IDevice* device) {
|
||||
ipLevel_ = properties().gfxLevel;
|
||||
asicRevision_ = properties().revision;
|
||||
|
||||
// XNACK flag should be set for PageMigration | IOMMUv2 Support
|
||||
uint isXNACKSupported = static_cast<uint>(properties_.gpuMemoryProperties.flags.pageMigrationEnabled
|
||||
|| properties_.gpuMemoryProperties.flags.iommuv2Support);
|
||||
// XNACK flag should be set for PageMigration | IOMMUv2 Support
|
||||
uint isXNACKSupported =
|
||||
static_cast<uint>(properties_.gpuMemoryProperties.flags.pageMigrationEnabled ||
|
||||
properties_.gpuMemoryProperties.flags.iommuv2Support);
|
||||
uint subtarget = isXNACKSupported;
|
||||
|
||||
// Update HW info for the device
|
||||
if ((GPU_ENABLE_PAL == 1) && (properties().revision <= Pal::AsicRevision::Polaris12)) {
|
||||
hwInfo_ = &DeviceInfo[static_cast<uint>(properties().revision)];
|
||||
} else if (ipLevel_ >= Pal::GfxIpLevel::GfxIp9) {
|
||||
// For compiler sub targets
|
||||
subtarget = (static_cast<uint>(asicRevision_) % static_cast<uint>(Pal::AsicRevision::Vega10)) << 1 |
|
||||
subtarget;
|
||||
hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget];
|
||||
// For compiler sub targets
|
||||
subtarget = (static_cast<uint>(asicRevision_) % static_cast<uint>(Pal::AsicRevision::Vega10))
|
||||
<< 1 |
|
||||
subtarget;
|
||||
hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget];
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
@@ -995,7 +994,7 @@ bool Device::create(Pal::IDevice* device) {
|
||||
}
|
||||
|
||||
if (settings().useLightning_) {
|
||||
#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
|
||||
#if defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY)
|
||||
// create compilation object with cache support
|
||||
int gfxipMajor = hwInfo()->gfxipVersionLC_ / 100;
|
||||
int gfxipMinor = hwInfo()->gfxipVersionLC_ / 10 % 10;
|
||||
@@ -1013,7 +1012,7 @@ bool Device::create(Pal::IDevice* device) {
|
||||
}
|
||||
|
||||
amd::CacheCompilation* compObj = new amd::CacheCompilation(
|
||||
cacheTarget.str(), "_pal", OCL_CODE_CACHE_ENABLE, OCL_CODE_CACHE_RESET);
|
||||
cacheTarget.str(), "_pal", OCL_CODE_CACHE_ENABLE, OCL_CODE_CACHE_RESET);
|
||||
if (!compObj) {
|
||||
LogError("Unable to create cache compilation object!");
|
||||
return false;
|
||||
@@ -1021,18 +1020,17 @@ bool Device::create(Pal::IDevice* device) {
|
||||
|
||||
cacheCompilation_.reset(compObj);
|
||||
#endif
|
||||
}
|
||||
else {
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
} else {
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
const char* library = getenv("HSA_COMPILER_LIBRARY");
|
||||
aclCompilerOptions opts = { sizeof(aclCompilerOptions_0_8),
|
||||
library,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
AMD_OCL_SC_LIB };
|
||||
aclCompilerOptions opts = {sizeof(aclCompilerOptions_0_8),
|
||||
library,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
AMD_OCL_SC_LIB};
|
||||
// Initialize the compiler handle
|
||||
acl_error error;
|
||||
compiler_ = aclCompilerInit(&opts, &error);
|
||||
@@ -1056,7 +1054,7 @@ bool Device::create(Pal::IDevice* device) {
|
||||
|
||||
if ((glb_ctx_ == nullptr) && (gNumDevices > 1) && (device == gDeviceList[gNumDevices - 1])) {
|
||||
std::vector<amd::Device*> devices;
|
||||
uint32_t numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, true);
|
||||
uint32_t numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, true);
|
||||
// Add all PAL devices
|
||||
for (uint32_t i = gStartDevice; i < numDevices; ++i) {
|
||||
devices.push_back(amd::Device::devices()[i]);
|
||||
@@ -1070,8 +1068,8 @@ bool Device::create(Pal::IDevice* device) {
|
||||
if (glb_ctx_ == nullptr) {
|
||||
return false;
|
||||
}
|
||||
amd::Buffer* buf =
|
||||
new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
|
||||
amd::Buffer* buf =
|
||||
new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
|
||||
if ((buf != nullptr) && buf->create()) {
|
||||
p2p_stage_ = buf;
|
||||
} else {
|
||||
@@ -1086,11 +1084,8 @@ bool Device::create(Pal::IDevice* device) {
|
||||
|
||||
// =====================================================================================================================
|
||||
// Master function that handles developer callbacks from PAL.
|
||||
void PAL_STDCALL Device::PalDeveloperCallback(
|
||||
void* pPrivateData,
|
||||
const Pal::uint32 deviceIndex,
|
||||
Pal::Developer::CallbackType type,
|
||||
void* pCbData) {
|
||||
void PAL_STDCALL Device::PalDeveloperCallback(void* pPrivateData, const Pal::uint32 deviceIndex,
|
||||
Pal::Developer::CallbackType type, void* pCbData) {
|
||||
Device* device = static_cast<Device*>(pPrivateData);
|
||||
const auto& barrier = *static_cast<const Pal::Developer::BarrierData*>(pCbData);
|
||||
|
||||
@@ -1099,7 +1094,7 @@ void PAL_STDCALL Device::PalDeveloperCallback(
|
||||
VirtualGPU* gpu = nullptr;
|
||||
if (pBarrierData->pCmdBuffer != nullptr) {
|
||||
// Find which queue the current command buffer belongs
|
||||
for (const auto& it: device->vgpus()) {
|
||||
for (const auto& it : device->vgpus()) {
|
||||
if (it->isActiveCmd(pBarrierData->pCmdBuffer)) {
|
||||
gpu = it;
|
||||
break;
|
||||
@@ -1112,18 +1107,18 @@ void PAL_STDCALL Device::PalDeveloperCallback(
|
||||
}
|
||||
|
||||
switch (type) {
|
||||
case Pal::Developer::CallbackType::BarrierBegin:
|
||||
device->rgpCaptureMgr()->WriteBarrierStartMarker(gpu, barrier);
|
||||
break;
|
||||
case Pal::Developer::CallbackType::BarrierEnd:
|
||||
device->rgpCaptureMgr()->WriteBarrierEndMarker(gpu, barrier);
|
||||
break;
|
||||
case Pal::Developer::CallbackType::ImageBarrier:
|
||||
assert(false);
|
||||
break;
|
||||
case Pal::Developer::CallbackType::DrawDispatch:
|
||||
case Pal::Developer::CallbackType::BarrierBegin:
|
||||
device->rgpCaptureMgr()->WriteBarrierStartMarker(gpu, barrier);
|
||||
break;
|
||||
default:
|
||||
case Pal::Developer::CallbackType::BarrierEnd:
|
||||
device->rgpCaptureMgr()->WriteBarrierEndMarker(gpu, barrier);
|
||||
break;
|
||||
case Pal::Developer::CallbackType::ImageBarrier:
|
||||
assert(false);
|
||||
break;
|
||||
case Pal::Developer::CallbackType::DrawDispatch:
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -1136,15 +1131,16 @@ bool Device::initializeHeapResources() {
|
||||
// Request all compute engines
|
||||
finalizeInfo.requestedEngineCounts[Pal::EngineTypeCompute].engines =
|
||||
((1 << numComputeEngines_) - 1);
|
||||
for (const auto& it: exclusiveComputeEnginesId_) {
|
||||
for (const auto& it : exclusiveComputeEnginesId_) {
|
||||
// Request real time compute engines
|
||||
finalizeInfo.requestedEngineCounts[Pal::EngineTypeExclusiveCompute].engines |= (1 << it.second);
|
||||
finalizeInfo.requestedEngineCounts[Pal::EngineTypeExclusiveCompute].engines |=
|
||||
(1 << it.second);
|
||||
}
|
||||
// Request all SDMA engines
|
||||
finalizeInfo.requestedEngineCounts[Pal::EngineTypeDma].engines = (1 << numDmaEngines_) - 1;
|
||||
|
||||
if (iDev()->Finalize(finalizeInfo) != Pal::Result::Success) {
|
||||
return false;
|
||||
return false;
|
||||
}
|
||||
|
||||
heapInitComplete_ = true;
|
||||
@@ -1201,7 +1197,8 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
|
||||
if (queue != nullptr) {
|
||||
profiling = queue->properties().test(CL_QUEUE_PROFILING_ENABLE);
|
||||
if (queue->asHostQueue() != nullptr) {
|
||||
bool interopQueue = (0 != (queue->context().info().flags_ &
|
||||
bool interopQueue = (0 !=
|
||||
(queue->context().info().flags_ &
|
||||
(amd::Context::GLDeviceKhr | amd::Context::D3D10DeviceKhr |
|
||||
amd::Context::D3D11DeviceKhr)));
|
||||
rtCUs = queue->rtCUs();
|
||||
@@ -1233,8 +1230,7 @@ device::Program* Device::createProgram(amd::option::Options* options) {
|
||||
device::Program* program;
|
||||
if (settings().useLightning_) {
|
||||
program = new LightningProgram(*this);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
program = new HSAILProgram(*this);
|
||||
}
|
||||
if (program == nullptr) {
|
||||
@@ -1249,9 +1245,7 @@ typedef std::unordered_map<int, bool> requestedDevices_t;
|
||||
|
||||
//! Parses the requested list of devices to be exposed to the user.
|
||||
static void parseRequestedDeviceList(const char* requestedDeviceList,
|
||||
requestedDevices_t& requestedDevices,
|
||||
uint32_t numDevices) {
|
||||
|
||||
requestedDevices_t& requestedDevices, uint32_t numDevices) {
|
||||
char* pch = strtok(const_cast<char*>(requestedDeviceList), ",");
|
||||
while (pch != nullptr) {
|
||||
bool deviceIdValid = true;
|
||||
@@ -1263,8 +1257,7 @@ static void parseRequestedDeviceList(const char* requestedDeviceList,
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (currentDeviceIndex < 0 ||
|
||||
static_cast<uint32_t>(currentDeviceIndex) >= numDevices) {
|
||||
if (currentDeviceIndex < 0 || static_cast<uint32_t>(currentDeviceIndex) >= numDevices) {
|
||||
deviceIdValid = false;
|
||||
}
|
||||
// Get next token.
|
||||
@@ -1310,9 +1303,9 @@ bool Device::init() {
|
||||
// Count up all the devices in the system.
|
||||
platform_->EnumerateDevices(&gNumDevices, &gDeviceList[0]);
|
||||
|
||||
const char* requestedDeviceList = amd::IS_HIP ? ((HIP_VISIBLE_DEVICES[0] != '\0') ?
|
||||
HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES)
|
||||
: GPU_DEVICE_ORDINAL;
|
||||
const char* requestedDeviceList = amd::IS_HIP
|
||||
? ((HIP_VISIBLE_DEVICES[0] != '\0') ? HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES)
|
||||
: GPU_DEVICE_ORDINAL;
|
||||
|
||||
if (requestedDeviceList[0] != '\0') {
|
||||
useDeviceList = true;
|
||||
@@ -1465,8 +1458,8 @@ pal::Memory* Device::createBuffer(amd::Memory& owner, bool directAccess) const {
|
||||
if (result) {
|
||||
// Disallow permanent map for Win7 only, since OS will move buffer to sysmem
|
||||
if (IS_LINUX ||
|
||||
// Or Win10
|
||||
(properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs == false)) {
|
||||
// Or Win10
|
||||
(properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs == false)) {
|
||||
void* address = gpuMemory->map(nullptr);
|
||||
CondLog(address == nullptr, "PAL failed lock of persistent memory!");
|
||||
}
|
||||
@@ -1697,9 +1690,9 @@ device::Memory* Device::createMemory(amd::Memory& owner) const {
|
||||
(memory->memoryType() != Resource::ExternalPhysical) &&
|
||||
((owner.getHostMem() != nullptr) ||
|
||||
((nullptr != owner.parent()) && (owner.getHostMem() != nullptr)))) {
|
||||
bool ok = memory->pinSystemMemory(owner.getHostMem(), (owner.getHostMemRef()->size())
|
||||
? owner.getHostMemRef()->size()
|
||||
: owner.getSize());
|
||||
bool ok = memory->pinSystemMemory(
|
||||
owner.getHostMem(),
|
||||
(owner.getHostMemRef()->size()) ? owner.getHostMemRef()->size() : owner.getSize());
|
||||
//! \note: Ignore the pinning result for now
|
||||
}
|
||||
|
||||
@@ -1720,9 +1713,9 @@ bool Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler)
|
||||
device::Memory* Device::createView(amd::Memory& owner, const device::Memory& parent) const {
|
||||
assert((owner.asImage() != nullptr) && "View supports images only");
|
||||
const amd::Image& image = *owner.asImage();
|
||||
pal::Memory* gpuImage = new pal::Image(
|
||||
*this, owner, image.getWidth(), image.getHeight(), image.getDepth(),
|
||||
image.getImageFormat(), image.getType(), image.getMipLevels());
|
||||
pal::Memory* gpuImage =
|
||||
new pal::Image(*this, owner, image.getWidth(), image.getHeight(), image.getDepth(),
|
||||
image.getImageFormat(), image.getType(), image.getMipLevels());
|
||||
|
||||
// Create resource
|
||||
if (nullptr != gpuImage) {
|
||||
@@ -1827,19 +1820,18 @@ bool Device::globalFreeMemory(size_t* freeMemory) const {
|
||||
Pal::gpusize invisible = allocedMem[Pal::GpuHeapInvisible] - resourceCache().lclCacheSize();
|
||||
|
||||
// Fill free memory info
|
||||
freeMemory[TotalFreeMemory] = static_cast<size_t>((info().globalMemSize_ -
|
||||
(local + invisible)) / Ki);
|
||||
freeMemory[TotalFreeMemory] =
|
||||
static_cast<size_t>((info().globalMemSize_ - (local + invisible)) / Ki);
|
||||
if (invisible >= heaps_[Pal::GpuHeapInvisible].heapSize) {
|
||||
invisible = 0;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
invisible = heaps_[Pal::GpuHeapInvisible].heapSize - invisible;
|
||||
}
|
||||
freeMemory[LargestFreeBlock] = static_cast<size_t>(invisible) / Ki;
|
||||
|
||||
if (settings().apuSystem_) {
|
||||
Pal::gpusize sysMem = allocedMem[Pal::GpuHeapGartCacheable] + allocedMem[Pal::GpuHeapGartUswc] -
|
||||
resourceCache().cacheSize() + resourceCache().lclCacheSize();
|
||||
resourceCache().cacheSize() + resourceCache().lclCacheSize();
|
||||
sysMem /= Ki;
|
||||
if (sysMem >= freeMemory[TotalFreeMemory]) {
|
||||
freeMemory[TotalFreeMemory] = 0;
|
||||
@@ -1945,8 +1937,7 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu) {
|
||||
amd::ScopedLock lk(scratchAlloc_);
|
||||
uint sb = vgpu->hwRing();
|
||||
static const uint WaveSizeLimit = ((1 << 21) - 256);
|
||||
const uint threadSizeLimit =
|
||||
WaveSizeLimit / info().wavefrontWidth_;
|
||||
const uint threadSizeLimit = WaveSizeLimit / info().wavefrontWidth_;
|
||||
if (regNum > threadSizeLimit) {
|
||||
LogError("Requested private memory is bigger than HW supports!");
|
||||
regNum = threadSizeLimit;
|
||||
@@ -1968,9 +1959,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu) {
|
||||
// Calculate the size of the scratch buffer for a queue
|
||||
uint32_t numTotalCUs = info().maxComputeUnits_;
|
||||
uint32_t numMaxWaves = settings().numScratchWavesPerCu_ * numTotalCUs;
|
||||
scratchBuf->size_ =
|
||||
static_cast<uint64_t>(info().wavefrontWidth_) *
|
||||
scratchBuf->regNum_ * numMaxWaves * sizeof(uint32_t);
|
||||
scratchBuf->size_ = static_cast<uint64_t>(info().wavefrontWidth_) * scratchBuf->regNum_ *
|
||||
numMaxWaves * sizeof(uint32_t);
|
||||
scratchBuf->size_ = std::min(scratchBuf->size_, info().maxMemAllocSize_);
|
||||
scratchBuf->size_ = std::min(scratchBuf->size_, uint64_t(3 * Gi));
|
||||
// Note: Generic address space setup in HW requires 64KB alignment for scratch
|
||||
@@ -2280,7 +2270,7 @@ void Device::SrdManager::freeSrdSlot(uint64_t addr) {
|
||||
void Device::updateAllocedMemory(Pal::GpuHeap heap, Pal::gpusize size, bool free) const {
|
||||
if (free) {
|
||||
allocedMem[heap] -= size;
|
||||
} else {
|
||||
} else {
|
||||
allocedMem[heap] += size;
|
||||
}
|
||||
}
|
||||
@@ -2337,12 +2327,18 @@ cl_int Device::hwDebugManagerInit(amd::Context* context, uintptr_t messageStorag
|
||||
return status;
|
||||
}
|
||||
|
||||
bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
|
||||
bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
|
||||
cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
|
||||
bool result = false;
|
||||
Pal::SetClockModeInput setClockMode = {};
|
||||
Pal::DeviceClockMode palClockMode = static_cast<Pal::DeviceClockMode>(setClockModeInput.clock_mode);
|
||||
Pal::DeviceClockMode palClockMode =
|
||||
static_cast<Pal::DeviceClockMode>(setClockModeInput.clock_mode);
|
||||
setClockMode.clockMode = palClockMode;
|
||||
result = (Pal::Result::Success == (iDev()->SetClockMode(setClockMode, reinterpret_cast<Pal::SetClockModeOutput*>(pSetClockModeOutput))))? true : false;
|
||||
result = (Pal::Result::Success ==
|
||||
(iDev()->SetClockMode(setClockMode,
|
||||
reinterpret_cast<Pal::SetClockModeOutput*>(pSetClockModeOutput))))
|
||||
? true
|
||||
: false;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
@@ -49,7 +49,7 @@ class NullDevice : public amd::Device {
|
||||
bool create(Pal::AsicRevision asicRevision, //!< GPU ASIC revision
|
||||
Pal::GfxIpLevel ipLevel, //!< GPU ip level
|
||||
uint xNACKSupported = 0 //!< GPU xNACKSupported
|
||||
);
|
||||
);
|
||||
|
||||
//! Instantiate a new virtual device
|
||||
virtual device::VirtualDevice* createVirtualDevice(amd::CommandQueue* queue = NULL) {
|
||||
@@ -111,11 +111,14 @@ class NullDevice : public amd::Device {
|
||||
virtual void svmFree(void* ptr) const { return; }
|
||||
|
||||
void* Alloc(const Util::AllocInfo& allocInfo) { return allocator_.Alloc(allocInfo); }
|
||||
void Free(const Util::FreeInfo& freeInfo) { allocator_.Free(freeInfo); }
|
||||
virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { return true; }
|
||||
void Free(const Util::FreeInfo& freeInfo) { allocator_.Free(freeInfo); }
|
||||
virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
|
||||
cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
static Util::GenericAllocator allocator_; //!< Generic memory allocator in PAL
|
||||
static Util::GenericAllocator allocator_; //!< Generic memory allocator in PAL
|
||||
|
||||
Pal::AsicRevision asicRevision_; //!< ASIC revision
|
||||
Pal::GfxIpLevel ipLevel_; //!< Device IP level
|
||||
@@ -127,7 +130,7 @@ class NullDevice : public amd::Device {
|
||||
size_t maxTextureSize, //!< Maximum texture size supported in HW
|
||||
uint numComputeRings, //!< Number of compute rings
|
||||
uint numExclusiveComputeRings //!< Number of exclusive compute rings
|
||||
);
|
||||
);
|
||||
};
|
||||
|
||||
//! Forward declarations
|
||||
@@ -148,26 +151,22 @@ class ThreadTrace;
|
||||
#ifndef CL_FILTER_NONE
|
||||
#define CL_FILTER_NONE 0x1142
|
||||
#endif
|
||||
enum class ExclusiveQueueType : uint32_t {
|
||||
RealTime0 = 0,
|
||||
RealTime1,
|
||||
Medium
|
||||
};
|
||||
enum class ExclusiveQueueType : uint32_t { RealTime0 = 0, RealTime1, Medium };
|
||||
class Sampler : public device::Sampler {
|
||||
public:
|
||||
//! Constructor
|
||||
Sampler(const Device& dev) : dev_(dev) {}
|
||||
Sampler(const Device& dev) : dev_(dev) {}
|
||||
|
||||
//! Default destructor for the device memory object
|
||||
virtual ~Sampler();
|
||||
|
||||
//! Creates a device sampler from the OCL sampler state
|
||||
bool create(uint32_t oclSamplerState //!< OCL sampler state
|
||||
);
|
||||
);
|
||||
|
||||
//! Creates a device sampler from the OCL sampler state
|
||||
bool create(const amd::Sampler& owner //!< AMD sampler object
|
||||
);
|
||||
);
|
||||
|
||||
private:
|
||||
//! Disable default copy constructor
|
||||
@@ -216,7 +215,7 @@ class Device : public NullDevice {
|
||||
//! Releases transfer buffer
|
||||
void release(VirtualGPU& gpu, //!< Virual GPU object used with the buffer
|
||||
Memory& buffer //!< Transfer buffer for release
|
||||
);
|
||||
);
|
||||
|
||||
//! Returns the buffer's size for transfer
|
||||
size_t bufSize() const { return bufSize_; }
|
||||
@@ -308,7 +307,7 @@ class Device : public NullDevice {
|
||||
//! Initialise a device (i.e. all parts of the constructor that could
|
||||
//! potentially fail)
|
||||
bool create(Pal::IDevice* device //!< PAL device interface object
|
||||
);
|
||||
);
|
||||
|
||||
//! Destructor for the physical GPU device
|
||||
virtual ~Device();
|
||||
@@ -346,7 +345,8 @@ class Device : public NullDevice {
|
||||
virtual bool validateKernel(const amd::Kernel& kernel, //!< AMD kernel object
|
||||
const device::VirtualDevice* vdev);
|
||||
|
||||
virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput);
|
||||
virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
|
||||
cl_set_device_clock_mode_output_amd* pSetClockModeOutput);
|
||||
|
||||
//! Retrieves information about free memory on a GPU device
|
||||
virtual bool globalFreeMemory(size_t* freeMemory) const;
|
||||
@@ -398,9 +398,10 @@ class Device : public NullDevice {
|
||||
//! Returns the number of available compute rings
|
||||
uint numExclusiveComputeEngines() const { return exclusiveComputeEnginesId_.size(); }
|
||||
|
||||
//! Returns the map of available exclusive compute rings with the engine index
|
||||
const std::map<ExclusiveQueueType, uint32_t>& exclusiveComputeEnginesId() const
|
||||
{ return exclusiveComputeEnginesId_; }
|
||||
//! Returns the map of available exclusive compute rings with the engine index
|
||||
const std::map<ExclusiveQueueType, uint32_t>& exclusiveComputeEnginesId() const {
|
||||
return exclusiveComputeEnginesId_;
|
||||
}
|
||||
|
||||
//! Returns the number of available DMA engines
|
||||
uint numDMAEngines() const { return numDmaEngines_; }
|
||||
@@ -526,11 +527,8 @@ class Device : public NullDevice {
|
||||
}
|
||||
|
||||
private:
|
||||
static void PAL_STDCALL PalDeveloperCallback(
|
||||
void* pPrivateData,
|
||||
const Pal::uint32 deviceIndex,
|
||||
Pal::Developer::CallbackType type,
|
||||
void* pCbData);
|
||||
static void PAL_STDCALL PalDeveloperCallback(void* pPrivateData, const Pal::uint32 deviceIndex,
|
||||
Pal::Developer::CallbackType type, void* pCbData);
|
||||
|
||||
//! Disable copy constructor
|
||||
Device(const Device&);
|
||||
@@ -554,36 +552,37 @@ class Device : public NullDevice {
|
||||
//! Allocates/reallocates the scratch buffer, according to the usage
|
||||
bool allocScratch(uint regNum, //!< Number of the scratch registers
|
||||
const VirtualGPU* vgpu //!< Virtual GPU for the allocation
|
||||
);
|
||||
);
|
||||
|
||||
//! Interop for D3D devices
|
||||
bool associateD3D11Device(void* d3d11Device //!< void* is of type ID3D11Device*
|
||||
);
|
||||
);
|
||||
bool associateD3D10Device(void* d3d10Device //!< void* is of type ID3D10Device*
|
||||
);
|
||||
);
|
||||
bool associateD3D9Device(void* d3d9Device //!< void* is of type IDirect3DDevice9*
|
||||
);
|
||||
);
|
||||
//! Interop for GL device
|
||||
bool glAssociate(void* GLplatformContext, void* GLdeviceContext) const;
|
||||
bool glDissociate(void* GLplatformContext, void* GLdeviceContext) const;
|
||||
|
||||
static char* platformObj_; //!< Memory allocated for PAL platform object
|
||||
static Pal::IPlatform* platform_; //!< Pointer to the PAL platform object
|
||||
static char* platformObj_; //!< Memory allocated for PAL platform object
|
||||
static Pal::IPlatform* platform_; //!< Pointer to the PAL platform object
|
||||
|
||||
amd::Context* context_; //!< A dummy context for internal allocations
|
||||
mutable amd::Monitor lockAsyncOps_; //!< Lock to serialise all async ops on this device
|
||||
amd::Context* context_; //!< A dummy context for internal allocations
|
||||
mutable amd::Monitor lockAsyncOps_; //!< Lock to serialise all async ops on this device
|
||||
//! Lock to serialise all async ops on initialization heap operation
|
||||
mutable amd::Monitor lockForInitHeap_;
|
||||
mutable amd::Monitor lockPAL_; //!< Lock to serialise PAL access
|
||||
mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access
|
||||
mutable amd::Monitor scratchAlloc_; //!< Lock to serialise scratch allocation
|
||||
mutable amd::Monitor mapCacheOps_; //!< Lock to serialise cache for the map resources
|
||||
mutable amd::Monitor lockResourceOps_; //!< Lock to serialise resource access
|
||||
XferBuffers* xferRead_; //!< Transfer buffers read
|
||||
std::vector<amd::Memory*>* mapCache_; //!< Map cache info structure
|
||||
ResourceCache* resourceCache_; //!< Resource cache
|
||||
uint numComputeEngines_; //!< The number of available compute engines
|
||||
std::map<ExclusiveQueueType, uint32_t> exclusiveComputeEnginesId_;//!< The number of available compute engines
|
||||
mutable amd::Monitor lockForInitHeap_;
|
||||
mutable amd::Monitor lockPAL_; //!< Lock to serialise PAL access
|
||||
mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access
|
||||
mutable amd::Monitor scratchAlloc_; //!< Lock to serialise scratch allocation
|
||||
mutable amd::Monitor mapCacheOps_; //!< Lock to serialise cache for the map resources
|
||||
mutable amd::Monitor lockResourceOps_; //!< Lock to serialise resource access
|
||||
XferBuffers* xferRead_; //!< Transfer buffers read
|
||||
std::vector<amd::Memory*>* mapCache_; //!< Map cache info structure
|
||||
ResourceCache* resourceCache_; //!< Resource cache
|
||||
uint numComputeEngines_; //!< The number of available compute engines
|
||||
std::map<ExclusiveQueueType, uint32_t>
|
||||
exclusiveComputeEnginesId_; //!< The number of available compute engines
|
||||
uint numDmaEngines_; //!< The number of available compute engines
|
||||
bool heapInitComplete_; //!< Keep track of initialization status of heap resources
|
||||
VirtualGPU* xferQueue_; //!< Transfer queue
|
||||
@@ -594,10 +593,13 @@ class Device : public NullDevice {
|
||||
mutable bool freeCPUMem_; //!< flag to mark GPU free SVM CPU mem
|
||||
Pal::DeviceProperties properties_; //!< PAL device properties
|
||||
Pal::IDevice* device_; //!< PAL device object
|
||||
mutable std::atomic<Pal::gpusize> allocedMem[Pal::GpuHeap::GpuHeapCount]; //!< Free memory counter
|
||||
std::unordered_set<Resource*>* resourceList_; //!< Active resource list
|
||||
RgpCaptureMgr* rgpCaptureMgr_; //!< RGP capture manager
|
||||
Pal::GpuMemoryHeapProperties heaps_[Pal::GpuHeapCount]; //!< Information about heaps, returned from PAL
|
||||
mutable std::atomic<Pal::gpusize>
|
||||
allocedMem[Pal::GpuHeap::GpuHeapCount]; //!< Free memory counter
|
||||
std::unordered_set<Resource*>* resourceList_; //!< Active resource list
|
||||
RgpCaptureMgr* rgpCaptureMgr_; //!< RGP capture manager
|
||||
Pal::GpuMemoryHeapProperties
|
||||
heaps_[Pal::GpuHeapCount]; //!< Information about heaps, returned from PAL
|
||||
};
|
||||
|
||||
/*@}*/} // namespace pal
|
||||
/*@}*/ // namespace pal
|
||||
} // namespace pal
|
||||
|
||||
@@ -3,19 +3,19 @@
|
||||
#if defined(ATI_OS_LINUX)
|
||||
namespace pal {
|
||||
bool Device::associateD3D10Device(void* d3d10Device) { return false; }
|
||||
} // pal
|
||||
} // namespace pal
|
||||
#else // !ATI_OS_WIN
|
||||
|
||||
#include <D3D10_1.h>
|
||||
|
||||
/**************************************************************************************************************
|
||||
* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
|
||||
* This means OCL client spec will need to change to include headers directly from the DXX perforce
|
||||
*tree.
|
||||
* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
|
||||
* without notification. So it is safe to use a local copy of the relevant DXX extension interface
|
||||
*classes.
|
||||
**************************************************************************************************************/
|
||||
* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
|
||||
* This means OCL client spec will need to change to include headers directly from the DXX perforce
|
||||
*tree.
|
||||
* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
|
||||
* without notification. So it is safe to use a local copy of the relevant DXX extension interface
|
||||
*classes.
|
||||
**************************************************************************************************************/
|
||||
#include "DxxOpenCLInteropExt.h"
|
||||
|
||||
namespace pal {
|
||||
@@ -127,6 +127,6 @@ bool Device::associateD3D10Device(void* d3d10Device) {
|
||||
return canInteroperate;
|
||||
}
|
||||
|
||||
} // pal
|
||||
} // namespace pal
|
||||
|
||||
#endif // !ATI_OS_WIN
|
||||
|
||||
@@ -3,19 +3,19 @@
|
||||
#if defined(ATI_OS_LINUX)
|
||||
namespace pal {
|
||||
bool Device::associateD3D11Device(void* d3d11Device) { return false; }
|
||||
}
|
||||
} // namespace pal
|
||||
#else // !ATI_OS_LINUX
|
||||
|
||||
#include <D3D11.h>
|
||||
|
||||
/**************************************************************************************************************
|
||||
* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
|
||||
* This means OCL client spec will need to change to include headers directly from the DXX perforce
|
||||
*tree.
|
||||
* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
|
||||
* without notification. So it is safe to use a local copy of the relevant DXX extension interface
|
||||
*classes.
|
||||
**************************************************************************************************************/
|
||||
* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
|
||||
* This means OCL client spec will need to change to include headers directly from the DXX perforce
|
||||
*tree.
|
||||
* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
|
||||
* without notification. So it is safe to use a local copy of the relevant DXX extension interface
|
||||
*classes.
|
||||
**************************************************************************************************************/
|
||||
#include "DxxOpenCLInteropExt.h"
|
||||
|
||||
namespace pal {
|
||||
@@ -128,6 +128,6 @@ bool Device::associateD3D11Device(void* d3d11Device) {
|
||||
return canInteroperate;
|
||||
}
|
||||
|
||||
} // pal
|
||||
} // namespace pal
|
||||
|
||||
#endif // !ATI_OS_LINUX
|
||||
|
||||
@@ -3,20 +3,20 @@
|
||||
#if defined(ATI_OS_LINUX)
|
||||
namespace pal {
|
||||
bool Device::associateD3D9Device(void* d3dDevice) { return false; }
|
||||
}
|
||||
} // namespace pal
|
||||
#else // !ATI_OS_LINUX
|
||||
|
||||
#include <d3d9.h>
|
||||
#include <dxgi.h>
|
||||
|
||||
/**************************************************************************************************************
|
||||
* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
|
||||
* This means OCL client spec will need to change to include headers directly from the DXX perforce
|
||||
*tree.
|
||||
* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
|
||||
* without notification. So it is safe to use a local copy of the relevant DXX extension interface
|
||||
*classes.
|
||||
**************************************************************************************************************/
|
||||
* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
|
||||
* This means OCL client spec will need to change to include headers directly from the DXX perforce
|
||||
*tree.
|
||||
* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
|
||||
* without notification. So it is safe to use a local copy of the relevant DXX extension interface
|
||||
*classes.
|
||||
**************************************************************************************************************/
|
||||
#include "DxxOpenCLInteropExt.h"
|
||||
|
||||
namespace pal {
|
||||
@@ -44,5 +44,5 @@ bool Device::associateD3D9Device(void* d3d9Device) {
|
||||
return canInteroperate;
|
||||
}
|
||||
|
||||
} // pal
|
||||
} // namespace pal
|
||||
#endif // !ATI_OS_WIN
|
||||
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -32,34 +32,27 @@
|
||||
#include "protocols/rgpServer.h"
|
||||
#include "protocols/driverControlServer.h"
|
||||
|
||||
namespace pal
|
||||
{
|
||||
namespace pal {
|
||||
// ================================================================================================
|
||||
RgpCaptureMgr::RgpCaptureMgr(Pal::IPlatform* platform, const Device& device)
|
||||
:
|
||||
device_(device),
|
||||
dev_driver_server_(platform->GetDevDriverServer()),
|
||||
user_event_(nullptr),
|
||||
num_prep_disp_(0),
|
||||
max_sqtt_disp_(device_.settings().rgpSqttDispCount_),
|
||||
trace_gpu_mem_limit_(0),
|
||||
global_disp_count_(1), // Must start from 1 according to RGP spec
|
||||
trace_enabled_(false),
|
||||
inst_tracing_enabled_(false)
|
||||
{
|
||||
: device_(device),
|
||||
dev_driver_server_(platform->GetDevDriverServer()),
|
||||
user_event_(nullptr),
|
||||
num_prep_disp_(0),
|
||||
max_sqtt_disp_(device_.settings().rgpSqttDispCount_),
|
||||
trace_gpu_mem_limit_(0),
|
||||
global_disp_count_(1), // Must start from 1 according to RGP spec
|
||||
trace_enabled_(false),
|
||||
inst_tracing_enabled_(false) {
|
||||
memset(&trace_, 0, sizeof(trace_));
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
RgpCaptureMgr::~RgpCaptureMgr()
|
||||
{
|
||||
DestroyRGPTracing();
|
||||
}
|
||||
RgpCaptureMgr::~RgpCaptureMgr() { DestroyRGPTracing(); }
|
||||
|
||||
// ================================================================================================
|
||||
// Creates the GPU Open Developer Mode manager class.
|
||||
RgpCaptureMgr* RgpCaptureMgr::Create(Pal::IPlatform* platform, const Device& device)
|
||||
{
|
||||
RgpCaptureMgr* RgpCaptureMgr::Create(Pal::IPlatform* platform, const Device& device) {
|
||||
RgpCaptureMgr* mgr = new RgpCaptureMgr(platform, device);
|
||||
|
||||
if (mgr != nullptr && !mgr->Init(platform)) {
|
||||
@@ -71,8 +64,7 @@ RgpCaptureMgr* RgpCaptureMgr::Create(Pal::IPlatform* platform, const Device& dev
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
|
||||
{
|
||||
bool RgpCaptureMgr::Init(Pal::IPlatform* platform) {
|
||||
if (dev_driver_server_ == nullptr) {
|
||||
return false;
|
||||
}
|
||||
@@ -105,13 +97,11 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
|
||||
|
||||
const uint32_t api_version = settings.oclVersion_;
|
||||
|
||||
trace_.gpa_session_ = new GpuUtil::GpaSession(
|
||||
platform,
|
||||
device_.iDev(),
|
||||
api_version >> 4, // OCL API version major
|
||||
api_version & 0xf, // OCL API version minor
|
||||
RgpSqttInstrumentationSpecVersion,
|
||||
RgpSqttInstrumentationApiVersion);
|
||||
trace_.gpa_session_ = new GpuUtil::GpaSession(platform, device_.iDev(),
|
||||
api_version >> 4, // OCL API version major
|
||||
api_version & 0xf, // OCL API version minor
|
||||
RgpSqttInstrumentationSpecVersion,
|
||||
RgpSqttInstrumentationApiVersion);
|
||||
|
||||
if (trace_.gpa_session_ == nullptr) {
|
||||
result = false;
|
||||
@@ -119,7 +109,7 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
|
||||
}
|
||||
|
||||
// Initialize the GPA session
|
||||
if (result && (trace_.gpa_session_->Init() != Pal::Result::Success)) {
|
||||
if (result && (trace_.gpa_session_->Init() != Pal::Result::Success)) {
|
||||
result = false;
|
||||
}
|
||||
|
||||
@@ -133,9 +123,9 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
|
||||
if (!result) {
|
||||
// If we've failed to initialize tracing, permanently disable traces
|
||||
if (rgp_server_ != nullptr) {
|
||||
rgp_server_->DisableTraces();
|
||||
rgp_server_->DisableTraces();
|
||||
|
||||
trace_enabled_ = false;
|
||||
trace_enabled_ = false;
|
||||
}
|
||||
|
||||
// Clean up if we failed
|
||||
@@ -150,9 +140,8 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
|
||||
// ================================================================================================
|
||||
// This function finds out all the queues in the device that we have to synchronize for RGP-traced
|
||||
// frames and initializes resources for them.
|
||||
bool RgpCaptureMgr::RegisterTimedQueue(
|
||||
uint32_t queue_id, Pal::IQueue* iQueue, bool* debug_vmid) const
|
||||
{
|
||||
bool RgpCaptureMgr::RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue,
|
||||
bool* debug_vmid) const {
|
||||
bool result = true;
|
||||
|
||||
// Get the OS context handle for this queue (this is a thing that RGP needs on DX clients;
|
||||
@@ -166,8 +155,8 @@ bool RgpCaptureMgr::RegisterTimedQueue(
|
||||
*debug_vmid = kernelContextInfo.flags.hasDebugVmid;
|
||||
|
||||
// Register the queue with the GPA session class for timed queue operation support.
|
||||
if (trace_.gpa_session_->RegisterTimedQueue(iQueue, queue_id,
|
||||
kernelContextInfo.contextIdentifier) != Pal::Result::Success) {
|
||||
if (trace_.gpa_session_->RegisterTimedQueue(
|
||||
iQueue, queue_id, kernelContextInfo.contextIdentifier) != Pal::Result::Success) {
|
||||
result = false;
|
||||
}
|
||||
|
||||
@@ -175,11 +164,8 @@ bool RgpCaptureMgr::RegisterTimedQueue(
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
Pal::Result RgpCaptureMgr::TimedQueueSubmit(
|
||||
Pal::IQueue* queue,
|
||||
uint64_t cmdId,
|
||||
const Pal::SubmitInfo& submitInfo) const
|
||||
{
|
||||
Pal::Result RgpCaptureMgr::TimedQueueSubmit(Pal::IQueue* queue, uint64_t cmdId,
|
||||
const Pal::SubmitInfo& submitInfo) const {
|
||||
// Fill in extra meta-data information to associate the API command buffer data with
|
||||
// the generated timing information.
|
||||
GpuUtil::TimedSubmitInfo timedSubmitInfo = {};
|
||||
@@ -205,8 +191,7 @@ Pal::Result RgpCaptureMgr::TimedQueueSubmit(
|
||||
// Called during initial device enumeration prior to calling Pal::IDevice::CommitSettingsAndInit().
|
||||
//
|
||||
// This finalizes the developer driver manager.
|
||||
void RgpCaptureMgr::Finalize()
|
||||
{
|
||||
void RgpCaptureMgr::Finalize() {
|
||||
// Figure out if the gfxip supports tracing. We decide tracing if there is at least one
|
||||
// enumerated GPU that can support tracing. Since we don't yet know if that GPU will be
|
||||
// picked as the target of an eventual VkDevice, this check is imperfect.
|
||||
@@ -215,8 +200,8 @@ void RgpCaptureMgr::Finalize()
|
||||
bool hw_support_tracing = false;
|
||||
|
||||
if ((rgp_server_->EnableTraces() == DevDriver::Result::Success)) {
|
||||
if (GpuSupportsTracing(device_.properties(), device_.settings())) {
|
||||
hw_support_tracing = true;
|
||||
if (GpuSupportsTracing(device_.properties(), device_.settings())) {
|
||||
hw_support_tracing = true;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -234,20 +219,18 @@ void RgpCaptureMgr::Finalize()
|
||||
|
||||
// ================================================================================================
|
||||
// Waits for the driver to be resumed if it's currently paused.
|
||||
void RgpCaptureMgr::WaitForDriverResume()
|
||||
{
|
||||
auto* pDriverControlServer = dev_driver_server_->GetDriverControlServer();
|
||||
void RgpCaptureMgr::WaitForDriverResume() {
|
||||
auto* pDriverControlServer = dev_driver_server_->GetDriverControlServer();
|
||||
|
||||
assert(pDriverControlServer != nullptr);
|
||||
assert(pDriverControlServer != nullptr);
|
||||
|
||||
pDriverControlServer->WaitForDriverResume();
|
||||
pDriverControlServer->WaitForDriverResume();
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
// Called before a swap chain presents. This signals a frame-end boundary and
|
||||
// is used to coordinate RGP trace start/stop.
|
||||
void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu)
|
||||
{
|
||||
void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu) {
|
||||
if (rgp_server_->TracesEnabled()) {
|
||||
// If there's currently a trace running, submit the trace-end command buffer
|
||||
if (trace_.status_ == TraceStatus::Running) {
|
||||
@@ -257,8 +240,7 @@ void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu)
|
||||
Pal::Result res = EndRGPHardwareTrace(gpu);
|
||||
if (Pal::Result::ErrorIncompatibleQueue == res) {
|
||||
// continue until we find the right queue...
|
||||
}
|
||||
else if (Pal::Result::Success == res) {
|
||||
} else if (Pal::Result::Success == res) {
|
||||
trace_.sqtt_disp_count_ = 0;
|
||||
} else {
|
||||
FinishRGPTrace(gpu, true);
|
||||
@@ -272,43 +254,42 @@ void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu)
|
||||
|
||||
// Currently nothing in the PresentInfo struct is used for inserting a timed present marker.
|
||||
GpuUtil::TimedQueuePresentInfo timedPresentInfo = {};
|
||||
//Pal::Result result = trace_.gpa_session_->TimedQueuePresent(pPalQueue, timedPresentInfo);
|
||||
//assert(result == Pal::Result::Success);
|
||||
// Pal::Result result = trace_.gpa_session_->TimedQueuePresent(pPalQueue, timedPresentInfo);
|
||||
// assert(result == Pal::Result::Success);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
Pal::Result RgpCaptureMgr::CheckForTraceResults()
|
||||
{
|
||||
Pal::Result RgpCaptureMgr::CheckForTraceResults() {
|
||||
assert(trace_.status_ == TraceStatus::WaitingForResults);
|
||||
|
||||
Pal::Result result = Pal::Result::NotReady;
|
||||
|
||||
// Check if trace results are ready
|
||||
if (trace_.gpa_session_->IsReady() && // GPA session is ready
|
||||
(trace_.begin_queue_->isDone(&trace_.end_event_))) // "Trace end" cmdbuf has retired
|
||||
if (trace_.gpa_session_->IsReady() && // GPA session is ready
|
||||
(trace_.begin_queue_->isDone(&trace_.end_event_))) // "Trace end" cmdbuf has retired
|
||||
{
|
||||
bool success = false;
|
||||
|
||||
// Fetch required trace data size from GPA session
|
||||
size_t traceDataSize = 0;
|
||||
void* pTraceData = nullptr;
|
||||
void* pTraceData = nullptr;
|
||||
|
||||
trace_.gpa_session_->GetResults(trace_.gpa_sample_id_, &traceDataSize, nullptr);
|
||||
|
||||
// Allocate memory for trace data
|
||||
if (traceDataSize > 0) {
|
||||
pTraceData = amd::AlignedMemory::allocate(traceDataSize, 256);
|
||||
pTraceData = amd::AlignedMemory::allocate(traceDataSize, 256);
|
||||
}
|
||||
|
||||
if (pTraceData != nullptr) {
|
||||
// Get trace data from GPA session
|
||||
if (trace_.gpa_session_->GetResults(trace_.gpa_sample_id_, &traceDataSize, pTraceData) ==
|
||||
Pal::Result::Success) {
|
||||
Pal::Result::Success) {
|
||||
// Transmit trace data to anyone who's listening
|
||||
auto devResult = rgp_server_->WriteTraceData(
|
||||
static_cast<Pal::uint8*>(pTraceData), traceDataSize);
|
||||
auto devResult =
|
||||
rgp_server_->WriteTraceData(static_cast<Pal::uint8*>(pTraceData), traceDataSize);
|
||||
|
||||
success = (devResult == DevDriver::Result::Success);
|
||||
}
|
||||
@@ -317,7 +298,7 @@ Pal::Result RgpCaptureMgr::CheckForTraceResults()
|
||||
}
|
||||
|
||||
if (success) {
|
||||
result = Pal::Result::Success;
|
||||
result = Pal::Result::Success;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -327,9 +308,8 @@ Pal::Result RgpCaptureMgr::CheckForTraceResults()
|
||||
// ================================================================================================
|
||||
// Called after a swap chain presents. This signals a (next) frame-begin boundary and is
|
||||
// used to coordinate RGP trace start/stop.
|
||||
void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
|
||||
size_t x, size_t y, size_t z)
|
||||
{
|
||||
void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, size_t x, size_t y,
|
||||
size_t z) {
|
||||
// Wait for the driver to be resumed in case it's been paused.
|
||||
WaitForDriverResume();
|
||||
|
||||
@@ -347,8 +327,7 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (trace_.status_ == TraceStatus::Preparing) {
|
||||
} else if (trace_.status_ == TraceStatus::Preparing) {
|
||||
// Wait some number of "preparation frames" before starting the trace in order to get enough
|
||||
// timer samples to sync CPU/GPU clock domains.
|
||||
trace_.prepared_disp_count_++;
|
||||
@@ -370,7 +349,7 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
|
||||
// Check if we're ending a trace waiting for SQTT to turn off.
|
||||
// If SQTT has turned off, end the trace
|
||||
else if (trace_.status_ == TraceStatus::WaitingForSqtt) {
|
||||
Pal::Result result = Pal::Result::Success;
|
||||
Pal::Result result = Pal::Result::Success;
|
||||
|
||||
if (trace_.begin_queue_->isDone(&trace_.end_sqtt_event_)) {
|
||||
result = EndRGPTrace(gpu);
|
||||
@@ -401,14 +380,17 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
|
||||
RgpSqttMarkerEventType apiEvent = RgpSqttMarkerEventType::CmdNDRangeKernel;
|
||||
if (kernel.prog().isInternal()) {
|
||||
constexpr RgpSqttMarkerEventType ApiEvents[KernelBlitManager::BlitTotal] = {
|
||||
RgpSqttMarkerEventType::CmdCopyImage, RgpSqttMarkerEventType::CmdCopyImage,
|
||||
RgpSqttMarkerEventType::CmdCopyImageToBuffer,
|
||||
RgpSqttMarkerEventType::CmdCopyBufferToImage,
|
||||
RgpSqttMarkerEventType::CmdCopyBuffer, RgpSqttMarkerEventType::CmdCopyBuffer,
|
||||
RgpSqttMarkerEventType::CmdCopyBuffer, RgpSqttMarkerEventType::CmdCopyBuffer,
|
||||
RgpSqttMarkerEventType::CmdFillBuffer, RgpSqttMarkerEventType::CmdFillImage,
|
||||
RgpSqttMarkerEventType::CmdScheduler
|
||||
};
|
||||
RgpSqttMarkerEventType::CmdCopyImage,
|
||||
RgpSqttMarkerEventType::CmdCopyImage,
|
||||
RgpSqttMarkerEventType::CmdCopyImageToBuffer,
|
||||
RgpSqttMarkerEventType::CmdCopyBufferToImage,
|
||||
RgpSqttMarkerEventType::CmdCopyBuffer,
|
||||
RgpSqttMarkerEventType::CmdCopyBuffer,
|
||||
RgpSqttMarkerEventType::CmdCopyBuffer,
|
||||
RgpSqttMarkerEventType::CmdCopyBuffer,
|
||||
RgpSqttMarkerEventType::CmdFillBuffer,
|
||||
RgpSqttMarkerEventType::CmdFillImage,
|
||||
RgpSqttMarkerEventType::CmdScheduler};
|
||||
for (uint i = 0; i < KernelBlitManager::BlitTotal; ++i) {
|
||||
if (kernel.name().compare(BlitName[i]) == 0) {
|
||||
apiEvent = ApiEvents[i];
|
||||
@@ -418,8 +400,8 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
|
||||
}
|
||||
WriteUserEventMarker(gpu, RgpSqttMarkerUserEventObjectName, kernel.name());
|
||||
// Write dispatch marker
|
||||
WriteEventWithDimsMarker(gpu, apiEvent,
|
||||
static_cast<uint32_t>(x), static_cast<uint32_t>(y), static_cast<uint32_t>(z));
|
||||
WriteEventWithDimsMarker(gpu, apiEvent, static_cast<uint32_t>(x), static_cast<uint32_t>(y),
|
||||
static_cast<uint32_t>(z));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -428,11 +410,11 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
|
||||
|
||||
// ================================================================================================
|
||||
// This function starts preparing for an RGP trace. Preparation involves some N frames of
|
||||
// lead-up time during which timing samples are accumulated to synchronize CPU and GPU clock domains.
|
||||
// lead-up time during which timing samples are accumulated to synchronize CPU and GPU clock
|
||||
// domains.
|
||||
//
|
||||
// This function transitions from the Idle state to the Preparing state.
|
||||
Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu)
|
||||
{
|
||||
Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu) {
|
||||
assert(trace_.status_ == TraceStatus::Idle);
|
||||
|
||||
// We can only trace using a single device at a time currently, so recreate RGP trace
|
||||
@@ -441,32 +423,32 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu)
|
||||
|
||||
const auto traceParameters = rgp_server_->QueryTraceParameters();
|
||||
|
||||
num_prep_disp_ = traceParameters.captureStartIndex;
|
||||
num_prep_disp_ = traceParameters.captureStartIndex;
|
||||
uint32_t capture_disp = traceParameters.captureStopIndex - traceParameters.captureStartIndex;
|
||||
// Validate if the captured dispatches are in the range
|
||||
if ((capture_disp > 0) && (capture_disp < max_sqtt_disp_)) {
|
||||
max_sqtt_disp_ = capture_disp;
|
||||
}
|
||||
|
||||
trace_gpu_mem_limit_ = traceParameters.gpuMemoryLimitInMb * 1024 * 1024;
|
||||
trace_gpu_mem_limit_ = traceParameters.gpuMemoryLimitInMb * 1024 * 1024;
|
||||
inst_tracing_enabled_ = traceParameters.flags.enableInstructionTokens;
|
||||
|
||||
// Notify the RGP server that we are starting a trace
|
||||
if (rgp_server_->BeginTrace() != DevDriver::Result::Success) {
|
||||
result = Pal::Result::ErrorUnknown;
|
||||
result = Pal::Result::ErrorUnknown;
|
||||
}
|
||||
|
||||
// Tell the GPA session class we're starting a trace
|
||||
if (result == Pal::Result::Success) {
|
||||
GpuUtil::GpaSessionBeginInfo info = {};
|
||||
|
||||
info.flags.enableQueueTiming = true;// trace_.queueTimingEnabled;
|
||||
info.flags.enableQueueTiming = true; // trace_.queueTimingEnabled;
|
||||
|
||||
result = trace_.gpa_session_->Begin(info);
|
||||
}
|
||||
|
||||
trace_.prepared_disp_count_ = 0;
|
||||
trace_.sqtt_disp_count_ = 0;
|
||||
trace_.sqtt_disp_count_ = 0;
|
||||
|
||||
// Sample the timing clocks prior to starting a trace.
|
||||
if (result == Pal::Result::Success) {
|
||||
@@ -476,7 +458,7 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu)
|
||||
if (result == Pal::Result::Success) {
|
||||
// Remember which queue started the trace
|
||||
trace_.prepare_queue_ = gpu;
|
||||
trace_.begin_queue_ = nullptr;
|
||||
trace_.begin_queue_ = nullptr;
|
||||
|
||||
trace_.status_ = TraceStatus::Preparing;
|
||||
} else {
|
||||
@@ -497,8 +479,7 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu)
|
||||
// the "begin trace" information command buffer.
|
||||
//
|
||||
// This function transitions from the Preparing state to the Running state.
|
||||
Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu)
|
||||
{
|
||||
Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu) {
|
||||
assert(trace_.status_ == TraceStatus::Preparing);
|
||||
assert(trace_enabled_);
|
||||
|
||||
@@ -526,8 +507,8 @@ Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu)
|
||||
|
||||
// Fill GPU commands
|
||||
gpu->eventBegin(MainEngine);
|
||||
trace_.gpa_sample_id_ = trace_.gpa_session_->BeginSample(
|
||||
gpu->queue(MainEngine).iCmd(), sampleConfig);
|
||||
trace_.gpa_sample_id_ =
|
||||
trace_.gpa_session_->BeginSample(gpu->queue(MainEngine).iCmd(), sampleConfig);
|
||||
gpu->eventEnd(MainEngine, trace_.begin_sqtt_event_);
|
||||
}
|
||||
|
||||
@@ -540,7 +521,7 @@ Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu)
|
||||
|
||||
// Make the trace active and remember which queue started it
|
||||
if (result == Pal::Result::Success) {
|
||||
trace_.status_ = TraceStatus::Running;
|
||||
trace_.status_ = TraceStatus::Running;
|
||||
trace_.begin_queue_ = gpu;
|
||||
}
|
||||
|
||||
@@ -551,8 +532,7 @@ Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu)
|
||||
// This function submits the command buffer to stop SQTT tracing. Full tracing still continues.
|
||||
//
|
||||
// This function transitions from the Running state to the WaitingForSqtt state.
|
||||
Pal::Result RgpCaptureMgr::EndRGPHardwareTrace(VirtualGPU* gpu)
|
||||
{
|
||||
Pal::Result RgpCaptureMgr::EndRGPHardwareTrace(VirtualGPU* gpu) {
|
||||
assert(trace_.status_ == TraceStatus::Running);
|
||||
|
||||
Pal::Result result = Pal::Result::Success;
|
||||
@@ -593,8 +573,7 @@ Pal::Result RgpCaptureMgr::EndRGPHardwareTrace(VirtualGPU* gpu)
|
||||
// This function ends a running RGP trace.
|
||||
//
|
||||
// This function transitions from the WaitingForSqtt state to WaitingForResults state.
|
||||
Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu)
|
||||
{
|
||||
Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu) {
|
||||
assert(trace_.status_ == TraceStatus::WaitingForSqtt);
|
||||
|
||||
Pal::Result result = Pal::Result::Success;
|
||||
@@ -629,8 +608,7 @@ Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu)
|
||||
// ================================================================================================
|
||||
// This function resets and possibly cancels a currently active (between begin/end) RGP trace.
|
||||
// It frees any dependent resources.
|
||||
void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted)
|
||||
{
|
||||
void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted) {
|
||||
if (trace_.prepare_queue_ == nullptr) {
|
||||
return;
|
||||
}
|
||||
@@ -654,26 +632,25 @@ void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted)
|
||||
|
||||
// Reset tracing state to idle
|
||||
trace_.prepared_disp_count_ = 0;
|
||||
trace_.sqtt_disp_count_ = 0;
|
||||
trace_.gpa_sample_id_ = 0;
|
||||
trace_.status_ = TraceStatus::Idle;
|
||||
trace_.prepare_queue_ = nullptr;
|
||||
trace_.begin_queue_ = nullptr;
|
||||
trace_.sqtt_disp_count_ = 0;
|
||||
trace_.gpa_sample_id_ = 0;
|
||||
trace_.status_ = TraceStatus::Idle;
|
||||
trace_.prepare_queue_ = nullptr;
|
||||
trace_.begin_queue_ = nullptr;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
// Destroys device-persistent RGP resources
|
||||
void RgpCaptureMgr::DestroyRGPTracing()
|
||||
{
|
||||
void RgpCaptureMgr::DestroyRGPTracing() {
|
||||
if (trace_.status_ != TraceStatus::Idle) {
|
||||
FinishRGPTrace(nullptr, true);
|
||||
FinishRGPTrace(nullptr, true);
|
||||
}
|
||||
|
||||
delete user_event_;
|
||||
|
||||
// Destroy the GPA session
|
||||
if (trace_.gpa_session_ != nullptr) {
|
||||
//Util::Destructor(trace_.gpa_session_);
|
||||
// Util::Destructor(trace_.gpa_session_);
|
||||
delete trace_.gpa_session_;
|
||||
trace_.gpa_session_ = nullptr;
|
||||
}
|
||||
@@ -683,18 +660,15 @@ void RgpCaptureMgr::DestroyRGPTracing()
|
||||
|
||||
// ================================================================================================
|
||||
// Returns true if the given device properties/settings support tracing.
|
||||
bool RgpCaptureMgr::GpuSupportsTracing(
|
||||
const Pal::DeviceProperties& props,
|
||||
const Settings& settings)
|
||||
{
|
||||
bool RgpCaptureMgr::GpuSupportsTracing(const Pal::DeviceProperties& props,
|
||||
const Settings& settings) {
|
||||
return props.gfxipProperties.flags.supportRgpTraces && !settings.rgpSqttForceDisable_;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
// Called when a new device is created. This will preallocate reusable RGP trace resources
|
||||
// for that device.
|
||||
void RgpCaptureMgr::PostDeviceCreate()
|
||||
{
|
||||
void RgpCaptureMgr::PostDeviceCreate() {
|
||||
amd::ScopedLock traceLock(&trace_mutex_);
|
||||
|
||||
auto* pDriverControlServer = dev_driver_server_->GetDriverControlServer();
|
||||
@@ -714,8 +688,7 @@ void RgpCaptureMgr::PostDeviceCreate()
|
||||
// ================================================================================================
|
||||
// Called prior to a device's being destroyed. This will free persistent RGP trace resources for
|
||||
// that device.
|
||||
void RgpCaptureMgr::PreDeviceDestroy()
|
||||
{
|
||||
void RgpCaptureMgr::PreDeviceDestroy() {
|
||||
amd::ScopedLock traceLock(&trace_mutex_);
|
||||
// If we are idle, we can re-initialize trace resources based on the new device.
|
||||
if (trace_.status_ == TraceStatus::Idle) {
|
||||
@@ -725,9 +698,8 @@ void RgpCaptureMgr::PreDeviceDestroy()
|
||||
|
||||
// ================================================================================================
|
||||
// Sets up an Event marker's basic data.
|
||||
RgpSqttMarkerEvent RgpCaptureMgr::BuildEventMarker(
|
||||
const VirtualGPU* gpu, RgpSqttMarkerEventType api_type) const
|
||||
{
|
||||
RgpSqttMarkerEvent RgpCaptureMgr::BuildEventMarker(const VirtualGPU* gpu,
|
||||
RgpSqttMarkerEventType api_type) const {
|
||||
RgpSqttMarkerEvent marker = {};
|
||||
|
||||
marker.identifier = RgpSqttMarkerIdentifierEvent;
|
||||
@@ -739,24 +711,19 @@ RgpSqttMarkerEvent RgpCaptureMgr::BuildEventMarker(
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void RgpCaptureMgr::WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const
|
||||
{
|
||||
void RgpCaptureMgr::WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const {
|
||||
assert((data_size % sizeof(uint32_t)) == 0);
|
||||
assert((data_size / sizeof(uint32_t)) > 0);
|
||||
|
||||
gpu->queue(MainEngine).iCmd()->CmdInsertRgpTraceMarker(
|
||||
static_cast<uint32_t>(data_size / sizeof(uint32_t)), data);
|
||||
gpu->queue(MainEngine)
|
||||
.iCmd()
|
||||
->CmdInsertRgpTraceMarker(static_cast<uint32_t>(data_size / sizeof(uint32_t)), data);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
// Inserts an RGP pre-dispatch marker
|
||||
void RgpCaptureMgr::WriteEventWithDimsMarker(
|
||||
const VirtualGPU* gpu,
|
||||
RgpSqttMarkerEventType apiType,
|
||||
uint32_t x,
|
||||
uint32_t y,
|
||||
uint32_t z) const
|
||||
{
|
||||
void RgpCaptureMgr::WriteEventWithDimsMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType apiType,
|
||||
uint32_t x, uint32_t y, uint32_t z) const {
|
||||
assert(apiType != RgpSqttMarkerEventType::Invalid);
|
||||
|
||||
RgpSqttMarkerEventWithDims eventWithDims = {};
|
||||
@@ -771,26 +738,24 @@ void RgpCaptureMgr::WriteEventWithDimsMarker(
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void RgpCaptureMgr::WriteBarrierStartMarker(
|
||||
const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const
|
||||
{
|
||||
void RgpCaptureMgr::WriteBarrierStartMarker(const VirtualGPU* gpu,
|
||||
const Pal::Developer::BarrierData& data) const {
|
||||
if (rgp_server_->TracesEnabled() && (trace_.status_ == TraceStatus::Running)) {
|
||||
amd::ScopedLock traceLock(&trace_mutex_);
|
||||
RgpSqttMarkerBarrierStart marker = {};
|
||||
|
||||
marker.identifier = RgpSqttMarkerIdentifierBarrierStart;
|
||||
marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();
|
||||
marker.dword02 = data.reason;
|
||||
marker.internal = true;
|
||||
marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();
|
||||
marker.dword02 = data.reason;
|
||||
marker.internal = true;
|
||||
|
||||
WriteMarker(gpu, &marker, sizeof(marker));
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void RgpCaptureMgr::WriteBarrierEndMarker(
|
||||
const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const
|
||||
{
|
||||
void RgpCaptureMgr::WriteBarrierEndMarker(const VirtualGPU* gpu,
|
||||
const Pal::Developer::BarrierData& data) const {
|
||||
if (rgp_server_->TracesEnabled() && (trace_.status_ == TraceStatus::Running)) {
|
||||
amd::ScopedLock traceLock(&trace_mutex_);
|
||||
// Copy the operations part and include the same data from previous markers
|
||||
@@ -799,28 +764,28 @@ void RgpCaptureMgr::WriteBarrierEndMarker(
|
||||
auto operations = data.operations;
|
||||
|
||||
operations.pipelineStalls.u16All |= 0;
|
||||
operations.caches.u16All |= 0;
|
||||
operations.caches.u16All |= 0;
|
||||
|
||||
RgpSqttMarkerBarrierEnd marker = {};
|
||||
|
||||
marker.identifier = RgpSqttMarkerIdentifierBarrierEnd;
|
||||
marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();
|
||||
marker.identifier = RgpSqttMarkerIdentifierBarrierEnd;
|
||||
marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();
|
||||
|
||||
marker.waitOnEopTs = operations.pipelineStalls.waitOnEopTsBottomOfPipe;
|
||||
marker.vsPartialFlush = operations.pipelineStalls.vsPartialFlush;
|
||||
marker.psPartialFlush = operations.pipelineStalls.psPartialFlush;
|
||||
marker.csPartialFlush = operations.pipelineStalls.csPartialFlush;
|
||||
marker.pfpSyncMe = operations.pipelineStalls.pfpSyncMe;
|
||||
marker.syncCpDma = operations.pipelineStalls.syncCpDma;
|
||||
marker.invalTcp = operations.caches.invalTcp;
|
||||
marker.invalSqI = operations.caches.invalSqI$;
|
||||
marker.invalSqK = operations.caches.invalSqK$;
|
||||
marker.flushTcc = operations.caches.flushTcc;
|
||||
marker.invalTcc = operations.caches.invalTcc;
|
||||
marker.flushCb = operations.caches.flushCb;
|
||||
marker.invalCb = operations.caches.invalCb;
|
||||
marker.flushDb = operations.caches.flushDb;
|
||||
marker.invalDb = operations.caches.invalDb;
|
||||
marker.waitOnEopTs = operations.pipelineStalls.waitOnEopTsBottomOfPipe;
|
||||
marker.vsPartialFlush = operations.pipelineStalls.vsPartialFlush;
|
||||
marker.psPartialFlush = operations.pipelineStalls.psPartialFlush;
|
||||
marker.csPartialFlush = operations.pipelineStalls.csPartialFlush;
|
||||
marker.pfpSyncMe = operations.pipelineStalls.pfpSyncMe;
|
||||
marker.syncCpDma = operations.pipelineStalls.syncCpDma;
|
||||
marker.invalTcp = operations.caches.invalTcp;
|
||||
marker.invalSqI = operations.caches.invalSqI$;
|
||||
marker.invalSqK = operations.caches.invalSqK$;
|
||||
marker.flushTcc = operations.caches.flushTcc;
|
||||
marker.invalTcc = operations.caches.invalTcc;
|
||||
marker.flushCb = operations.caches.flushCb;
|
||||
marker.invalCb = operations.caches.invalCb;
|
||||
marker.flushDb = operations.caches.flushDb;
|
||||
marker.invalDb = operations.caches.invalDb;
|
||||
|
||||
marker.numLayoutTransitions = 0;
|
||||
|
||||
@@ -830,9 +795,9 @@ void RgpCaptureMgr::WriteBarrierEndMarker(
|
||||
|
||||
// ================================================================================================
|
||||
// Inserts a user event string marker
|
||||
void RgpCaptureMgr::WriteUserEventMarker(
|
||||
const VirtualGPU* gpu, RgpSqttMarkerUserEventType eventType, const std::string& name) const
|
||||
{
|
||||
void RgpCaptureMgr::WriteUserEventMarker(const VirtualGPU* gpu,
|
||||
RgpSqttMarkerUserEventType eventType,
|
||||
const std::string& name) const {
|
||||
memset(user_event_, 0, sizeof(RgpSqttMarkerUserEventWithString));
|
||||
|
||||
user_event_->header.identifier = RgpSqttMarkerIdentifierUserEvent;
|
||||
@@ -841,7 +806,8 @@ void RgpCaptureMgr::WriteUserEventMarker(
|
||||
size_t markerSize = sizeof(user_event_->header);
|
||||
|
||||
if ((eventType != RgpSqttMarkerUserEventPop)) {
|
||||
size_t strLength = std::min(name.size(), RgpSqttMaxUserEventStringLengthInDwords * sizeof(uint32_t));
|
||||
size_t strLength =
|
||||
std::min(name.size(), RgpSqttMaxUserEventStringLengthInDwords * sizeof(uint32_t));
|
||||
for (uint32_t charIdx = 0; charIdx < strLength; ++charIdx) {
|
||||
uint32_t c = static_cast<uint32_t>(name[charIdx]);
|
||||
user_event_->stringData[charIdx / 4] |= (c << (8 * (charIdx % 4)));
|
||||
@@ -859,4 +825,4 @@ void RgpCaptureMgr::WriteUserEventMarker(
|
||||
}
|
||||
|
||||
|
||||
}; // namespace vk
|
||||
}; // namespace pal
|
||||
|
||||
@@ -34,42 +34,36 @@
|
||||
#include "gpuopen.h"
|
||||
|
||||
// PAL forward declarations
|
||||
namespace Pal
|
||||
{
|
||||
class ICmdBuffer;
|
||||
class IFence;
|
||||
class IQueueSemaphore;
|
||||
namespace Pal {
|
||||
class ICmdBuffer;
|
||||
class IFence;
|
||||
class IQueueSemaphore;
|
||||
struct PalPublicSettings;
|
||||
}
|
||||
} // namespace Pal
|
||||
|
||||
// GpuUtil forward declarations
|
||||
namespace GpuUtil
|
||||
{
|
||||
namespace GpuUtil {
|
||||
class GpaSession;
|
||||
};
|
||||
|
||||
// GPUOpen forward declarations
|
||||
namespace DevDriver
|
||||
{
|
||||
namespace DevDriver {
|
||||
class DevDriverServer;
|
||||
class IMsgChannel;
|
||||
struct MessageBuffer;
|
||||
|
||||
namespace DriverControlProtocol
|
||||
{
|
||||
namespace DriverControlProtocol {
|
||||
enum struct DeviceClockMode : uint32_t;
|
||||
class HandlerServer;
|
||||
}
|
||||
} // namespace DriverControlProtocol
|
||||
|
||||
namespace SettingsProtocol
|
||||
{
|
||||
namespace SettingsProtocol {
|
||||
class HandlerServer;
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace DevDriver
|
||||
|
||||
namespace pal
|
||||
{
|
||||
namespace pal {
|
||||
class Settings;
|
||||
class Device;
|
||||
class VirtualGPU;
|
||||
@@ -77,8 +71,7 @@ class HSAILKernel;
|
||||
|
||||
// ================================================================================================
|
||||
// RgpSqttMarkerIdentifier - Identifiers for RGP SQ thread-tracing markers (Table 1)
|
||||
enum RgpSqttMarkerIdentifier : uint32_t
|
||||
{
|
||||
enum RgpSqttMarkerIdentifier : uint32_t {
|
||||
RgpSqttMarkerIdentifierEvent = 0x0,
|
||||
RgpSqttMarkerIdentifierCbStart = 0x1,
|
||||
RgpSqttMarkerIdentifierCbEnd = 0x2,
|
||||
@@ -98,8 +91,7 @@ enum RgpSqttMarkerIdentifier : uint32_t
|
||||
};
|
||||
|
||||
// ================================================================================================
|
||||
enum class RgpSqttMarkerEventType : uint32_t
|
||||
{
|
||||
enum class RgpSqttMarkerEventType : uint32_t {
|
||||
CmdNDRangeKernel = 0,
|
||||
CmdScheduler = 1,
|
||||
CmdCopyBuffer = 2,
|
||||
@@ -114,8 +106,7 @@ enum class RgpSqttMarkerEventType : uint32_t
|
||||
};
|
||||
|
||||
// ================================================================================================
|
||||
enum class RgpSqqtBarrierReason : uint32_t
|
||||
{
|
||||
enum class RgpSqqtBarrierReason : uint32_t {
|
||||
Invalid = 0,
|
||||
MemDependency = 0xC0000000,
|
||||
ProfilingControl = 0xC0000001,
|
||||
@@ -125,129 +116,116 @@ enum class RgpSqqtBarrierReason : uint32_t
|
||||
};
|
||||
|
||||
// ================================================================================================
|
||||
// RgpSqttMarkerEvent - "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker.
|
||||
// RgpSqttMarkerEvent - "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker.
|
||||
// These are generated ahead of draws or dispatches for commands that trigger generation of waves
|
||||
// i.e. draws/dispatches (Table 4).
|
||||
struct RgpSqttMarkerEvent
|
||||
{
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
uint32_t identifier : 4; // Identifier for this marker
|
||||
uint32_t extDwords : 3; // Number of extra dwords following this marker
|
||||
uint32_t apiType : 24; // The API type for this command
|
||||
uint32_t hasThreadDims : 1; // Whether thread dimensions are included
|
||||
struct RgpSqttMarkerEvent {
|
||||
union {
|
||||
struct {
|
||||
uint32_t identifier : 4; // Identifier for this marker
|
||||
uint32_t extDwords : 3; // Number of extra dwords following this marker
|
||||
uint32_t apiType : 24; // The API type for this command
|
||||
uint32_t hasThreadDims : 1; // Whether thread dimensions are included
|
||||
};
|
||||
|
||||
uint32_t dword01; // The first dword
|
||||
uint32_t dword01; // The first dword
|
||||
};
|
||||
|
||||
union
|
||||
{
|
||||
// Some information about the vertex/instance/draw register indices. These values are not
|
||||
union {
|
||||
// Some information about the vertex/instance/draw register indices. These values are not
|
||||
// always valid because they are not available for one reason or another:
|
||||
//
|
||||
// - If vertex offset index or instance offset index are not (together) valid, they are both
|
||||
// equal to 0
|
||||
// - If draw index is not valid, it is equal to the vertex offset index
|
||||
struct
|
||||
{
|
||||
uint32_t cbID : 20; // Command buffer ID for this marker
|
||||
struct {
|
||||
uint32_t cbID : 20; // Command buffer ID for this marker
|
||||
uint32_t vertexOffsetRegIdx : 4; // SPI userdata register index for the first vertex offset
|
||||
uint32_t instanceOffsetRegIdx : 4; // SPI userdata register index for the first instance offset
|
||||
uint32_t drawIndexRegIdx : 4; // SPI userdata register index for the draw index (multi draw indirect)
|
||||
uint32_t
|
||||
instanceOffsetRegIdx : 4; // SPI userdata register index for the first instance offset
|
||||
uint32_t drawIndexRegIdx : 4; // SPI userdata register index for the draw index (multi draw
|
||||
// indirect)
|
||||
};
|
||||
uint32_t dword02; // The second dword
|
||||
uint32_t dword02; // The second dword
|
||||
};
|
||||
|
||||
union
|
||||
{
|
||||
uint32_t cmdID; // Command index within the command buffer
|
||||
uint32_t dword03; // The third dword
|
||||
union {
|
||||
uint32_t cmdID; // Command index within the command buffer
|
||||
uint32_t dword03; // The third dword
|
||||
};
|
||||
};
|
||||
|
||||
// ================================================================================================
|
||||
// RgpSqttMarkerEventWithDims - Per-dispatch specific marker where workgroup dims are included
|
||||
struct RgpSqttMarkerEventWithDims
|
||||
{
|
||||
RgpSqttMarkerEvent event; // Per-draw/dispatch marker. API type should be Dispatch, threadDim = 1
|
||||
uint32_t threadX; // Work group count in X
|
||||
uint32_t threadY; // Work group count in Y
|
||||
uint32_t threadZ; // Work group count in Z
|
||||
struct RgpSqttMarkerEventWithDims {
|
||||
RgpSqttMarkerEvent
|
||||
event; // Per-draw/dispatch marker. API type should be Dispatch, threadDim = 1
|
||||
uint32_t threadX; // Work group count in X
|
||||
uint32_t threadY; // Work group count in Y
|
||||
uint32_t threadZ; // Work group count in Z
|
||||
};
|
||||
|
||||
// ================================================================================================
|
||||
// RgpSqttMarkerBarrierStart - "Barrier Start" RGP SQTT instrumentation marker (Table 5)
|
||||
struct RgpSqttMarkerBarrierStart
|
||||
{
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
struct RgpSqttMarkerBarrierStart {
|
||||
union {
|
||||
struct {
|
||||
uint32_t identifier : 4; // Identifier for this marker
|
||||
uint32_t extDwords : 3; // Number of extra dwords following this marker
|
||||
uint32_t cbId : 20; // Command buffer ID within queue
|
||||
uint32_t reserved : 5; // Reserved
|
||||
};
|
||||
|
||||
uint32_t dword01; // The first dword
|
||||
uint32_t dword01; // The first dword
|
||||
};
|
||||
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
union {
|
||||
struct {
|
||||
uint32_t driverReason : 31;
|
||||
uint32_t internal: 1;
|
||||
uint32_t internal : 1;
|
||||
};
|
||||
|
||||
uint32_t dword02; // The second dword
|
||||
uint32_t dword02; // The second dword
|
||||
};
|
||||
};
|
||||
|
||||
// ================================================================================================
|
||||
// RgpSqttMarkerBarrierEnd - "Barrier End" RGP SQTT instrumentation marker (Table 6)
|
||||
struct RgpSqttMarkerBarrierEnd
|
||||
{
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
uint32_t identifier : 4; // Identifier for this marker
|
||||
uint32_t extDwords : 3; // Number of extra dwords following this marker
|
||||
uint32_t cbId : 20; // Command buffer ID within queue
|
||||
uint32_t waitOnEopTs : 1; // Issued EOP_TS VGT event followed by a WAIT_REG_MEM for that timestamp
|
||||
// to be written. Quintessential full pipeline stall.
|
||||
struct RgpSqttMarkerBarrierEnd {
|
||||
union {
|
||||
struct {
|
||||
uint32_t identifier : 4; // Identifier for this marker
|
||||
uint32_t extDwords : 3; // Number of extra dwords following this marker
|
||||
uint32_t cbId : 20; // Command buffer ID within queue
|
||||
uint32_t waitOnEopTs : 1; // Issued EOP_TS VGT event followed by a WAIT_REG_MEM for that
|
||||
// timestamp to be written. Quintessential full pipeline stall.
|
||||
uint32_t vsPartialFlush : 1; // Stall at ME waiting for all prior VS waves to complete.
|
||||
uint32_t psPartialFlush : 1; // Stall at ME waiting for all prior PS waves to complete.
|
||||
uint32_t csPartialFlush : 1; // Stall at ME waiting for all prior CS waves to complete.
|
||||
uint32_t pfpSyncMe : 1; // Stall PFP until ME is at same point in command stream.
|
||||
uint32_t pfpSyncMe : 1; // Stall PFP until ME is at same point in command stream.
|
||||
};
|
||||
|
||||
uint32_t dword01; // The first dword
|
||||
uint32_t dword01; // The first dword
|
||||
};
|
||||
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
uint32_t syncCpDma : 1; // Issue dummy CP-DMA command to confirm all prior CP-DMAs have completed.
|
||||
union {
|
||||
struct {
|
||||
uint32_t
|
||||
syncCpDma : 1; // Issue dummy CP-DMA command to confirm all prior CP-DMAs have completed.
|
||||
uint32_t invalTcp : 1; // Invalidate the L1 vector caches.
|
||||
uint32_t invalSqI : 1; // Invalidate the SQ instruction caches
|
||||
uint32_t invalSqK : 1; // Invalidate the SQ constant caches (i.e. L1 scalar caches)
|
||||
uint32_t flushTcc : 1; // Flush L2
|
||||
uint32_t invalTcc : 1; // Invalidate L2
|
||||
uint32_t flushCb : 1; // Flush CB caches (including DCC, cmask, fmask)
|
||||
uint32_t invalCb : 1; // Invalidate CB caches (including DCC, cmask, fmask)
|
||||
uint32_t flushDb : 1; // Flush DB caches (including htile)
|
||||
uint32_t invalDb : 1; // Invalidate DB caches (including htile)
|
||||
uint32_t numLayoutTransitions : 16; // Number of layout transitions following this packet
|
||||
uint32_t reserved : 6; // Reserved for future expansion. Always 0
|
||||
uint32_t flushCb : 1; // Flush CB caches (including DCC, cmask, fmask)
|
||||
uint32_t invalCb : 1; // Invalidate CB caches (including DCC, cmask, fmask)
|
||||
uint32_t flushDb : 1; // Flush DB caches (including htile)
|
||||
uint32_t invalDb : 1; // Invalidate DB caches (including htile)
|
||||
uint32_t numLayoutTransitions : 16; // Number of layout transitions following this packet
|
||||
uint32_t reserved : 6; // Reserved for future expansion. Always 0
|
||||
};
|
||||
|
||||
uint32_t dword02; // The second dword
|
||||
uint32_t dword02; // The second dword
|
||||
};
|
||||
};
|
||||
|
||||
@@ -255,33 +233,31 @@ struct RgpSqttMarkerBarrierEnd
|
||||
constexpr uint32_t RgpSqttInstrumentationSpecVersion = 1;
|
||||
|
||||
// RGP SQTT Instrumentation Specification version for Vulkan-specific tables
|
||||
constexpr uint32_t RgpSqttInstrumentationApiVersion = 0;
|
||||
constexpr uint32_t RgpSqttInstrumentationApiVersion = 0;
|
||||
|
||||
// RgpSqttMarkeUserEventDataType - Data types used in RGP SQ thread-tracing markers for an user event
|
||||
enum RgpSqttMarkerUserEventType : uint32_t
|
||||
{
|
||||
RgpSqttMarkerUserEventTrigger = 0x0,
|
||||
RgpSqttMarkerUserEventPop = 0x1,
|
||||
RgpSqttMarkerUserEventPush = 0x2,
|
||||
RgpSqttMarkerUserEventObjectName = 0x3,
|
||||
RgpSqttMarkerUserEventReserved1 = 0x4,
|
||||
RgpSqttMarkerUserEventReserved2 = 0x5,
|
||||
RgpSqttMarkerUserEventReserved3 = 0x6,
|
||||
RgpSqttMarkerUserEventReserved4 = 0x7,
|
||||
// RgpSqttMarkeUserEventDataType - Data types used in RGP SQ thread-tracing markers for an user
|
||||
// event
|
||||
enum RgpSqttMarkerUserEventType : uint32_t {
|
||||
RgpSqttMarkerUserEventTrigger = 0x0,
|
||||
RgpSqttMarkerUserEventPop = 0x1,
|
||||
RgpSqttMarkerUserEventPush = 0x2,
|
||||
RgpSqttMarkerUserEventObjectName = 0x3,
|
||||
RgpSqttMarkerUserEventReserved1 = 0x4,
|
||||
RgpSqttMarkerUserEventReserved2 = 0x5,
|
||||
RgpSqttMarkerUserEventReserved3 = 0x6,
|
||||
RgpSqttMarkerUserEventReserved4 = 0x7,
|
||||
};
|
||||
|
||||
// RgpSqttMarkerUserEvent - RGP SQ thread-tracing marker for an user event.
|
||||
union RgpSqttMarkerUserEvent
|
||||
{
|
||||
struct
|
||||
{
|
||||
uint32_t identifier : 4; // Identifier for this marker
|
||||
uint32_t extDwords : 8; // Number of extra dwords following this marker
|
||||
uint32_t dataType : 8; // The type for this marker
|
||||
uint32_t reserved : 12; // reserved
|
||||
};
|
||||
union RgpSqttMarkerUserEvent {
|
||||
struct {
|
||||
uint32_t identifier : 4; // Identifier for this marker
|
||||
uint32_t extDwords : 8; // Number of extra dwords following this marker
|
||||
uint32_t dataType : 8; // The type for this marker
|
||||
uint32_t reserved : 12; // reserved
|
||||
};
|
||||
|
||||
uint32_t dword01; // The first dword
|
||||
uint32_t dword01; // The first dword
|
||||
};
|
||||
|
||||
constexpr uint32_t RgpSqttMarkerUserEventWordCount = 1;
|
||||
@@ -289,21 +265,20 @@ constexpr uint32_t RgpSqttMarkerUserEventWordCount = 1;
|
||||
// The max lengths of frame marker strings
|
||||
static constexpr size_t RgpSqttMaxUserEventStringLengthInDwords = 1024;
|
||||
|
||||
// RgpSqttMarkerUserEvent - RGP SQ thread-tracing marker for an user event with a string (push and trigger data types)
|
||||
struct RgpSqttMarkerUserEventWithString
|
||||
{
|
||||
RgpSqttMarkerUserEvent header;
|
||||
// RgpSqttMarkerUserEvent - RGP SQ thread-tracing marker for an user event with a string (push and
|
||||
// trigger data types)
|
||||
struct RgpSqttMarkerUserEventWithString {
|
||||
RgpSqttMarkerUserEvent header;
|
||||
|
||||
uint32_t stringLength; // Length of the string (in characters)
|
||||
uint32_t stringData[RgpSqttMaxUserEventStringLengthInDwords]; // String data in UTF-8 format
|
||||
uint32_t stringLength; // Length of the string (in characters)
|
||||
uint32_t stringData[RgpSqttMaxUserEventStringLengthInDwords]; // String data in UTF-8 format
|
||||
};
|
||||
|
||||
// ================================================================================================
|
||||
// This class provides functionality to interact with the GPU Open Developer Mode message passing
|
||||
// service and the rest of the driver.
|
||||
class RgpCaptureMgr
|
||||
{
|
||||
public:
|
||||
class RgpCaptureMgr {
|
||||
public:
|
||||
~RgpCaptureMgr();
|
||||
|
||||
static RgpCaptureMgr* Create(Pal::IPlatform* platform, const Device& device);
|
||||
@@ -321,45 +296,42 @@ public:
|
||||
|
||||
bool IsQueueTimingActive() const;
|
||||
|
||||
void WriteBarrierStartMarker(
|
||||
const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const;
|
||||
void WriteBarrierEndMarker(
|
||||
const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const;
|
||||
void WriteBarrierStartMarker(const VirtualGPU* gpu,
|
||||
const Pal::Developer::BarrierData& data) const;
|
||||
void WriteBarrierEndMarker(const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const;
|
||||
bool RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue, bool* debug_vmid) const;
|
||||
Pal::Result TimedQueueSubmit(
|
||||
Pal::IQueue* queue, uint64_t cmdId, const Pal::SubmitInfo& submitInfo) const;
|
||||
Pal::Result TimedQueueSubmit(Pal::IQueue* queue, uint64_t cmdId,
|
||||
const Pal::SubmitInfo& submitInfo) const;
|
||||
|
||||
private:
|
||||
private:
|
||||
// Steps that an RGP trace goes through
|
||||
enum class TraceStatus
|
||||
{
|
||||
Idle = 0, // No active trace and none requested
|
||||
Preparing, // A trace has been requested but is not active yet because we are
|
||||
// currently sampling timing information over some number of lead frames.
|
||||
Running, // SQTT and queue timing is currently active for all command buffer submits.
|
||||
WaitingForSqtt,
|
||||
WaitingForResults // Tracing is no longer active, but all results are not yet ready.
|
||||
enum class TraceStatus {
|
||||
Idle = 0, // No active trace and none requested
|
||||
Preparing, // A trace has been requested but is not active yet because we are
|
||||
// currently sampling timing information over some number of lead frames.
|
||||
Running, // SQTT and queue timing is currently active for all command buffer submits.
|
||||
WaitingForSqtt,
|
||||
WaitingForResults // Tracing is no longer active, but all results are not yet ready.
|
||||
};
|
||||
|
||||
// All per-device state to support RGP tracing
|
||||
struct TraceState
|
||||
{
|
||||
TraceStatus status_; // Current trace status (idle, running, etc.)
|
||||
struct TraceState {
|
||||
TraceStatus status_; // Current trace status (idle, running, etc.)
|
||||
|
||||
GpuEvent begin_sqtt_event_; // Event that is signaled when a trace-end cmdbuf retires
|
||||
GpuEvent end_sqtt_event_; // Event that is signaled when a trace-end cmdbuf retires
|
||||
GpuEvent end_event_; // Event that is signaled when a trace-end cmdbuf retires
|
||||
GpuEvent begin_sqtt_event_; // Event that is signaled when a trace-end cmdbuf retires
|
||||
GpuEvent end_sqtt_event_; // Event that is signaled when a trace-end cmdbuf retires
|
||||
GpuEvent end_event_; // Event that is signaled when a trace-end cmdbuf retires
|
||||
|
||||
VirtualGPU* prepare_queue_; // The queue that triggered the full start of a trace
|
||||
VirtualGPU* begin_queue_; // The queue that triggered starting SQTT
|
||||
VirtualGPU* prepare_queue_; // The queue that triggered the full start of a trace
|
||||
VirtualGPU* begin_queue_; // The queue that triggered starting SQTT
|
||||
|
||||
GpuUtil::GpaSession* gpa_session_; // GPA session helper object for building RGP data
|
||||
uint32_t gpa_sample_id_; // Sample ID associated with the current trace
|
||||
bool queue_timing_; // Queue timing is enabled
|
||||
GpuUtil::GpaSession* gpa_session_; // GPA session helper object for building RGP data
|
||||
uint32_t gpa_sample_id_; // Sample ID associated with the current trace
|
||||
bool queue_timing_; // Queue timing is enabled
|
||||
|
||||
uint32_t prepared_disp_count_; // Number of dispatches counted while preparing for a trace
|
||||
uint32_t sqtt_disp_count_; // Number of dispatches counted while SQTT tracing is active
|
||||
mutable uint32_t current_event_id_; // Current event ID
|
||||
uint32_t prepared_disp_count_; // Number of dispatches counted while preparing for a trace
|
||||
uint32_t sqtt_disp_count_; // Number of dispatches counted while SQTT tracing is active
|
||||
mutable uint32_t current_event_id_; // Current event ID
|
||||
};
|
||||
|
||||
RgpCaptureMgr(Pal::IPlatform* platform, const Device& device);
|
||||
@@ -374,25 +346,25 @@ private:
|
||||
static bool GpuSupportsTracing(const Pal::DeviceProperties& props, const Settings& settings);
|
||||
RgpSqttMarkerEvent BuildEventMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType api_type) const;
|
||||
void WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const;
|
||||
void WriteEventWithDimsMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType apiType,
|
||||
uint32_t x, uint32_t y, uint32_t z) const;
|
||||
void WriteEventWithDimsMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType apiType, uint32_t x,
|
||||
uint32_t y, uint32_t z) const;
|
||||
void WriteUserEventMarker(const VirtualGPU* gpu, RgpSqttMarkerUserEventType eventType,
|
||||
const std::string& name) const;
|
||||
const std::string& name) const;
|
||||
|
||||
const Device& device_;
|
||||
const Device& device_;
|
||||
DevDriver::DevDriverServer* dev_driver_server_;
|
||||
DevDriver::RGPProtocol::RGPServer* rgp_server_;
|
||||
mutable amd::Monitor trace_mutex_;
|
||||
TraceState trace_;
|
||||
mutable amd::Monitor trace_mutex_;
|
||||
TraceState trace_;
|
||||
RgpSqttMarkerUserEventWithString* user_event_;
|
||||
|
||||
uint32_t num_prep_disp_;
|
||||
uint32_t max_sqtt_disp_; // Maximum number of the dispatches allowed in the trace
|
||||
uint32_t trace_gpu_mem_limit_;
|
||||
uint32_t global_disp_count_;
|
||||
uint32_t num_prep_disp_;
|
||||
uint32_t max_sqtt_disp_; // Maximum number of the dispatches allowed in the trace
|
||||
uint32_t trace_gpu_mem_limit_;
|
||||
uint32_t global_disp_count_;
|
||||
|
||||
bool trace_enabled_; // True if tracing is currently enabled (master flag)
|
||||
bool inst_tracing_enabled_; // Enable instruction-level SQTT tokens
|
||||
bool trace_enabled_; // True if tracing is currently enabled (master flag)
|
||||
bool inst_tracing_enabled_; // Enable instruction-level SQTT tokens
|
||||
|
||||
PAL_DISALLOW_DEFAULT_CTOR(RgpCaptureMgr);
|
||||
PAL_DISALLOW_COPY_AND_ASSIGN(RgpCaptureMgr);
|
||||
@@ -400,11 +372,9 @@ private:
|
||||
|
||||
// ================================================================================================
|
||||
// Returns true if queue operations are currently being timed by RGP traces.
|
||||
inline bool RgpCaptureMgr::IsQueueTimingActive() const
|
||||
{
|
||||
inline bool RgpCaptureMgr::IsQueueTimingActive() const {
|
||||
return (trace_.queue_timing_ &&
|
||||
(trace_.status_ == TraceStatus::Running ||
|
||||
trace_.status_ == TraceStatus::Preparing ||
|
||||
(trace_.status_ == TraceStatus::Running || trace_.status_ == TraceStatus::Preparing ||
|
||||
trace_.status_ == TraceStatus::WaitingForSqtt));
|
||||
}
|
||||
};
|
||||
}; // namespace pal
|
||||
|
||||
@@ -27,11 +27,9 @@ typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD;
|
||||
namespace pal {
|
||||
|
||||
void HSAILKernel::setWorkGroupInfo(const uint32_t privateSegmentSize,
|
||||
const uint32_t groupSegmentSize,
|
||||
const uint16_t numSGPRs,
|
||||
const uint32_t groupSegmentSize, const uint16_t numSGPRs,
|
||||
const uint16_t numVGPRs) {
|
||||
workGroupInfo_.scratchRegs_ =
|
||||
amd::alignUp(privateSegmentSize, 16) / sizeof(uint);
|
||||
workGroupInfo_.scratchRegs_ = amd::alignUp(privateSegmentSize, 16) / sizeof(uint);
|
||||
workGroupInfo_.privateMemSize_ = privateSegmentSize;
|
||||
workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = groupSegmentSize;
|
||||
workGroupInfo_.usedSGPRs_ = numSGPRs;
|
||||
@@ -63,13 +61,13 @@ bool HSAILKernel::setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t
|
||||
}
|
||||
|
||||
// Copy code object of this kernel from the program CPU segment
|
||||
memcpy(akc, reinterpret_cast<void*>(prog().findHostKernelAddress(code_)), sizeof(amd_kernel_code_t));
|
||||
memcpy(akc, reinterpret_cast<void*>(prog().findHostKernelAddress(code_)),
|
||||
sizeof(amd_kernel_code_t));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
|
||||
|
||||
amd_kernel_code_t* akc = &akc_;
|
||||
|
||||
if (!setKernelCode(sym, akc)) {
|
||||
@@ -77,18 +75,16 @@ bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
|
||||
}
|
||||
|
||||
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,
|
||||
reinterpret_cast<void*>(&codeSize_))) {
|
||||
reinterpret_cast<void*>(&codeSize_))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Setup the the workgroup info
|
||||
setWorkGroupInfo(akc->workitem_private_segment_byte_size,
|
||||
akc->workgroup_group_segment_byte_size,
|
||||
akc->wavefront_sgpr_count,
|
||||
akc->workitem_vgpr_count);
|
||||
// Setup the the workgroup info
|
||||
setWorkGroupInfo(akc->workitem_private_segment_byte_size, akc->workgroup_group_segment_byte_size,
|
||||
akc->wavefront_sgpr_count, akc->workitem_vgpr_count);
|
||||
|
||||
workgroupGroupSegmentByteSize_ = workGroupInfo_.usedLDSSize_;
|
||||
kernargSegmentByteSize_ = akc->kernarg_segment_byte_size;
|
||||
kernargSegmentByteSize_ = akc->kernarg_segment_byte_size;
|
||||
spillSegmentByteSize_ = amd::alignUp(workGroupInfo_.privateMemSize_, sizeof(uint32_t));
|
||||
|
||||
return true;
|
||||
@@ -102,16 +98,14 @@ HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compi
|
||||
codeSize_(0),
|
||||
workgroupGroupSegmentByteSize_(0),
|
||||
kernargSegmentByteSize_(0),
|
||||
spillSegmentByteSize_(0)
|
||||
{
|
||||
spillSegmentByteSize_(0) {
|
||||
flags_.hsa_ = true;
|
||||
}
|
||||
|
||||
HSAILKernel::~HSAILKernel() {
|
||||
}
|
||||
HSAILKernel::~HSAILKernel() {}
|
||||
|
||||
bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
acl_error error = ACL_SUCCESS;
|
||||
std::string openClKernelName = openclMangledName(name());
|
||||
flags_.internalKernel_ =
|
||||
@@ -274,12 +268,14 @@ const HSAILProgram& HSAILKernel::prog() const {
|
||||
return reinterpret_cast<const HSAILProgram&>(prog_);
|
||||
}
|
||||
|
||||
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
|
||||
VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
|
||||
const_address parameters, size_t ldsAddress, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
|
||||
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel,
|
||||
const amd::NDRangeContainer& sizes,
|
||||
const_address parameters,
|
||||
size_t ldsAddress, uint64_t vmDefQueue,
|
||||
uint64_t* vmParentWrap) const {
|
||||
uint64_t argList;
|
||||
address aqlArgBuf = gpu.managedBuffer().reserve(
|
||||
argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t), &argList);
|
||||
argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t), &argList);
|
||||
gpu.addVmMemory(gpu.managedBuffer().activeMemory());
|
||||
|
||||
if (dynamicParallelism()) {
|
||||
@@ -307,8 +303,8 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
|
||||
break;
|
||||
case amd::KernelParameterDescriptor::HiddenGlobalOffsetY:
|
||||
if (sizes.dimensions() >= 2) {
|
||||
offset = sizes.offset()[1];
|
||||
WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
|
||||
offset = sizes.offset()[1];
|
||||
WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
|
||||
}
|
||||
break;
|
||||
case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ:
|
||||
@@ -322,8 +318,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
|
||||
// and printf buffer was allocated
|
||||
(gpu.printfDbgHSA().dbgBuffer() != nullptr)) {
|
||||
// and set the fourth argument as the printf_buffer pointer
|
||||
size_t bufferPtr = static_cast<size_t>(gpu.printfDbgHSA().
|
||||
dbgBuffer()->vmAddress());
|
||||
size_t bufferPtr = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
|
||||
gpu.addVmMemory(gpu.printfDbgHSA().dbgBuffer());
|
||||
WriteAqlArgAt(const_cast<address>(parameters), &bufferPtr, it.size_, it.offset_);
|
||||
}
|
||||
@@ -346,11 +341,11 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
|
||||
// Note: In a case of structs the size won't match,
|
||||
// since HSAIL compiler expects a reference...
|
||||
assert(argsBufferSize() <= signature.paramsSize() &&
|
||||
"A mismatch of sizes of arguments between compiler and runtime!");
|
||||
"A mismatch of sizes of arguments between compiler and runtime!");
|
||||
|
||||
//hsa_kernel_dispatch_packet_t disp;
|
||||
hsa_kernel_dispatch_packet_t* hsaDisp = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
|
||||
gpu.cb(0)->SysMemCopy());
|
||||
// hsa_kernel_dispatch_packet_t disp;
|
||||
hsa_kernel_dispatch_packet_t* hsaDisp =
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(gpu.cb(0)->SysMemCopy());
|
||||
|
||||
amd::NDRange local(sizes.local());
|
||||
const amd::NDRange& global = sizes.global();
|
||||
@@ -359,10 +354,10 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
|
||||
FindLocalWorkSize(sizes.dimensions(), sizes.global(), local);
|
||||
|
||||
constexpr uint16_t kDispatchPacketHeader =
|
||||
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
||||
(1 << HSA_PACKET_HEADER_BARRIER) |
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
|
||||
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
|
||||
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
||||
(1 << HSA_PACKET_HEADER_BARRIER) |
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
|
||||
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
|
||||
|
||||
hsaDisp->header = kDispatchPacketHeader;
|
||||
hsaDisp->setup = sizes.dimensions();
|
||||
@@ -387,7 +382,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
|
||||
memcpy(aqlArgBuf + argsBufferSize(), hsaDisp, sizeof(hsa_kernel_dispatch_packet_t));
|
||||
|
||||
if (AMD_HSA_BITS_GET(akc_.kernel_code_properties,
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
|
||||
gpu.addVmMemory(gpu.hsaQueueMem());
|
||||
}
|
||||
|
||||
@@ -407,7 +402,7 @@ static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const s
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
||||
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
bool LightningKernel::init() {
|
||||
@@ -419,7 +414,7 @@ bool LightningKernel::init() {
|
||||
return false;
|
||||
}
|
||||
|
||||
KernelMD kernelMD;
|
||||
KernelMD kernelMD;
|
||||
if (!GetAttrCodePropMetadata(*kernelMetaNode, &kernelMD)) {
|
||||
return false;
|
||||
}
|
||||
@@ -427,8 +422,8 @@ bool LightningKernel::init() {
|
||||
symbolName_ = (codeObjectVer() == 2) ? name() : kernelMD.mSymbolName;
|
||||
|
||||
workgroupGroupSegmentByteSize_ = kernelMD.mCodeProps.mGroupSegmentFixedSize;
|
||||
spillSegmentByteSize_ = amd::alignUp(kernelMD.mCodeProps.mPrivateSegmentFixedSize,
|
||||
sizeof(uint32_t));
|
||||
spillSegmentByteSize_ =
|
||||
amd::alignUp(kernelMD.mCodeProps.mPrivateSegmentFixedSize, sizeof(uint32_t));
|
||||
kernargSegmentByteSize_ = kernelMD.mCodeProps.mKernargSegmentSize;
|
||||
|
||||
// Copy codeobject of this kernel from the program CPU segment
|
||||
@@ -451,7 +446,7 @@ bool LightningKernel::init() {
|
||||
|
||||
// Get the runtime handle symbol GPU address
|
||||
rth_symbol = prog().GetSymbol(const_cast<char*>(kernelMD.mAttrs.mRuntimeHandle.c_str()),
|
||||
const_cast<hsa_agent_t*>(&agent));
|
||||
const_cast<hsa_agent_t*>(&agent));
|
||||
uint64_t symbol_address;
|
||||
rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);
|
||||
|
||||
@@ -461,19 +456,14 @@ bool LightningKernel::init() {
|
||||
uint64_t kernel_object = gpuAqlCode();
|
||||
VirtualGPU* gpu = codeSegGpu.dev().xferQueue();
|
||||
|
||||
const struct RuntimeHandle runtime_handle = {
|
||||
gpuAqlCode(),
|
||||
spillSegSize(),
|
||||
ldsSize()
|
||||
};
|
||||
const struct RuntimeHandle runtime_handle = {gpuAqlCode(), spillSegSize(), ldsSize()};
|
||||
|
||||
codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true);
|
||||
}
|
||||
|
||||
// Setup the the workgroup info
|
||||
setWorkGroupInfo(kernelMD.mCodeProps.mPrivateSegmentFixedSize,
|
||||
kernelMD.mCodeProps.mGroupSegmentFixedSize,
|
||||
kernelMD.mCodeProps.mNumSGPRs,
|
||||
kernelMD.mCodeProps.mGroupSegmentFixedSize, kernelMD.mCodeProps.mNumSGPRs,
|
||||
kernelMD.mCodeProps.mNumVGPRs);
|
||||
|
||||
// Copy wavefront size
|
||||
@@ -499,10 +489,10 @@ bool LightningKernel::init() {
|
||||
|
||||
return true;
|
||||
}
|
||||
#endif // defined(USE_COMGR_LIBRARY)
|
||||
#endif // defined(USE_COMGR_LIBRARY)
|
||||
|
||||
bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
|
||||
#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
|
||||
#if defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY)
|
||||
flags_.internalKernel_ =
|
||||
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
|
||||
|
||||
@@ -545,7 +535,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
|
||||
|
||||
// Get the runtime handle symbol GPU address
|
||||
rth_symbol = prog().GetSymbol(const_cast<char*>(kernelMD->mAttrs.mRuntimeHandle.c_str()),
|
||||
const_cast<hsa_agent_t*>(&agent));
|
||||
const_cast<hsa_agent_t*>(&agent));
|
||||
uint64_t symbol_address;
|
||||
rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);
|
||||
|
||||
@@ -554,11 +544,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
|
||||
uint64_t offset = symbol_address - codeSegGpu.vmAddress();
|
||||
VirtualGPU* gpu = codeSegGpu.dev().xferQueue();
|
||||
|
||||
const struct RuntimeHandle runtime_handle = {
|
||||
gpuAqlCode(),
|
||||
spillSegSize(),
|
||||
ldsSize()
|
||||
};
|
||||
const struct RuntimeHandle runtime_handle = {gpuAqlCode(), spillSegSize(), ldsSize()};
|
||||
|
||||
codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true);
|
||||
}
|
||||
@@ -584,7 +570,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
|
||||
|
||||
waveLimiter_.enable();
|
||||
*/
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -20,14 +20,14 @@ namespace amd {
|
||||
namespace hsa {
|
||||
namespace loader {
|
||||
class Symbol;
|
||||
} // loader
|
||||
} // namespace loader
|
||||
namespace code {
|
||||
namespace Kernel {
|
||||
class Metadata;
|
||||
} // Kernel
|
||||
} // code
|
||||
} // hsa
|
||||
} // amd
|
||||
} // namespace Kernel
|
||||
} // namespace code
|
||||
} // namespace hsa
|
||||
} // namespace amd
|
||||
|
||||
//! \namespace pal PAL Device Implementation
|
||||
namespace pal {
|
||||
@@ -43,7 +43,6 @@ class LightningProgram;
|
||||
*/
|
||||
class HSAILKernel : public device::Kernel {
|
||||
public:
|
||||
|
||||
HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions);
|
||||
|
||||
virtual ~HSAILKernel();
|
||||
@@ -106,21 +105,19 @@ class HSAILKernel : public device::Kernel {
|
||||
bool setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc);
|
||||
|
||||
//! Set up the workgroup info based on the kernel metadata
|
||||
void setWorkGroupInfo(const uint32_t privateSegmentSize,
|
||||
const uint32_t groupSegmentSize,
|
||||
const uint16_t numSGPRs,
|
||||
const uint16_t numVGPRs);
|
||||
void setWorkGroupInfo(const uint32_t privateSegmentSize, const uint32_t groupSegmentSize,
|
||||
const uint16_t numSGPRs, const uint16_t numVGPRs);
|
||||
|
||||
std::string compileOptions_; //!< compile used for finalizing this kernel
|
||||
amd_kernel_code_t akc_; //!< AQL kernel code on CPU
|
||||
uint index_; //!< Kernel index in the program
|
||||
std::string compileOptions_; //!< compile used for finalizing this kernel
|
||||
amd_kernel_code_t akc_; //!< AQL kernel code on CPU
|
||||
uint index_; //!< Kernel index in the program
|
||||
|
||||
uint64_t code_; //!< GPU memory pointer to the kernel
|
||||
size_t codeSize_; //!< Size of ISA code
|
||||
uint64_t code_; //!< GPU memory pointer to the kernel
|
||||
size_t codeSize_; //!< Size of ISA code
|
||||
|
||||
uint32_t workgroupGroupSegmentByteSize_; //!< LDS size used in the kernel
|
||||
uint32_t kernargSegmentByteSize_; //!< Size of kernel argument buffer
|
||||
uint32_t spillSegmentByteSize_; //!< Spill reg size per workitem
|
||||
uint32_t workgroupGroupSegmentByteSize_; //!< LDS size used in the kernel
|
||||
uint32_t kernargSegmentByteSize_; //!< Size of kernel argument buffer
|
||||
uint32_t spillSegmentByteSize_; //!< Spill reg size per workitem
|
||||
};
|
||||
|
||||
class LightningKernel : public HSAILKernel {
|
||||
@@ -140,4 +137,5 @@ class LightningKernel : public HSAILKernel {
|
||||
#endif
|
||||
};
|
||||
|
||||
/*@}*/} // namespace pal
|
||||
/*@}*/ // namespace pal
|
||||
} // namespace pal
|
||||
|
||||
@@ -23,27 +23,21 @@
|
||||
namespace pal {
|
||||
|
||||
Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t size)
|
||||
: device::Memory(owner), Resource(gpuDev, size)
|
||||
, pinnedMemory_(nullptr)
|
||||
, parent_(nullptr) {
|
||||
|
||||
: device::Memory(owner), Resource(gpuDev, size), pinnedMemory_(nullptr), parent_(nullptr) {
|
||||
if (owner.parent() != nullptr) {
|
||||
flags_ |= SubMemoryObject;
|
||||
}
|
||||
}
|
||||
|
||||
Memory::Memory(const Device& gpuDev, size_t size)
|
||||
: device::Memory(size), Resource(gpuDev, size)
|
||||
, pinnedMemory_(nullptr)
|
||||
, parent_(nullptr) {
|
||||
}
|
||||
: device::Memory(size), Resource(gpuDev, size), pinnedMemory_(nullptr), parent_(nullptr) {}
|
||||
|
||||
Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t width, size_t height, size_t depth,
|
||||
cl_image_format format, cl_mem_object_type imageType, uint mipLevels)
|
||||
: device::Memory(owner), Resource(gpuDev, width, height, depth, format, imageType, mipLevels)
|
||||
, pinnedMemory_(nullptr)
|
||||
, parent_(nullptr) {
|
||||
|
||||
: device::Memory(owner),
|
||||
Resource(gpuDev, width, height, depth, format, imageType, mipLevels),
|
||||
pinnedMemory_(nullptr),
|
||||
parent_(nullptr) {
|
||||
if (owner.parent() != nullptr) {
|
||||
flags_ |= SubMemoryObject;
|
||||
}
|
||||
@@ -51,10 +45,10 @@ Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t width, size_t he
|
||||
|
||||
Memory::Memory(const Device& gpuDev, size_t size, size_t width, size_t height, size_t depth,
|
||||
cl_image_format format, cl_mem_object_type imageType, uint mipLevels)
|
||||
: device::Memory(size), Resource(gpuDev, width, height, depth, format, imageType, mipLevels)
|
||||
, pinnedMemory_(nullptr)
|
||||
, parent_(nullptr) {
|
||||
}
|
||||
: device::Memory(size),
|
||||
Resource(gpuDev, width, height, depth, format, imageType, mipLevels),
|
||||
pinnedMemory_(nullptr),
|
||||
parent_(nullptr) {}
|
||||
|
||||
#ifdef _WIN32
|
||||
static HANDLE getSharedHandle(IUnknown* pIface) {
|
||||
@@ -130,7 +124,7 @@ bool Memory::create(Resource::MemoryType memType, Resource::CreateParams* params
|
||||
break;
|
||||
case Resource::Remote:
|
||||
case Resource::RemoteUSWC:
|
||||
if ((!desc().tiled_) && (desc().dimSize_ != 3)) {
|
||||
if ((!desc().tiled_) && (desc().dimSize_ != 3)) {
|
||||
// Marks memory object for direct GPU access to the host memory
|
||||
flags_ |= HostMemoryDirectAccess;
|
||||
}
|
||||
@@ -402,7 +396,7 @@ Memory::~Memory() {
|
||||
(memoryType() != Resource::ExternalPhysical)) {
|
||||
// Unmap memory if direct access was requested
|
||||
// Note: runtime will perform unmap on the actual resource destruction
|
||||
//unmap(nullptr);
|
||||
// unmap(nullptr);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -32,12 +32,12 @@ class Memory : public device::Memory, public Resource {
|
||||
Memory(const Device& gpuDev, //!< GPU device object
|
||||
amd::Memory& owner, //!< Abstraction layer memory object
|
||||
size_t size //!< Memory size for allocation
|
||||
);
|
||||
);
|
||||
|
||||
//! Constructor (nonfat version for local scratch mem use without heap block)
|
||||
Memory(const Device& gpuDev, //!< GPU device object
|
||||
size_t size //!< Memory size for allocation
|
||||
);
|
||||
);
|
||||
|
||||
//! Constructor memory for images (without global heap allocation)
|
||||
Memory(const Device& gpuDev, //!< GPU device object
|
||||
@@ -48,7 +48,7 @@ class Memory : public device::Memory, public Resource {
|
||||
cl_image_format format, //!< Memory format
|
||||
cl_mem_object_type imageType, //!< CL image type
|
||||
uint mipLevels //!< The number of mip levels
|
||||
);
|
||||
);
|
||||
|
||||
//! Constructor memory for images (without global heap allocation)
|
||||
Memory(const Device& gpuDev, //!< GPU device object
|
||||
@@ -59,7 +59,7 @@ class Memory : public device::Memory, public Resource {
|
||||
cl_image_format format, //!< Memory format
|
||||
cl_mem_object_type imageType, //!< CL image type
|
||||
uint mipLevels //!< The number of mip levels
|
||||
);
|
||||
);
|
||||
|
||||
//! Default destructor
|
||||
~Memory();
|
||||
@@ -70,7 +70,7 @@ class Memory : public device::Memory, public Resource {
|
||||
//! Overloads the resource create method
|
||||
virtual bool create(Resource::MemoryType memType, //!< Memory type
|
||||
Resource::CreateParams* params = NULL //!< Parameters for create
|
||||
);
|
||||
);
|
||||
|
||||
//! Allocate memory for API-level maps
|
||||
virtual void* allocMapTarget(const amd::Coord3D& origin, //!< The map location in memory
|
||||
@@ -78,12 +78,12 @@ class Memory : public device::Memory, public Resource {
|
||||
uint mapFlags, //!< Map flags
|
||||
size_t* rowPitch = NULL, //!< Row pitch for the mapped memory
|
||||
size_t* slicePitch = NULL //!< Slice for the mapped memory
|
||||
);
|
||||
);
|
||||
|
||||
//! Pins system memory associated with this memory object
|
||||
virtual bool pinSystemMemory(void* hostPtr, //!< System memory address
|
||||
size_t size //!< Size of allocated system memory
|
||||
);
|
||||
);
|
||||
|
||||
//! Releases indirect map surface
|
||||
virtual void releaseIndirectMap() { decIndMapCount(); }
|
||||
@@ -96,15 +96,15 @@ class Memory : public device::Memory, public Resource {
|
||||
uint numLayers = 0, //!< End layer for multilayer map
|
||||
size_t* rowPitch = NULL, //!< Row pitch for the device memory
|
||||
size_t* slicePitch = NULL //!< Slice pitch for the device memory
|
||||
);
|
||||
);
|
||||
|
||||
//! Unmap the device memory
|
||||
virtual void cpuUnmap(device::VirtualDevice& vDev //!< Virtual device for unmap operation
|
||||
);
|
||||
);
|
||||
|
||||
//! Updates device memory from the owner's host allocation
|
||||
void syncCacheFromHost(VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
//! Synchronization flags
|
||||
//! Synchronization flags
|
||||
device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags());
|
||||
|
||||
//! Updates the owner's host allocation from device memory
|
||||
@@ -115,11 +115,13 @@ class Memory : public device::Memory, public Resource {
|
||||
//! Creates a view from current resource
|
||||
virtual Memory* createBufferView(
|
||||
amd::Memory& subBufferOwner //!< The abstraction layer subbuf owner
|
||||
);
|
||||
);
|
||||
|
||||
virtual uint64_t virtualAddress() const override { return vmAddress(); }
|
||||
|
||||
virtual const address cpuSrd() const { return reinterpret_cast<const address>(const_cast<void*>(hwState())); }
|
||||
virtual const address cpuSrd() const {
|
||||
return reinterpret_cast<const address>(const_cast<void*>(hwState()));
|
||||
}
|
||||
|
||||
//! Allocates host memory for synchronization with MGPU context
|
||||
void mgpuCacheWriteBack();
|
||||
@@ -161,8 +163,8 @@ class Memory : public device::Memory, public Resource {
|
||||
//! Disable operator=
|
||||
Memory& operator=(const Memory&);
|
||||
|
||||
Memory* pinnedMemory_; //!< Memory used as pinned system memory
|
||||
const Memory* parent_; //!< Parent memory object
|
||||
Memory* pinnedMemory_; //!< Memory used as pinned system memory
|
||||
const Memory* parent_; //!< Parent memory object
|
||||
};
|
||||
|
||||
class Buffer : public pal::Memory {
|
||||
@@ -219,7 +221,7 @@ class Image : public pal::Memory {
|
||||
uint mapFlags, //!< Map flags
|
||||
size_t* rowPitch = NULL, //!< Row pitch for the mapped memory
|
||||
size_t* slicePitch = NULL //!< Slice for the mapped memory
|
||||
);
|
||||
);
|
||||
|
||||
virtual uint64_t virtualAddress() const override { return hwSrd(); }
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
#ifndef isinf
|
||||
#ifdef _MSC_VER
|
||||
#define isinf(X) (!_finite(X) && !_isnan(X))
|
||||
#else //!_MSC_VER
|
||||
#else //!_MSC_VER
|
||||
#define isinf(X) (std::isinf(X))
|
||||
#endif //!_MSC_VER
|
||||
#endif // isinf
|
||||
@@ -19,7 +19,7 @@
|
||||
#ifndef isnan
|
||||
#ifdef _MSC_VER
|
||||
#define isnan(X) (_isnan(X))
|
||||
#else //!_MSC_VER
|
||||
#else //!_MSC_VER
|
||||
#define isnan(X) (std::isnan(X))
|
||||
#endif //!_MSC_VER
|
||||
#endif // isnan
|
||||
@@ -55,14 +55,14 @@ class PrintfDbg : public amd::HeapObject {
|
||||
bool init(VirtualGPU& gpu, //!< Virtual GPU object
|
||||
bool printfEnabled, //!< checks for printf
|
||||
const amd::NDRange& size //!< Kernel's workload
|
||||
);
|
||||
);
|
||||
|
||||
//! Prints the kernel's debug information from the buffer
|
||||
bool output(VirtualGPU& gpu, //!< Virtual GPU object
|
||||
bool printfEnabled, //!< checks for printf
|
||||
const amd::NDRange& size, //!< Kernel's workload
|
||||
bool output(VirtualGPU& gpu, //!< Virtual GPU object
|
||||
bool printfEnabled, //!< checks for printf
|
||||
const amd::NDRange& size, //!< Kernel's workload
|
||||
const std::vector<device::PrintfInfo>& printfInfo //!< printf info
|
||||
);
|
||||
);
|
||||
|
||||
//! Debug buffer size per workitem
|
||||
size_t wiDbgSize() const { return wiDbgSize_; }
|
||||
@@ -81,7 +81,7 @@ class PrintfDbg : public amd::HeapObject {
|
||||
|
||||
//! Allocates the debug buffer
|
||||
bool allocate(bool realloc = false //!< If TRUE then reallocate the debug memory
|
||||
);
|
||||
);
|
||||
|
||||
//! Returns TRUE if a float value has to be printed
|
||||
bool checkFloat(const std::string& fmt //!< Format string
|
||||
@@ -105,9 +105,9 @@ class PrintfDbg : public amd::HeapObject {
|
||||
) const;
|
||||
|
||||
//! Displays the PrintfDbg
|
||||
void outputDbgBuffer(const device::PrintfInfo& info,//!< printf info
|
||||
const uint32_t* workitemData, //!< The PrintfDbg dump buffer
|
||||
size_t& i //!< index to the data in the buffer
|
||||
void outputDbgBuffer(const device::PrintfInfo& info, //!< printf info
|
||||
const uint32_t* workitemData, //!< The PrintfDbg dump buffer
|
||||
size_t& i //!< index to the data in the buffer
|
||||
) const;
|
||||
|
||||
private:
|
||||
@@ -127,7 +127,7 @@ class PrintfDbg : public amd::HeapObject {
|
||||
uint32_t* mapWorkitem(VirtualGPU& gpu, //!< Virtual GPU object
|
||||
size_t idx, //!< Workitem global index
|
||||
bool* realloc //!< Returns TRUE if workitem reached the buffer limit
|
||||
);
|
||||
);
|
||||
|
||||
//! Unmap the staged buffer
|
||||
void unmapWorkitem(VirtualGPU& gpu, //!< Virtual GPU object
|
||||
@@ -145,13 +145,13 @@ class PrintfDbgHSA : public PrintfDbg {
|
||||
//! Initializes the debug buffer before kernel's execution
|
||||
bool init(VirtualGPU& gpu, //!< Virtual GPU object
|
||||
bool printfEnabled //!< checks for printf
|
||||
);
|
||||
);
|
||||
|
||||
//! Prints the kernel's debug information from the buffer
|
||||
bool output(VirtualGPU& gpu, //!< Virtual GPU object
|
||||
bool printfEnabled, //!< checks for printf
|
||||
bool output(VirtualGPU& gpu, //!< Virtual GPU object
|
||||
bool printfEnabled, //!< checks for printf
|
||||
const std::vector<device::PrintfInfo>& printfInfo //!< printf info
|
||||
);
|
||||
);
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
@@ -161,4 +161,5 @@ class PrintfDbgHSA : public PrintfDbg {
|
||||
PrintfDbgHSA& operator=(const PrintfDbgHSA&);
|
||||
};
|
||||
|
||||
/*@}*/} // namespace pal
|
||||
/*@}*/ // namespace pal
|
||||
} // namespace pal
|
||||
|
||||
@@ -65,10 +65,10 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t
|
||||
align = amd::alignUp(align, sizeof(uint32_t));
|
||||
|
||||
amd::Memory* amd_mem_obj = new (prog.dev().context())
|
||||
amd::Buffer(prog.dev().context(), 0, amd::alignUp(size, align),
|
||||
// HIP requires SVM allocation for segment code due to possible global variable access and
|
||||
// global variables are a part of code segment with the latest loader
|
||||
amd::IS_HIP ? reinterpret_cast<void*>(1) : nullptr);
|
||||
amd::Buffer(prog.dev().context(), 0, amd::alignUp(size, align),
|
||||
// HIP requires SVM allocation for segment code due to possible global variable
|
||||
// access and global variables are a part of code segment with the latest loader
|
||||
amd::IS_HIP ? reinterpret_cast<void*>(1) : nullptr);
|
||||
|
||||
if (amd_mem_obj == nullptr) {
|
||||
LogError("[OCL] failed to create a mem object!");
|
||||
@@ -103,9 +103,9 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t
|
||||
|
||||
if (zero && !prog.isInternal()) {
|
||||
uint64_t pattern = 0;
|
||||
size_t patternSize = ((size % sizeof(pattern)) == 0) ? sizeof(pattern) : 1;
|
||||
prog.dev().xferMgr().fillBuffer(*gpuAccess_, &pattern, patternSize,
|
||||
amd::Coord3D(0), amd::Coord3D(size));
|
||||
size_t patternSize = ((size % sizeof(pattern)) == 0) ? sizeof(pattern) : 1;
|
||||
prog.dev().xferMgr().fillBuffer(*gpuAccess_, &pattern, patternSize, amd::Coord3D(0),
|
||||
amd::Coord3D(size));
|
||||
}
|
||||
|
||||
switch (segment) {
|
||||
@@ -237,7 +237,7 @@ inline static std::vector<std::string> splitSpaceSeparatedString(char* str) {
|
||||
}
|
||||
|
||||
bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_t binSize) {
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
// ACL_TYPE_CG stage is not performed for offline compilation
|
||||
hsa_agent_t agent;
|
||||
agent.handle = 1;
|
||||
@@ -262,8 +262,8 @@ bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_
|
||||
}
|
||||
|
||||
size_t kernelNamesSize = 0;
|
||||
acl_error errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES,
|
||||
nullptr, nullptr, &kernelNamesSize);
|
||||
acl_error errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr,
|
||||
nullptr, &kernelNamesSize);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
buildLog_ += "Error: Querying of kernel names size from the binary failed.\n";
|
||||
return false;
|
||||
@@ -274,11 +274,11 @@ bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_
|
||||
&kernelNamesSize);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
buildLog_ += "Error: Querying of kernel names from the binary failed.\n";
|
||||
delete [] kernelNames;
|
||||
delete[] kernelNames;
|
||||
return false;
|
||||
}
|
||||
std::vector<std::string> vKernels = splitSpaceSeparatedString(kernelNames);
|
||||
delete [] kernelNames;
|
||||
delete[] kernelNames;
|
||||
bool dynamicParallelism = false;
|
||||
for (const auto& it : vKernels) {
|
||||
std::string kernelName(it);
|
||||
@@ -338,12 +338,10 @@ bool HSAILProgram::allocKernelTable() {
|
||||
return true;
|
||||
}
|
||||
|
||||
void HSAILProgram::fillResListWithKernels(VirtualGPU& gpu) const {
|
||||
gpu.addVmMemory(&codeSegGpu());
|
||||
}
|
||||
void HSAILProgram::fillResListWithKernels(VirtualGPU& gpu) const { gpu.addVmMemory(&codeSegGpu()); }
|
||||
|
||||
const aclTargetInfo& HSAILProgram::info(const char* str) {
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
acl_error err;
|
||||
std::string arch = "hsail";
|
||||
if (dev().settings().use64BitPtr_) {
|
||||
@@ -359,7 +357,7 @@ const aclTargetInfo& HSAILProgram::info(const char* str) {
|
||||
}
|
||||
|
||||
bool HSAILProgram::saveBinaryAndSetType(type_t type) {
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
// Write binary to memory
|
||||
if (rawBinary_ != nullptr) {
|
||||
// Free memory containing rawBinary
|
||||
@@ -378,8 +376,8 @@ bool HSAILProgram::saveBinaryAndSetType(type_t type) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_pptr,
|
||||
size_t* bytes, const char* global_name) const {
|
||||
bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_pptr, size_t* bytes,
|
||||
const char* global_name) const {
|
||||
uint32_t length = 0;
|
||||
size_t offset = 0;
|
||||
uint32_t flags = 0;
|
||||
@@ -456,7 +454,7 @@ bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_p
|
||||
}
|
||||
|
||||
/* Retrieve the Offset from global pal::Memory created @ segment::alloc */
|
||||
if(!codeSegment_->gpuAddressOffset(reinterpret_cast<uint64_t>(*device_pptr), &offset)) {
|
||||
if (!codeSegment_->gpuAddressOffset(reinterpret_cast<uint64_t>(*device_pptr), &offset)) {
|
||||
buildLog_ += "Error: Cannot Retrieve the Address Offset";
|
||||
buildLog_ += "\n";
|
||||
return false;
|
||||
@@ -484,13 +482,12 @@ bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_p
|
||||
|
||||
hsa_isa_t PALHSALoaderContext::IsaFromName(const char* name) {
|
||||
hsa_isa_t isa = {0};
|
||||
uint32_t gfxip = 0;
|
||||
uint32_t gfxip = 0;
|
||||
std::string gfx_target(name);
|
||||
if (gfx_target.find("amdgcn-") == 0) {
|
||||
std::string gfxip_version_str = gfx_target.substr(gfx_target.find("gfx") + 3);
|
||||
gfxip = std::atoi(gfxip_version_str.c_str());
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
// FIXME: Old way. To be removed.
|
||||
uint32_t shift = 1;
|
||||
size_t last = gfx_target.length();
|
||||
@@ -508,9 +505,9 @@ hsa_isa_t PALHSALoaderContext::IsaFromName(const char* name) {
|
||||
}
|
||||
|
||||
bool PALHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) {
|
||||
uint32_t gfxipVersion = program_->dev().settings().useLightning_ ?
|
||||
program_->dev().hwInfo()->gfxipVersionLC_ :
|
||||
program_->dev().hwInfo()->gfxipVersion_;
|
||||
uint32_t gfxipVersion = program_->dev().settings().useLightning_
|
||||
? program_->dev().hwInfo()->gfxipVersionLC_
|
||||
: program_->dev().hwInfo()->gfxipVersion_;
|
||||
uint32_t majorSrc = gfxipVersion / 10;
|
||||
uint32_t minorSrc = gfxipVersion % 10;
|
||||
|
||||
@@ -519,11 +516,9 @@ bool PALHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa)
|
||||
|
||||
if (majorSrc != majorTrg) {
|
||||
return false;
|
||||
}
|
||||
else if (minorTrg == minorSrc) {
|
||||
} else if (minorTrg == minorSrc) {
|
||||
return true;
|
||||
}
|
||||
else if (minorTrg < minorSrc) {
|
||||
} else if (minorTrg < minorSrc) {
|
||||
LogWarning("ISA downgrade for execution!");
|
||||
return true;
|
||||
}
|
||||
@@ -708,7 +703,7 @@ static hsa_status_t GetKernelNamesCallback(hsa_executable_t hExec, hsa_executabl
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
||||
|
||||
bool LightningProgram::createBinary(amd::option::Options* options) {
|
||||
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
||||
@@ -716,7 +711,7 @@ bool LightningProgram::createBinary(amd::option::Options* options) {
|
||||
LogError("Failed to create ELF binary image!");
|
||||
return false;
|
||||
}
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -752,10 +747,10 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
|
||||
}
|
||||
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
for (const auto &kernelMeta : kernelMetadataMap_) {
|
||||
for (const auto& kernelMeta : kernelMetadataMap_) {
|
||||
auto kernelName = kernelMeta.first;
|
||||
auto kernel = new LightningKernel(kernelName, this,
|
||||
options->origOptionStr + ProcessOptions(options));
|
||||
auto kernel =
|
||||
new LightningKernel(kernelName, this, options->origOptionStr + ProcessOptions(options));
|
||||
kernels()[kernelName] = kernel;
|
||||
|
||||
if (!kernel->init()) {
|
||||
@@ -804,9 +799,9 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
|
||||
maxScratchRegs_ =
|
||||
std::max(static_cast<uint>(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
|
||||
}
|
||||
#endif // defined(USE_COMGR_LIBRARY)
|
||||
#endif // defined(USE_COMGR_LIBRARY)
|
||||
DestroySegmentCpuAccess();
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -9,15 +9,15 @@
|
||||
namespace amd {
|
||||
namespace option {
|
||||
class Options;
|
||||
} // option
|
||||
} // namespace option
|
||||
namespace hsa {
|
||||
namespace loader {
|
||||
class Loader;
|
||||
class Executable;
|
||||
class Context;
|
||||
} // loader
|
||||
} // hsa
|
||||
} // amd
|
||||
} // namespace loader
|
||||
} // namespace hsa
|
||||
} // namespace amd
|
||||
|
||||
//! \namespace pal PAL Device Implementation
|
||||
namespace pal {
|
||||
@@ -50,15 +50,16 @@ class Segment : public amd::HeapObject {
|
||||
bool gpuAddressOffset(uint64_t offAddr, size_t* offset);
|
||||
|
||||
//! Returns address for CPU access in the segment
|
||||
void* cpuAddress(size_t offset) const
|
||||
{ return ((cpuAccess_ != nullptr) ? cpuAccess_->data() : cpuMem_) + offset; }
|
||||
void* cpuAddress(size_t offset) const {
|
||||
return ((cpuAccess_ != nullptr) ? cpuAccess_->data() : cpuMem_) + offset;
|
||||
}
|
||||
|
||||
void DestroyCpuAccess();
|
||||
|
||||
private:
|
||||
Memory* gpuAccess_; //!< GPU memory for segment access
|
||||
Memory* cpuAccess_; //!< CPU memory for segment (backing store)
|
||||
address cpuMem_; //!< CPU memory for segment without GPU direct access (backing store)
|
||||
Memory* gpuAccess_; //!< GPU memory for segment access
|
||||
Memory* cpuAccess_; //!< CPU memory for segment (backing store)
|
||||
address cpuMem_; //!< CPU memory for segment without GPU direct access (backing store)
|
||||
};
|
||||
|
||||
class PALHSALoaderContext final : public Context {
|
||||
@@ -166,7 +167,7 @@ class HSAILProgram : public device::Program {
|
||||
}
|
||||
|
||||
//! Get symbol by name
|
||||
amd::hsa::loader::Symbol* GetSymbol(const char* symbol_name, const hsa_agent_t *agent) const {
|
||||
amd::hsa::loader::Symbol* GetSymbol(const char* symbol_name, const hsa_agent_t* agent) const {
|
||||
return executable_->GetSymbol(symbol_name, agent);
|
||||
}
|
||||
|
||||
@@ -180,11 +181,14 @@ class HSAILProgram : public device::Program {
|
||||
virtual bool setKernels(amd::option::Options* options, void* binary, size_t binSize) override;
|
||||
|
||||
//! Destroys CPU allocations in the code segment
|
||||
void DestroySegmentCpuAccess() const
|
||||
{ if (codeSegment_ != nullptr) { codeSegment_->DestroyCpuAccess(); } }
|
||||
void DestroySegmentCpuAccess() const {
|
||||
if (codeSegment_ != nullptr) {
|
||||
codeSegment_->DestroyCpuAccess();
|
||||
}
|
||||
}
|
||||
|
||||
virtual bool createGlobalVarObj(amd::Memory** amd_mem_obj, void** dptr,
|
||||
size_t* bytes, const char* globalName) const;
|
||||
virtual bool createGlobalVarObj(amd::Memory** amd_mem_obj, void** dptr, size_t* bytes,
|
||||
const char* globalName) const;
|
||||
|
||||
private:
|
||||
//! Disable default copy constructor
|
||||
@@ -201,7 +205,7 @@ class HSAILProgram : public device::Program {
|
||||
std::vector<Memory*> globalStores_; //!< Global memory for the program
|
||||
Memory* kernels_; //!< Table with kernel object pointers
|
||||
Memory* codeSegGpu_; //!< GPU memory with code objects
|
||||
Segment* codeSegment_; //!< Pointer to the code segment for this program
|
||||
Segment* codeSegment_; //!< Pointer to the code segment for this program
|
||||
uint
|
||||
maxScratchRegs_; //!< Maximum number of scratch regs used in the program by individual kernel
|
||||
std::list<Sampler*> staticSamplers_; //!< List of internal static samplers
|
||||
@@ -214,19 +218,17 @@ class HSAILProgram : public device::Program {
|
||||
//! \class Lightning Compiler Program
|
||||
class LightningProgram : public HSAILProgram {
|
||||
public:
|
||||
LightningProgram(NullDevice& device)
|
||||
: HSAILProgram(device) {
|
||||
isLC_ = true;
|
||||
xnackEnabled_ = dev().hwInfo()->xnackEnabled_;
|
||||
machineTarget_ = dev().hwInfo()->machineTargetLC_;
|
||||
}
|
||||
LightningProgram(NullDevice& device) : HSAILProgram(device) {
|
||||
isLC_ = true;
|
||||
xnackEnabled_ = dev().hwInfo()->xnackEnabled_;
|
||||
machineTarget_ = dev().hwInfo()->machineTargetLC_;
|
||||
}
|
||||
|
||||
LightningProgram(Device& device)
|
||||
: HSAILProgram(device) {
|
||||
isLC_ = true;
|
||||
xnackEnabled_ = dev().hwInfo()->xnackEnabled_;
|
||||
machineTarget_ = dev().hwInfo()->machineTargetLC_;
|
||||
}
|
||||
LightningProgram(Device& device) : HSAILProgram(device) {
|
||||
isLC_ = true;
|
||||
xnackEnabled_ = dev().hwInfo()->xnackEnabled_;
|
||||
machineTarget_ = dev().hwInfo()->machineTargetLC_;
|
||||
}
|
||||
virtual ~LightningProgram() {}
|
||||
|
||||
protected:
|
||||
@@ -235,4 +237,5 @@ class LightningProgram : public HSAILProgram {
|
||||
virtual bool createBinary(amd::option::Options* options) override;
|
||||
};
|
||||
|
||||
/*@}*/} // namespace pal
|
||||
/*@}*/ // namespace pal
|
||||
} // namespace pal
|
||||
|
||||
@@ -41,8 +41,8 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
|
||||
if (memRef != nullptr) {
|
||||
result = dev.iDev()->CreateGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_);
|
||||
if ((result != Pal::Result::Success) &&
|
||||
// Free cache if PAL failed allocation
|
||||
dev.resourceCache().free()) {
|
||||
// Free cache if PAL failed allocation
|
||||
dev.resourceCache().free()) {
|
||||
// If cache was freed, then try to allocate again
|
||||
result = dev.iDev()->CreateGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_);
|
||||
}
|
||||
@@ -154,8 +154,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
|
||||
|
||||
// ================================================================================================
|
||||
GpuMemoryReference::GpuMemoryReference(const Device& dev)
|
||||
: gpuMem_(nullptr), cpuAddress_(nullptr), device_(dev), gpu_(nullptr)
|
||||
{}
|
||||
: gpuMem_(nullptr), cpuAddress_(nullptr), device_(dev), gpu_(nullptr) {}
|
||||
|
||||
// ================================================================================================
|
||||
GpuMemoryReference::~GpuMemoryReference() {
|
||||
@@ -181,8 +180,7 @@ GpuMemoryReference::~GpuMemoryReference() {
|
||||
iMem()->Unmap();
|
||||
}
|
||||
if (0 != iMem()) {
|
||||
if (!(iMem()->Desc().flags.isShared ||
|
||||
iMem()->Desc().flags.isExternal ||
|
||||
if (!(iMem()->Desc().flags.isShared || iMem()->Desc().flags.isExternal ||
|
||||
iMem()->Desc().flags.isExternPhys)) {
|
||||
// Update free memory size counters
|
||||
device_.updateAllocedMemory(iMem()->Desc().preferredHeap, iMem()->Desc().size, true);
|
||||
@@ -368,7 +366,7 @@ void Resource::memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo) {
|
||||
case Persistent:
|
||||
createInfo->heapCount = 2;
|
||||
createInfo->heaps[0] = Pal::GpuHeapLocal;
|
||||
createInfo->heaps[1] = Pal:: GpuHeapGartUswc;
|
||||
createInfo->heaps[1] = Pal::GpuHeapGartUswc;
|
||||
#ifdef ATI_OS_LINUX
|
||||
// Note: SSG in Linux requires DGMA heap
|
||||
if (dev().properties().gpuMemoryProperties.busAddressableMemSize > 0) {
|
||||
@@ -401,11 +399,10 @@ void Resource::memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool Resource::CreateImage(CreateParams* params)
|
||||
{
|
||||
bool Resource::CreateImage(CreateParams* params) {
|
||||
Pal::Result result;
|
||||
Pal::SubresId ImgSubresId = { Pal::ImageAspect::Color, 0, 0 };
|
||||
Pal::SubresRange ImgSubresRange = { ImgSubresId, 1, 1 };
|
||||
Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, 0, 0};
|
||||
Pal::SubresRange ImgSubresRange = {ImgSubresId, 1, 1};
|
||||
Pal::ChannelMapping channels;
|
||||
Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels);
|
||||
|
||||
@@ -417,8 +414,7 @@ bool Resource::CreateImage(CreateParams* params)
|
||||
memRef_->retain();
|
||||
desc_.cardMemory_ = viewOwner_->desc().cardMemory_;
|
||||
offset_ += viewOwner_->offset_;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
Pal::GpuMemoryCreateInfo createInfo = {};
|
||||
createInfo.size = desc().width_ * elementSize();
|
||||
createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment);
|
||||
@@ -427,8 +423,8 @@ bool Resource::CreateImage(CreateParams* params)
|
||||
createInfo.priority = Pal::GpuMemPriority::Normal;
|
||||
memTypeToHeap(&createInfo);
|
||||
// createInfo.priority;
|
||||
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
|
||||
createInfo.alignment, nullptr, &subOffset_);
|
||||
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
|
||||
nullptr, &subOffset_);
|
||||
if (nullptr == memRef_) {
|
||||
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
|
||||
if (nullptr == memRef_) {
|
||||
@@ -477,16 +473,16 @@ bool Resource::CreateImage(CreateParams* params)
|
||||
imgCreateInfo.arraySize = 1;
|
||||
|
||||
switch (desc_.topology_) {
|
||||
case CL_MEM_OBJECT_IMAGE3D:
|
||||
imgCreateInfo.imageType = Pal::ImageType::Tex3d;
|
||||
viewInfo.viewType = Pal::ImageViewType::Tex3d;
|
||||
break;
|
||||
case CL_MEM_OBJECT_IMAGE1D:
|
||||
case CL_MEM_OBJECT_IMAGE1D_ARRAY:
|
||||
case CL_MEM_OBJECT_IMAGE1D_BUFFER:
|
||||
imgCreateInfo.imageType = Pal::ImageType::Tex1d;
|
||||
viewInfo.viewType = Pal::ImageViewType::Tex1d;
|
||||
break;
|
||||
case CL_MEM_OBJECT_IMAGE3D:
|
||||
imgCreateInfo.imageType = Pal::ImageType::Tex3d;
|
||||
viewInfo.viewType = Pal::ImageViewType::Tex3d;
|
||||
break;
|
||||
case CL_MEM_OBJECT_IMAGE1D:
|
||||
case CL_MEM_OBJECT_IMAGE1D_ARRAY:
|
||||
case CL_MEM_OBJECT_IMAGE1D_BUFFER:
|
||||
imgCreateInfo.imageType = Pal::ImageType::Tex1d;
|
||||
viewInfo.viewType = Pal::ImageViewType::Tex1d;
|
||||
break;
|
||||
}
|
||||
if (desc_.topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
|
||||
ImgSubresRange.numSlices = imgCreateInfo.arraySize = desc_.height_;
|
||||
@@ -504,8 +500,7 @@ bool Resource::CreateImage(CreateParams* params)
|
||||
ImgSubresRange.startSubres.arraySlice = imageView->layer_;
|
||||
viewOwner_ = imageView->resource_;
|
||||
image_ = viewOwner_->image_;
|
||||
}
|
||||
else if (memoryType() == ImageBuffer) {
|
||||
} else if (memoryType() == ImageBuffer) {
|
||||
ImageBufferParams* imageBuffer = reinterpret_cast<ImageBufferParams*>(params);
|
||||
viewOwner_ = imageBuffer->resource_;
|
||||
}
|
||||
@@ -515,11 +510,11 @@ bool Resource::CreateImage(CreateParams* params)
|
||||
ImgSubresRange.numMips = desc().mipLevels_;
|
||||
|
||||
if ((memoryType() != ImageView) ||
|
||||
//! @todo PAL doesn't allow an SRD view creation with different pixel size
|
||||
(elementSize() != viewOwner_->elementSize())) {
|
||||
//! @todo PAL doesn't allow an SRD view creation with different pixel size
|
||||
(elementSize() != viewOwner_->elementSize())) {
|
||||
imgCreateInfo.usageFlags.shaderRead = true;
|
||||
imgCreateInfo.usageFlags.shaderWrite =
|
||||
(format == Pal::ChNumFormat::X8Y8Z8W8_Srgb) ? false : true;
|
||||
(format == Pal::ChNumFormat::X8Y8Z8W8_Srgb) ? false : true;
|
||||
imgCreateInfo.swizzledFormat.format = format;
|
||||
imgCreateInfo.swizzledFormat.swizzle = channels;
|
||||
imgCreateInfo.mipLevels = (desc_.mipLevels_) ? desc_.mipLevels_ : 1;
|
||||
@@ -529,10 +524,9 @@ bool Resource::CreateImage(CreateParams* params)
|
||||
uint32_t rowPitch = 0;
|
||||
|
||||
if (((memoryType() == Persistent) && dev().settings().linearPersistentImage_) ||
|
||||
(memoryType() == ImageBuffer)) {
|
||||
(memoryType() == ImageBuffer)) {
|
||||
tiling = Pal::ImageTiling::Linear;
|
||||
}
|
||||
else if (memoryType() == ImageView) {
|
||||
} else if (memoryType() == ImageView) {
|
||||
tiling = viewOwner_->image_->GetImageCreateInfo().tiling;
|
||||
// Find the new pitch in pixels for the new format
|
||||
rowPitch = viewOwner_->desc().pitch_ * viewOwner_->elementSize() / elementSize();
|
||||
@@ -540,10 +534,9 @@ bool Resource::CreateImage(CreateParams* params)
|
||||
|
||||
if (memoryType() == ImageBuffer) {
|
||||
if ((params->owner_ != NULL) && params->owner_->asImage() &&
|
||||
(params->owner_->asImage()->getRowPitch() != 0)) {
|
||||
(params->owner_->asImage()->getRowPitch() != 0)) {
|
||||
rowPitch = params->owner_->asImage()->getRowPitch() / elementSize();
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
rowPitch = desc().width_;
|
||||
}
|
||||
}
|
||||
@@ -579,8 +572,8 @@ bool Resource::CreateImage(CreateParams* params)
|
||||
createInfo.priority = Pal::GpuMemPriority::Normal;
|
||||
memTypeToHeap(&createInfo);
|
||||
|
||||
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
|
||||
createInfo.alignment, nullptr, &subOffset_);
|
||||
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
|
||||
nullptr, &subOffset_);
|
||||
if (nullptr == memRef_) {
|
||||
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
|
||||
if (nullptr == memRef_) {
|
||||
@@ -589,8 +582,7 @@ bool Resource::CreateImage(CreateParams* params)
|
||||
}
|
||||
}
|
||||
offset_ += static_cast<size_t>(subOffset_);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
memRef_ = viewOwner_->memRef_;
|
||||
memRef_->retain();
|
||||
desc_.cardMemory_ = viewOwner_->desc().cardMemory_;
|
||||
@@ -627,11 +619,10 @@ bool Resource::CreateImage(CreateParams* params)
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool Resource::CreateInterop(CreateParams* params)
|
||||
{
|
||||
bool Resource::CreateInterop(CreateParams* params) {
|
||||
Pal::Result result;
|
||||
Pal::SubresId ImgSubresId = { Pal::ImageAspect::Color, 0, 0 };
|
||||
Pal::SubresRange ImgSubresRange = { ImgSubresId, 1, 1 };
|
||||
Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, 0, 0};
|
||||
Pal::SubresRange ImgSubresRange = {ImgSubresId, 1, 1};
|
||||
Pal::ChannelMapping channels;
|
||||
Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels);
|
||||
Pal::ExternalGpuMemoryOpenInfo gpuMemOpenInfo = {};
|
||||
@@ -645,21 +636,21 @@ bool Resource::CreateInterop(CreateParams* params)
|
||||
OGLInteropParams* oglRes = reinterpret_cast<OGLInteropParams*>(params);
|
||||
assert(oglRes->glPlatformContext_ && "We don't have OGL context!");
|
||||
switch (oglRes->type_) {
|
||||
case InteropVertexBuffer:
|
||||
glType_ = GL_RESOURCE_ATTACH_VERTEXBUFFER_AMD;
|
||||
break;
|
||||
case InteropRenderBuffer:
|
||||
glType_ = GL_RESOURCE_ATTACH_RENDERBUFFER_AMD;
|
||||
break;
|
||||
case InteropTexture:
|
||||
case InteropTextureViewLevel:
|
||||
case InteropTextureViewCube:
|
||||
glType_ = GL_RESOURCE_ATTACH_TEXTURE_AMD;
|
||||
break;
|
||||
default:
|
||||
LogError("Unknown OGL interop type!");
|
||||
return false;
|
||||
break;
|
||||
case InteropVertexBuffer:
|
||||
glType_ = GL_RESOURCE_ATTACH_VERTEXBUFFER_AMD;
|
||||
break;
|
||||
case InteropRenderBuffer:
|
||||
glType_ = GL_RESOURCE_ATTACH_RENDERBUFFER_AMD;
|
||||
break;
|
||||
case InteropTexture:
|
||||
case InteropTextureViewLevel:
|
||||
case InteropTextureViewCube:
|
||||
glType_ = GL_RESOURCE_ATTACH_TEXTURE_AMD;
|
||||
break;
|
||||
default:
|
||||
LogError("Unknown OGL interop type!");
|
||||
return false;
|
||||
break;
|
||||
}
|
||||
glPlatformContext_ = oglRes->glPlatformContext_;
|
||||
layer = oglRes->layer_;
|
||||
@@ -667,17 +658,18 @@ bool Resource::CreateInterop(CreateParams* params)
|
||||
mipLevel = oglRes->mipLevel_;
|
||||
|
||||
if (!dev().resGLAssociate(oglRes->glPlatformContext_, oglRes->handle_, glType_,
|
||||
&openInfo.hExternalResource, &glInteropMbRes_, &offset_, desc_.format_
|
||||
&openInfo.hExternalResource, &glInteropMbRes_, &offset_, desc_.format_
|
||||
#ifdef ATI_OS_WIN
|
||||
, openInfo.doppDesktopInfo
|
||||
,
|
||||
openInfo.doppDesktopInfo
|
||||
#endif
|
||||
)) {
|
||||
)) {
|
||||
return false;
|
||||
}
|
||||
desc_.isDoppTexture_ = (openInfo.doppDesktopInfo.gpuVirtAddr != 0);
|
||||
format = dev().getPalFormat(desc().format_, &channels);
|
||||
}
|
||||
#ifdef ATI_OS_WIN
|
||||
#ifdef ATI_OS_WIN
|
||||
else {
|
||||
D3DInteropParams* d3dRes = reinterpret_cast<D3DInteropParams*>(params);
|
||||
openInfo.hExternalResource = d3dRes->handle_;
|
||||
@@ -713,8 +705,8 @@ bool Resource::CreateInterop(CreateParams* params)
|
||||
size_t gpuMemSize;
|
||||
|
||||
if (Pal::Result::Success !=
|
||||
dev().iDev()->GetExternalSharedImageSizes(imgOpenInfo, &imageSize, &gpuMemSize,
|
||||
&imgCreateInfo)) {
|
||||
dev().iDev()->GetExternalSharedImageSizes(imgOpenInfo, &imageSize, &gpuMemSize,
|
||||
&imgCreateInfo)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -736,51 +728,51 @@ bool Resource::CreateInterop(CreateParams* params)
|
||||
imgCreateInfo.depthPitch = desc().height_ * imgCreateInfo.rowPitch;
|
||||
|
||||
switch (misc) {
|
||||
case 1: // NV12 or P010 formats
|
||||
switch (layer) {
|
||||
case -1:
|
||||
case 0:
|
||||
case 1: // NV12 or P010 formats
|
||||
switch (layer) {
|
||||
case -1:
|
||||
case 0:
|
||||
break;
|
||||
case 1:
|
||||
// Y - plane size to the offset
|
||||
// NV12 format. UV is 2 times smaller plane Y
|
||||
viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
|
||||
imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
|
||||
break;
|
||||
default:
|
||||
LogError("Unknown Interop View Type");
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case 1:
|
||||
// Y - plane size to the offset
|
||||
// NV12 format. UV is 2 times smaller plane Y
|
||||
viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
|
||||
case 2: // YV12 format
|
||||
switch (layer) {
|
||||
case -1:
|
||||
case 0:
|
||||
break;
|
||||
case 1:
|
||||
// Y - plane size to the offset
|
||||
// YV12 format. U is 4 times smaller plane than Y
|
||||
viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
|
||||
imgCreateInfo.rowPitch >>= 1;
|
||||
break;
|
||||
case 2:
|
||||
// Y + U plane sizes to the offest.
|
||||
// U plane is 4 times smaller than Y and U == V
|
||||
viewOffset = 5 * imgCreateInfo.rowPitch * desc().height_ / 2;
|
||||
imgCreateInfo.rowPitch >>= 1;
|
||||
break;
|
||||
default:
|
||||
LogError("Unknown Interop View Type");
|
||||
return false;
|
||||
}
|
||||
imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
|
||||
break;
|
||||
case 3: // YUY2 format
|
||||
imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
|
||||
break;
|
||||
default:
|
||||
LogError("Unknown Interop View Type");
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case 2: // YV12 format
|
||||
switch (layer) {
|
||||
case -1:
|
||||
case 0:
|
||||
break;
|
||||
case 1:
|
||||
// Y - plane size to the offset
|
||||
// YV12 format. U is 4 times smaller plane than Y
|
||||
viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
|
||||
imgCreateInfo.rowPitch >>= 1;
|
||||
break;
|
||||
case 2:
|
||||
// Y + U plane sizes to the offest.
|
||||
// U plane is 4 times smaller than Y and U == V
|
||||
viewOffset = 5 * imgCreateInfo.rowPitch * desc().height_ / 2;
|
||||
imgCreateInfo.rowPitch >>= 1;
|
||||
break;
|
||||
default:
|
||||
LogError("Unknown Interop View Type");
|
||||
return false;
|
||||
}
|
||||
imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
|
||||
break;
|
||||
case 3: // YUY2 format
|
||||
imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
|
||||
break;
|
||||
default:
|
||||
LogError("Unknown Interop View Type");
|
||||
return false;
|
||||
}
|
||||
|
||||
imageSize = dev().iDev()->GetImageSize(imgCreateInfo, &result);
|
||||
@@ -820,8 +812,7 @@ bool Resource::CreateInterop(CreateParams* params)
|
||||
hwState_[10] = static_cast<uint32_t>(desc().width_);
|
||||
hwState_[11] = 0; // one extra reserved field in the argument
|
||||
}
|
||||
}
|
||||
else if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
|
||||
} else if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
|
||||
memRef_ = GpuMemoryReference::Create(dev(), gpuMemOpenInfo);
|
||||
if (nullptr == memRef_) {
|
||||
return false;
|
||||
@@ -842,8 +833,7 @@ bool Resource::CreateInterop(CreateParams* params)
|
||||
hwState_[9] = GetHSAILImageOrderType(desc().format_);
|
||||
hwState_[10] = static_cast<uint32_t>(desc().width_);
|
||||
hwState_[11] = 0; // one extra reserved field in the argument
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
Pal::ExternalImageOpenInfo imgOpenInfo = {};
|
||||
Pal::ImageCreateInfo imgCreateInfo = {};
|
||||
imgOpenInfo.resourceInfo = openInfo;
|
||||
@@ -865,14 +855,14 @@ bool Resource::CreateInterop(CreateParams* params)
|
||||
viewInfo.possibleLayouts.usages = Pal::LayoutShaderWrite;
|
||||
viewInfo.viewType = Pal::ImageViewType::Tex2d;
|
||||
switch (imgCreateInfo.imageType) {
|
||||
case Pal::ImageType::Tex3d:
|
||||
viewInfo.viewType = Pal::ImageViewType::Tex3d;
|
||||
break;
|
||||
case Pal::ImageType::Tex1d:
|
||||
viewInfo.viewType = Pal::ImageViewType::Tex1d;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
case Pal::ImageType::Tex3d:
|
||||
viewInfo.viewType = Pal::ImageViewType::Tex3d;
|
||||
break;
|
||||
case Pal::ImageType::Tex1d:
|
||||
viewInfo.viewType = Pal::ImageViewType::Tex1d;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
viewInfo.pImage = image_;
|
||||
viewInfo.swizzledFormat.format = format;
|
||||
@@ -897,14 +887,13 @@ bool Resource::CreateInterop(CreateParams* params)
|
||||
//! It's a workaround for D24S8 format, since PAL doesn't support this format
|
||||
//! and GSL decompresses 24bit DEPTH into D24S8 for OGL compatibility
|
||||
if ((desc().format_.image_channel_order == CL_DEPTH_STENCIL) &&
|
||||
(desc().format_.image_channel_data_type == CL_UNORM_INT24)) {
|
||||
if (dev().settings().gfx10Plus_) {
|
||||
hwState_[1] = (hwState_[1] & ~0x1ff00000) | 0x08d00000;
|
||||
}
|
||||
else {
|
||||
hwState_[1] &= ~0x3c000000;
|
||||
hwState_[1] = (hwState_[1] & ~0x3f00000) | 0x1400000;
|
||||
}
|
||||
(desc().format_.image_channel_data_type == CL_UNORM_INT24)) {
|
||||
if (dev().settings().gfx10Plus_) {
|
||||
hwState_[1] = (hwState_[1] & ~0x1ff00000) | 0x08d00000;
|
||||
} else {
|
||||
hwState_[1] &= ~0x3c000000;
|
||||
hwState_[1] = (hwState_[1] & ~0x3f00000) | 0x1400000;
|
||||
}
|
||||
}
|
||||
hwState_[8] = GetHSAILImageFormatType(desc().format_);
|
||||
hwState_[9] = GetHSAILImageOrderType(desc().format_);
|
||||
@@ -915,8 +904,7 @@ bool Resource::CreateInterop(CreateParams* params)
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool Resource::CreatePinned(CreateParams* params)
|
||||
{
|
||||
bool Resource::CreatePinned(CreateParams* params) {
|
||||
PinnedParams* pinned = reinterpret_cast<PinnedParams*>(params);
|
||||
size_t allocSize = pinned->size_;
|
||||
const amd::HostMemoryReference* hostMemRef = pinned->hostMemRef_;
|
||||
@@ -926,7 +914,7 @@ bool Resource::CreatePinned(CreateParams* params)
|
||||
if (desc().topology_ == CL_MEM_OBJECT_BUFFER) {
|
||||
// Allign offset to 4K boundary (Vista/Win7 limitation)
|
||||
char* tmpHost = const_cast<char*>(
|
||||
amd::alignDown(reinterpret_cast<const char*>(address_), PinnedMemoryAlignment));
|
||||
amd::alignDown(reinterpret_cast<const char*>(address_), PinnedMemoryAlignment));
|
||||
|
||||
// Find the partial size for unaligned copy
|
||||
hostMemOffset = static_cast<uint>(reinterpret_cast<const char*>(address_) - tmpHost);
|
||||
@@ -940,18 +928,16 @@ bool Resource::CreatePinned(CreateParams* params)
|
||||
}
|
||||
allocSize = amd::alignUp(allocSize, PinnedMemoryAlignment);
|
||||
// hostMemOffset &= ~(0xff);
|
||||
}
|
||||
else if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D) {
|
||||
} else if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D) {
|
||||
//! @todo: Width has to be aligned for 3D.
|
||||
//! Need to be replaced with a compute copy
|
||||
// Width aligned by 8 texels
|
||||
if (((desc().width_ % 0x8) != 0) ||
|
||||
// Pitch aligned by 64 bytes
|
||||
(((desc().width_ * elementSize()) % 0x40) != 0)) {
|
||||
// Pitch aligned by 64 bytes
|
||||
(((desc().width_ * elementSize()) % 0x40) != 0)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
//! @todo GSL doesn't support pinning with resAlloc_
|
||||
return false;
|
||||
}
|
||||
@@ -978,8 +964,7 @@ bool Resource::CreatePinned(CreateParams* params)
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
|
||||
{
|
||||
bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr) {
|
||||
const bool isFineGrain = (memoryType() == RemoteUSWC) || (memoryType() == Remote);
|
||||
size_t allocSize = amd::alignUp(desc().width_ * elementSize_,
|
||||
dev().properties().gpuMemoryProperties.fragmentSize);
|
||||
@@ -991,20 +976,18 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
|
||||
if (svmPtr != 0) {
|
||||
createInfo.flags.useReservedGpuVa = true;
|
||||
createInfo.pReservedGpuVaOwner = params->svmBase_->iMem();
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
createInfo.flags.useReservedGpuVa = false;
|
||||
createInfo.pReservedGpuVaOwner = nullptr;
|
||||
}
|
||||
if (!dev().settings().svmFineGrainSystem_) {
|
||||
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
|
||||
createInfo.alignment, createInfo.pReservedGpuVaOwner, &subOffset_);
|
||||
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
|
||||
createInfo.pReservedGpuVaOwner, &subOffset_);
|
||||
}
|
||||
if (memRef_ == nullptr) {
|
||||
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
Pal::GpuMemoryCreateInfo createInfo = {};
|
||||
createInfo.size = allocSize;
|
||||
createInfo.alignment = MaxGpuAlignment;
|
||||
@@ -1015,8 +998,8 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
|
||||
createInfo.pReservedGpuVaOwner = params->svmBase_->iMem();
|
||||
}
|
||||
memTypeToHeap(&createInfo);
|
||||
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
|
||||
createInfo.alignment, createInfo.pReservedGpuVaOwner, &subOffset_);
|
||||
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
|
||||
createInfo.pReservedGpuVaOwner, &subOffset_);
|
||||
if (memRef_ == nullptr) {
|
||||
createInfo.alignment = dev().properties().gpuMemoryProperties.fragmentSize;
|
||||
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
|
||||
@@ -1028,9 +1011,9 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
|
||||
}
|
||||
desc_.cardMemory_ = false;
|
||||
if ((nullptr != params) && (nullptr != params->owner_) &&
|
||||
(nullptr != params->owner_->getSvmPtr())) {
|
||||
(nullptr != params->owner_->getSvmPtr())) {
|
||||
params->owner_->setSvmPtr(
|
||||
reinterpret_cast<void*>(memRef_->iMem()->Desc().gpuVirtAddr + subOffset_));
|
||||
reinterpret_cast<void*>(memRef_->iMem()->Desc().gpuVirtAddr + subOffset_));
|
||||
offset_ += static_cast<size_t>(subOffset_);
|
||||
}
|
||||
return true;
|
||||
@@ -1126,18 +1109,18 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
|
||||
Pal::gpusize svmPtr = 0;
|
||||
if ((nullptr != params) && (nullptr != params->owner_) &&
|
||||
(nullptr != params->owner_->getSvmPtr())) {
|
||||
svmPtr = reinterpret_cast<Pal::gpusize>(params->owner_->getSvmPtr());
|
||||
desc_.SVMRes_ = true;
|
||||
svmPtr = (svmPtr == 1) ? 0 : svmPtr;
|
||||
svmPtr = reinterpret_cast<Pal::gpusize>(params->owner_->getSvmPtr());
|
||||
desc_.SVMRes_ = true;
|
||||
svmPtr = (svmPtr == 1) ? 0 : svmPtr;
|
||||
}
|
||||
if (desc_.SVMRes_) {
|
||||
return CreateSvm(params, svmPtr);
|
||||
return CreateSvm(params, svmPtr);
|
||||
}
|
||||
|
||||
Pal::GpuMemoryCreateInfo createInfo = {};
|
||||
createInfo.size = desc().width_ * elementSize_;
|
||||
createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment);
|
||||
createInfo.alignment = desc().scratch_ ? 64*Ki : MaxGpuAlignment;
|
||||
createInfo.alignment = desc().scratch_ ? 64 * Ki : MaxGpuAlignment;
|
||||
createInfo.vaRange = Pal::VaRange::Default;
|
||||
createInfo.priority = Pal::GpuMemPriority::Normal;
|
||||
|
||||
@@ -1152,8 +1135,8 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
|
||||
|
||||
memTypeToHeap(&createInfo);
|
||||
// createInfo.priority;
|
||||
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
|
||||
createInfo.alignment, nullptr, &subOffset_);
|
||||
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
|
||||
nullptr, &subOffset_);
|
||||
if (nullptr == memRef_) {
|
||||
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
|
||||
if (nullptr == memRef_) {
|
||||
@@ -1172,14 +1155,13 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void Resource::free()
|
||||
{
|
||||
void Resource::free() {
|
||||
if (memRef_ == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
const bool wait =
|
||||
(memoryType() != ImageView) && (memoryType() != ImageBuffer) && (memoryType() != View);
|
||||
(memoryType() != ImageView) && (memoryType() != ImageBuffer) && (memoryType() != View);
|
||||
|
||||
// OCL has to wait, even if resource is placed in the cache, since reallocation can occur
|
||||
// and resource can be reused on another async queue without a wait on a busy operation
|
||||
@@ -1190,8 +1172,7 @@ void Resource::free()
|
||||
for (uint idx = 1; idx < dev().vgpus().size(); ++idx) {
|
||||
dev().vgpus()[idx]->waitForEvent(&events_[idx]);
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
amd::ScopedLock l(memRef_->gpu_->execution());
|
||||
memRef_->gpu_->waitForEvent(&events_[memRef_->gpu_->index()]);
|
||||
}
|
||||
@@ -1232,8 +1213,7 @@ void Resource::free()
|
||||
|
||||
// ================================================================================================
|
||||
void Resource::writeRawData(VirtualGPU& gpu, size_t offset, size_t size, const void* data,
|
||||
bool waitForEvent) const
|
||||
{
|
||||
bool waitForEvent) const {
|
||||
GpuEvent event;
|
||||
|
||||
// Write data size bytes to surface
|
||||
@@ -1242,7 +1222,7 @@ void Resource::writeRawData(VirtualGPU& gpu, size_t offset, size_t size, const v
|
||||
gpu.eventBegin(MainEngine);
|
||||
gpu.queue(MainEngine).addCmdMemRef(memRef());
|
||||
gpu.iCmd()->CmdUpdateMemory(*iMem(), offset_ + offset, size,
|
||||
reinterpret_cast<const uint32_t*>(data));
|
||||
reinterpret_cast<const uint32_t*>(data));
|
||||
gpu.eventEnd(MainEngine, event);
|
||||
|
||||
if (waitForEvent) {
|
||||
@@ -1259,8 +1239,7 @@ void Resource::writeRawData(VirtualGPU& gpu, size_t offset, size_t size, const v
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
static const Pal::ChNumFormat ChannelFmt(uint bytesPerElement)
|
||||
{
|
||||
static const Pal::ChNumFormat ChannelFmt(uint bytesPerElement) {
|
||||
if (bytesPerElement == 16) {
|
||||
return Pal::ChNumFormat::X32Y32Z32W32_Uint;
|
||||
} else if (bytesPerElement == 8) {
|
||||
@@ -1292,8 +1271,7 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
|
||||
if (desc().buffer_ && !dstResource.desc().buffer_) {
|
||||
imageOffsetx = dstOrigin[0] % dstResource.elementSize();
|
||||
gpuMemoryOffset = srcOrigin[0] + offset();
|
||||
gpuMemoryRowPitch =
|
||||
(srcOrigin[1]) ? srcOrigin[1] : size[0] * dstResource.elementSize();
|
||||
gpuMemoryRowPitch = (srcOrigin[1]) ? srcOrigin[1] : size[0] * dstResource.elementSize();
|
||||
img1Darray = (dstResource.desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY);
|
||||
img2Darray = (dstResource.desc().topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY);
|
||||
} else if (!desc().buffer_ && dstResource.desc().buffer_) {
|
||||
@@ -1374,7 +1352,8 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
|
||||
}
|
||||
copyRegion.gpuMemoryOffset = gpuMemoryOffset;
|
||||
copyRegion.gpuMemoryRowPitch = gpuMemoryRowPitch;
|
||||
copyRegion.gpuMemoryDepthPitch = (dstOrigin[2]) ? dstOrigin[2]
|
||||
copyRegion.gpuMemoryDepthPitch = (dstOrigin[2])
|
||||
? dstOrigin[2]
|
||||
: copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height;
|
||||
gpu.iCmd()->CmdCopyImageToMemory(*image_, imgLayout, *dstResource.iMem(), 1, ©Region);
|
||||
} else {
|
||||
@@ -1819,17 +1798,14 @@ void Resource::unmap(VirtualGPU* gpu) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void Resource::unmapLayers(VirtualGPU* gpu) {
|
||||
Unimplemented();
|
||||
}
|
||||
void Resource::unmapLayers(VirtualGPU* gpu) { Unimplemented(); }
|
||||
|
||||
// ================================================================================================
|
||||
bool MemorySubAllocator::InitAllocator(GpuMemoryReference* mem_ref) {
|
||||
MemBuddyAllocator* allocator = new MemBuddyAllocator(
|
||||
device_, device_->settings().subAllocationChunkSize_,
|
||||
device_->settings().subAllocationMinSize_);
|
||||
if (!((allocator != nullptr) &&
|
||||
(allocator->Init() == Pal::Result::Success) &&
|
||||
MemBuddyAllocator* allocator =
|
||||
new MemBuddyAllocator(device_, device_->settings().subAllocationChunkSize_,
|
||||
device_->settings().subAllocationMinSize_);
|
||||
if (!((allocator != nullptr) && (allocator->Init() == Pal::Result::Success) &&
|
||||
heaps_.insert({mem_ref, allocator}).second)) {
|
||||
mem_ref->release();
|
||||
delete allocator;
|
||||
@@ -1890,8 +1866,7 @@ bool FineMemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
MemorySubAllocator::~MemorySubAllocator()
|
||||
{
|
||||
MemorySubAllocator::~MemorySubAllocator() {
|
||||
// Release memory heap for suballocations
|
||||
for (const auto& it : heaps_) {
|
||||
it.first->release();
|
||||
@@ -1901,8 +1876,8 @@ MemorySubAllocator::~MemorySubAllocator()
|
||||
|
||||
// ================================================================================================
|
||||
GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize alignment,
|
||||
const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset)
|
||||
{
|
||||
const Pal::IGpuMemory* reserved_va,
|
||||
Pal::gpusize* offset) {
|
||||
GpuMemoryReference* mem_ref = nullptr;
|
||||
MemBuddyAllocator* allocator = nullptr;
|
||||
// Check if the resource size and alignment are allowed for suballocation
|
||||
@@ -1927,7 +1902,7 @@ GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize
|
||||
}
|
||||
// We didn't find a valid chunk, so create a new one
|
||||
if (!CreateChunk(reserved_va)) {
|
||||
return nullptr;
|
||||
return nullptr;
|
||||
}
|
||||
i++;
|
||||
} while (i < 2);
|
||||
@@ -1936,8 +1911,7 @@ GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool MemorySubAllocator::Free(amd::Monitor* monitor, GpuMemoryReference* ref, Pal::gpusize offset)
|
||||
{
|
||||
bool MemorySubAllocator::Free(amd::Monitor* monitor, GpuMemoryReference* ref, Pal::gpusize offset) {
|
||||
bool release_mem = false;
|
||||
{
|
||||
amd::ScopedLock l(monitor);
|
||||
@@ -1966,9 +1940,8 @@ ResourceCache::~ResourceCache() { free(); }
|
||||
|
||||
// ================================================================================================
|
||||
//! \note the cache works in FILO mode
|
||||
bool ResourceCache::addGpuMemory(Resource::Descriptor* desc,
|
||||
GpuMemoryReference* ref, Pal::gpusize offset)
|
||||
{
|
||||
bool ResourceCache::addGpuMemory(Resource::Descriptor* desc, GpuMemoryReference* ref,
|
||||
Pal::gpusize offset) {
|
||||
bool result = false;
|
||||
size_t size = ref->iMem()->Desc().size;
|
||||
|
||||
@@ -2017,7 +1990,9 @@ bool ResourceCache::addGpuMemory(Resource::Descriptor* desc,
|
||||
|
||||
// ================================================================================================
|
||||
GpuMemoryReference* ResourceCache::findGpuMemory(Resource::Descriptor* desc, Pal::gpusize size,
|
||||
Pal::gpusize alignment, const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset) {
|
||||
Pal::gpusize alignment,
|
||||
const Pal::IGpuMemory* reserved_va,
|
||||
Pal::gpusize* offset) {
|
||||
amd::ScopedLock l(&lockCacheOps_);
|
||||
GpuMemoryReference* ref = nullptr;
|
||||
|
||||
@@ -2051,7 +2026,7 @@ GpuMemoryReference* ResourceCache::findGpuMemory(Resource::Descriptor* desc, Pal
|
||||
ref = it.second;
|
||||
cacheSize_ -= sizeRes;
|
||||
if (entry->type_ == Resource::Local) {
|
||||
lclCacheSize_ -= sizeRes;
|
||||
lclCacheSize_ -= sizeRes;
|
||||
}
|
||||
delete it.first;
|
||||
// Remove the found etry from the cache
|
||||
@@ -2078,8 +2053,7 @@ bool ResourceCache::free(size_t minCacheEntries) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void ResourceCache::removeLast()
|
||||
{
|
||||
void ResourceCache::removeLast() {
|
||||
std::pair<Resource::Descriptor*, GpuMemoryReference*> entry;
|
||||
{
|
||||
// Protect access to the global data
|
||||
|
||||
@@ -41,11 +41,11 @@ class GpuMemoryReference : public amd::ReferenceCountedObject {
|
||||
//! Get PAL memory object
|
||||
Pal::IGpuMemory* iMem() const { return gpuMem_; }
|
||||
|
||||
Pal::IGpuMemory* gpuMem_; //!< PAL GPU memory object
|
||||
void* cpuAddress_; //!< CPU address of this memory
|
||||
const Device& device_; //!< GPU device
|
||||
Pal::IGpuMemory* gpuMem_; //!< PAL GPU memory object
|
||||
void* cpuAddress_; //!< CPU address of this memory
|
||||
const Device& device_; //!< GPU device
|
||||
//! @note: This field is necessary for the thread safe release only
|
||||
VirtualGPU* gpu_; //!< Resource will be used only on this queue
|
||||
VirtualGPU* gpu_; //!< Resource will be used only on this queue
|
||||
|
||||
protected:
|
||||
//! Default destructor
|
||||
@@ -186,7 +186,7 @@ class Resource : public amd::HeapObject {
|
||||
//! Constructor of 1D Resource object
|
||||
Resource(const Device& gpuDev, //!< GPU device object
|
||||
size_t size //!< Resource size
|
||||
);
|
||||
);
|
||||
|
||||
//! Constructor of Image Resource object
|
||||
Resource(const Device& gpuDev, //!< GPU device object
|
||||
@@ -196,7 +196,7 @@ class Resource : public amd::HeapObject {
|
||||
cl_image_format format, //!< resource format
|
||||
cl_mem_object_type imageType, //!< CL image type
|
||||
uint mipLevels = 1 //!< Number of mip levels
|
||||
);
|
||||
);
|
||||
|
||||
//! Destructor of the resource
|
||||
virtual ~Resource();
|
||||
@@ -207,7 +207,7 @@ class Resource : public amd::HeapObject {
|
||||
*/
|
||||
virtual bool create(MemoryType memType, //!< memory type
|
||||
CreateParams* params = 0 //!< special parameters for resource allocation
|
||||
);
|
||||
);
|
||||
|
||||
/*! \brief Copies a subregion of memory from one resource to another
|
||||
*
|
||||
@@ -253,14 +253,13 @@ class Resource : public amd::HeapObject {
|
||||
Pal::IGpuMemory* iMem() const { return memRef_->iMem(); }
|
||||
|
||||
//! Returns a pointer to the memory reference
|
||||
GpuMemoryReference* memRef() const {return memRef_; }
|
||||
GpuMemoryReference* memRef() const { return memRef_; }
|
||||
|
||||
//! Returns global memory offset
|
||||
uint64_t vmAddress() const { return iMem()->Desc().gpuVirtAddr + offset_; }
|
||||
|
||||
//! Returns global memory offset
|
||||
uint64_t vmSize() const
|
||||
{ return desc_.width_ * desc_.height_ * desc_.depth_ * elementSize(); }
|
||||
uint64_t vmSize() const { return desc_.width_ * desc_.height_ * desc_.depth_ * elementSize(); }
|
||||
|
||||
//! Returns global memory offset
|
||||
bool mipMapped() const { return (desc().mipLevels_ > 1) ? true : false; }
|
||||
@@ -279,11 +278,11 @@ class Resource : public amd::HeapObject {
|
||||
// Optimization for multilayer map/unmap
|
||||
uint startLayer = 0, //!< Start layer for multilayer map
|
||||
uint numLayers = 0 //!< End layer for multilayer map
|
||||
);
|
||||
);
|
||||
|
||||
//! Unlocks the resource if it was locked
|
||||
void unmap(VirtualGPU* gpu //!< Virtual GPU device object
|
||||
);
|
||||
);
|
||||
|
||||
//! Marks the resource as busy
|
||||
void setBusy(VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
@@ -303,7 +302,7 @@ class Resource : public amd::HeapObject {
|
||||
uint flags = 0, //!< Map flags
|
||||
size_t rowPitch = 0, //!< Raw data row pitch
|
||||
size_t slicePitch = 0 //!< Raw data slice pitch
|
||||
);
|
||||
);
|
||||
|
||||
//! Performs host read from the resource GPU memory
|
||||
bool hostRead(VirtualGPU* gpu, //!< Virtual GPU device object
|
||||
@@ -312,7 +311,7 @@ class Resource : public amd::HeapObject {
|
||||
const amd::Coord3D& size, //!< The number of bytes to write
|
||||
size_t rowPitch = 0, //!< Raw data row pitch
|
||||
size_t slicePitch = 0 //!< Raw data slice pitch
|
||||
);
|
||||
);
|
||||
|
||||
//! Gets the resource element size
|
||||
uint elementSize() const { return elementSize_; }
|
||||
@@ -377,7 +376,7 @@ class Resource : public amd::HeapObject {
|
||||
memRef_ = viewOwner_->memRef_;
|
||||
memRef_->retain();
|
||||
desc_.width_ = amd::alignUp(size, Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint)) /
|
||||
Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
|
||||
Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
|
||||
setBusy(*memRef()->gpu_, GpuEvent::InvalidID);
|
||||
}
|
||||
}
|
||||
@@ -390,33 +389,32 @@ class Resource : public amd::HeapObject {
|
||||
|
||||
protected:
|
||||
/*! \brief Creates a PAL iamge object, associated with the resource
|
||||
*
|
||||
* \return True if we succesfully created a PAL resource
|
||||
*/
|
||||
bool CreateImage(CreateParams* params //!< special parameters for resource allocation
|
||||
);
|
||||
*
|
||||
* \return True if we succesfully created a PAL resource
|
||||
*/
|
||||
bool CreateImage(CreateParams* params //!< special parameters for resource allocation
|
||||
);
|
||||
|
||||
/*! \brief Creates a PAL interop object, associated with the resource
|
||||
*
|
||||
* \return True if we succesfully created a PAL interop resource
|
||||
*/
|
||||
bool CreateInterop(CreateParams* params //!< special parameters for resource allocation
|
||||
);
|
||||
*
|
||||
* \return True if we succesfully created a PAL interop resource
|
||||
*/
|
||||
bool CreateInterop(CreateParams* params //!< special parameters for resource allocation
|
||||
);
|
||||
|
||||
/*! \brief Creates a PAL pinned object, associated with the resource
|
||||
*
|
||||
* \return True if we succesfully created a PAL pinned resource
|
||||
*/
|
||||
bool CreatePinned(CreateParams* params //!< special parameters for resource allocation
|
||||
);
|
||||
*
|
||||
* \return True if we succesfully created a PAL pinned resource
|
||||
*/
|
||||
bool CreatePinned(CreateParams* params //!< special parameters for resource allocation
|
||||
);
|
||||
|
||||
/*! \brief Creates a PAL SVM object, associated with the resource
|
||||
*
|
||||
* \return True if we succesfully created a PAL SVM resource
|
||||
*/
|
||||
*
|
||||
* \return True if we succesfully created a PAL SVM resource
|
||||
*/
|
||||
bool CreateSvm(CreateParams* params, //!< special parameters for resource allocation
|
||||
Pal::gpusize svmPtr
|
||||
);
|
||||
Pal::gpusize svmPtr);
|
||||
|
||||
uint elementSize_; //!< Size of a single element in bytes
|
||||
|
||||
@@ -433,11 +431,11 @@ class Resource : public amd::HeapObject {
|
||||
*/
|
||||
void* mapLayers(VirtualGPU* gpu, //!< Virtual GPU device object
|
||||
uint flags = 0 //!< flags for the map operation
|
||||
);
|
||||
);
|
||||
|
||||
//! Unlocks the resource with layers if it was locked
|
||||
void unmapLayers(VirtualGPU* gpu //!< Virtual GPU device object
|
||||
);
|
||||
);
|
||||
|
||||
//! Calls PAL to map a resource
|
||||
void* gpuMemoryMap(size_t* pitch, //!< Pitch value for the image
|
||||
@@ -454,7 +452,7 @@ class Resource : public amd::HeapObject {
|
||||
|
||||
//! Converts Resource memory type to the PAL heaps
|
||||
void memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo //!< Memory create info
|
||||
);
|
||||
);
|
||||
|
||||
const Device& gpuDevice_; //!< GPU device
|
||||
Descriptor desc_; //!< Descriptor for this resource
|
||||
@@ -462,7 +460,7 @@ class Resource : public amd::HeapObject {
|
||||
void* address_; //!< Physical address of this resource
|
||||
size_t offset_; //!< Resource offset
|
||||
GpuMemoryReference* memRef_; //!< PAL resource reference
|
||||
Pal::gpusize subOffset_; //!< GPU memory offset in the oririnal resource
|
||||
Pal::gpusize subOffset_; //!< GPU memory offset in the oririnal resource
|
||||
const Resource* viewOwner_; //!< GPU resource, which owns this view
|
||||
void* glInteropMbRes_; //!< Mb Res handle
|
||||
uint32_t glType_; //!< GL interop type
|
||||
@@ -485,41 +483,35 @@ class Resource : public amd::HeapObject {
|
||||
typedef Util::BuddyAllocator<Device> MemBuddyAllocator;
|
||||
|
||||
class MemorySubAllocator : public amd::HeapObject {
|
||||
public:
|
||||
public:
|
||||
MemorySubAllocator(Device* device) : device_(device) {}
|
||||
|
||||
~MemorySubAllocator();
|
||||
|
||||
//! Create suballocation
|
||||
GpuMemoryReference* Allocate(Pal::gpusize size,
|
||||
Pal::gpusize alignment,
|
||||
const Pal::IGpuMemory* reserved_va,
|
||||
Pal::gpusize* offset
|
||||
);
|
||||
GpuMemoryReference* Allocate(Pal::gpusize size, Pal::gpusize alignment,
|
||||
const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset);
|
||||
//! Free suballocation
|
||||
bool Free(amd::Monitor* monitor,
|
||||
GpuMemoryReference* mem_ref,
|
||||
Pal::gpusize offset
|
||||
);
|
||||
bool Free(amd::Monitor* monitor, GpuMemoryReference* mem_ref, Pal::gpusize offset);
|
||||
|
||||
protected:
|
||||
protected:
|
||||
//! Allocate new chunk of memory
|
||||
virtual bool CreateChunk(const Pal::IGpuMemory* reserved_va);
|
||||
bool InitAllocator(GpuMemoryReference* mem_ref);
|
||||
|
||||
Device* device_;
|
||||
std::unordered_map<GpuMemoryReference*, MemBuddyAllocator*> heaps_;
|
||||
std::unordered_map<GpuMemoryReference*, MemBuddyAllocator*> heaps_;
|
||||
};
|
||||
|
||||
class CoarseMemorySubAllocator : public MemorySubAllocator {
|
||||
public:
|
||||
public:
|
||||
CoarseMemorySubAllocator(Device* device) : MemorySubAllocator(device) {}
|
||||
|
||||
bool CreateChunk(const Pal::IGpuMemory* reservedVa) override;
|
||||
};
|
||||
|
||||
class FineMemorySubAllocator : public MemorySubAllocator {
|
||||
public:
|
||||
public:
|
||||
FineMemorySubAllocator(Device* device) : MemorySubAllocator(device) {}
|
||||
|
||||
bool CreateChunk(const Pal::IGpuMemory* reserved_va) override;
|
||||
@@ -529,29 +521,28 @@ class ResourceCache : public amd::HeapObject {
|
||||
public:
|
||||
//! Default constructor
|
||||
ResourceCache(Device* device, size_t cacheSizeLimit)
|
||||
: lockCacheOps_("PAL resource cache", true)
|
||||
, cacheSize_(0)
|
||||
, lclCacheSize_(0)
|
||||
, cacheSizeLimit_(cacheSizeLimit)
|
||||
, mem_sub_alloc_local_(device)
|
||||
, mem_sub_alloc_coarse_ (device)
|
||||
, mem_sub_alloc_fine_ (device) {}
|
||||
: lockCacheOps_("PAL resource cache", true),
|
||||
cacheSize_(0),
|
||||
lclCacheSize_(0),
|
||||
cacheSizeLimit_(cacheSizeLimit),
|
||||
mem_sub_alloc_local_(device),
|
||||
mem_sub_alloc_coarse_(device),
|
||||
mem_sub_alloc_fine_(device) {}
|
||||
|
||||
//! Default destructor
|
||||
~ResourceCache();
|
||||
|
||||
//! Adds a PAL resource to the cache
|
||||
bool addGpuMemory(Resource::Descriptor* desc, //!< Resource descriptor - cache key
|
||||
GpuMemoryReference* ref, //!< Resource reference
|
||||
Pal::gpusize offset //!< Original resource offset
|
||||
);
|
||||
bool addGpuMemory(Resource::Descriptor* desc, //!< Resource descriptor - cache key
|
||||
GpuMemoryReference* ref, //!< Resource reference
|
||||
Pal::gpusize offset //!< Original resource offset
|
||||
);
|
||||
|
||||
//! Finds a PAL resource from the cache
|
||||
GpuMemoryReference* findGpuMemory(
|
||||
Resource::Descriptor* desc, //!< Resource descriptor - cache key
|
||||
Pal::gpusize size,
|
||||
Pal::gpusize alignment,
|
||||
const Pal::IGpuMemory* reserved_va, //!< Reserved VA for SVM suballocations
|
||||
Pal::gpusize size, Pal::gpusize alignment,
|
||||
const Pal::IGpuMemory* reserved_va, //!< Reserved VA for SVM suballocations
|
||||
Pal::gpusize* offset);
|
||||
|
||||
//! Destroys cache
|
||||
@@ -576,16 +567,17 @@ class ResourceCache : public amd::HeapObject {
|
||||
|
||||
amd::Monitor lockCacheOps_; //!< Lock to serialise cache access
|
||||
|
||||
size_t cacheSize_; //!< Current cache size in bytes
|
||||
size_t lclCacheSize_; //!< Local memory stored in the cache
|
||||
const size_t cacheSizeLimit_; //!< Cache size limit in bytes
|
||||
size_t cacheSize_; //!< Current cache size in bytes
|
||||
size_t lclCacheSize_; //!< Local memory stored in the cache
|
||||
const size_t cacheSizeLimit_; //!< Cache size limit in bytes
|
||||
|
||||
//! PAL resource cache
|
||||
std::list<std::pair<Resource::Descriptor*, GpuMemoryReference*> > resCache_;
|
||||
|
||||
MemorySubAllocator mem_sub_alloc_local_; //!< Allocator for suballocations in Local
|
||||
CoarseMemorySubAllocator mem_sub_alloc_coarse_; //!< Allocator for suballocations in Coarse SVM
|
||||
FineMemorySubAllocator mem_sub_alloc_fine_; //!< Allocator for suballocations in Fine SVM
|
||||
MemorySubAllocator mem_sub_alloc_local_; //!< Allocator for suballocations in Local
|
||||
CoarseMemorySubAllocator mem_sub_alloc_coarse_; //!< Allocator for suballocations in Coarse SVM
|
||||
FineMemorySubAllocator mem_sub_alloc_fine_; //!< Allocator for suballocations in Fine SVM
|
||||
};
|
||||
|
||||
/*@}*/} // namespace pal
|
||||
/*@}*/ // namespace pal
|
||||
} // namespace pal
|
||||
|
||||
@@ -136,7 +136,7 @@ Settings::Settings() {
|
||||
subAllocationMinSize_ = 4 * Ki;
|
||||
subAllocationChunkSize_ = 64 * Mi;
|
||||
subAllocationMaxSize_ =
|
||||
std::min(static_cast<uint64_t>(GPU_MAX_SUBALLOC_SIZE) * Ki, subAllocationChunkSize_);
|
||||
std::min(static_cast<uint64_t>(GPU_MAX_SUBALLOC_SIZE) * Ki, subAllocationChunkSize_);
|
||||
|
||||
maxCmdBuffers_ = 12;
|
||||
useLightning_ = GPU_ENABLE_LC;
|
||||
@@ -148,8 +148,7 @@ Settings::Settings() {
|
||||
|
||||
bool Settings::create(const Pal::DeviceProperties& palProp,
|
||||
const Pal::GpuMemoryHeapProperties* heaps, const Pal::WorkStationCaps& wscaps,
|
||||
bool reportAsOCL12Device)
|
||||
{
|
||||
bool reportAsOCL12Device) {
|
||||
uint32_t osVer = 0x0;
|
||||
|
||||
// Disable thread trace by default for all devices
|
||||
@@ -198,8 +197,9 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
|
||||
case Pal::AsicRevision::Navi10Lite:
|
||||
gfx10Plus_ = true;
|
||||
useLightning_ = (!flagIsDefault(GPU_ENABLE_LC)) ? GPU_ENABLE_LC : true;
|
||||
hsailExplicitXnack_ = static_cast<uint>(palProp.gpuMemoryProperties.flags.pageMigrationEnabled
|
||||
|| palProp.gpuMemoryProperties.flags.iommuv2Support);
|
||||
hsailExplicitXnack_ =
|
||||
static_cast<uint>(palProp.gpuMemoryProperties.flags.pageMigrationEnabled ||
|
||||
palProp.gpuMemoryProperties.flags.iommuv2Support);
|
||||
enableWgpMode_ = GPU_ENABLE_WGP_MODE;
|
||||
if (useLightning_) {
|
||||
enableWave32Mode_ = true;
|
||||
@@ -346,7 +346,7 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
|
||||
if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) {
|
||||
splitSizeForWin7_ = true; // Update flag of DMA flush split size for Win 7
|
||||
if (modifyMaxWorkload.time > 0) {
|
||||
maxWorkloadTime_ = modifyMaxWorkload.time; // Update max workload time
|
||||
maxWorkloadTime_ = modifyMaxWorkload.time; // Update max workload time
|
||||
}
|
||||
}
|
||||
#endif // defined(_WIN32)
|
||||
|
||||
@@ -39,63 +39,63 @@ class Settings : public device::Settings {
|
||||
|
||||
union {
|
||||
struct {
|
||||
uint remoteAlloc_ : 1; //!< Allocate remote memory for the heap
|
||||
uint stagedXferRead_ : 1; //!< Uses a staged buffer read
|
||||
uint stagedXferWrite_ : 1; //!< Uses a staged buffer write
|
||||
uint disablePersistent_ : 1; //!< Disables using persistent memory for staging
|
||||
uint imageSupport_ : 1; //!< Report images support
|
||||
uint doublePrecision_ : 1; //!< Enables double precision support
|
||||
uint use64BitPtr_ : 1; //!< Use 64bit pointers on GPU
|
||||
uint force32BitOcl20_ : 1; //!< Force 32bit apps to take CLANG/HSAIL path on GPU
|
||||
uint imageDMA_ : 1; //!< Enable direct image DMA transfers
|
||||
uint viPlus_ : 1; //!< VI and post VI features
|
||||
uint aiPlus_ : 1; //!< AI and post AI features
|
||||
uint gfx10Plus_ : 1; //!< gfx10 and post gfx10 features
|
||||
uint threadTraceEnable_ : 1; //!< Thread trace enable
|
||||
uint linearPersistentImage_ : 1; //!< Allocates linear images in persistent
|
||||
uint useSingleScratch_ : 1; //!< Allocates single scratch per device
|
||||
uint svmAtomics_ : 1; //!< SVM device atomics
|
||||
uint svmFineGrainSystem_ : 1; //!< SVM fine grain system support
|
||||
uint useDeviceQueue_ : 1; //!< Submit to separate device queue
|
||||
uint sdamPageFaultWar_ : 1; //!< SDMA page fault workaround
|
||||
uint rgpSqttWaitIdle_: 1; //!< Wait for idle after SQTT trace
|
||||
uint rgpSqttForceDisable_: 1; //!< Disables SQTT
|
||||
uint splitSizeForWin7_: 1; //!< DMA flush split size for Win 7
|
||||
uint remoteAlloc_ : 1; //!< Allocate remote memory for the heap
|
||||
uint stagedXferRead_ : 1; //!< Uses a staged buffer read
|
||||
uint stagedXferWrite_ : 1; //!< Uses a staged buffer write
|
||||
uint disablePersistent_ : 1; //!< Disables using persistent memory for staging
|
||||
uint imageSupport_ : 1; //!< Report images support
|
||||
uint doublePrecision_ : 1; //!< Enables double precision support
|
||||
uint use64BitPtr_ : 1; //!< Use 64bit pointers on GPU
|
||||
uint force32BitOcl20_ : 1; //!< Force 32bit apps to take CLANG/HSAIL path on GPU
|
||||
uint imageDMA_ : 1; //!< Enable direct image DMA transfers
|
||||
uint viPlus_ : 1; //!< VI and post VI features
|
||||
uint aiPlus_ : 1; //!< AI and post AI features
|
||||
uint gfx10Plus_ : 1; //!< gfx10 and post gfx10 features
|
||||
uint threadTraceEnable_ : 1; //!< Thread trace enable
|
||||
uint linearPersistentImage_ : 1; //!< Allocates linear images in persistent
|
||||
uint useSingleScratch_ : 1; //!< Allocates single scratch per device
|
||||
uint svmAtomics_ : 1; //!< SVM device atomics
|
||||
uint svmFineGrainSystem_ : 1; //!< SVM fine grain system support
|
||||
uint useDeviceQueue_ : 1; //!< Submit to separate device queue
|
||||
uint sdamPageFaultWar_ : 1; //!< SDMA page fault workaround
|
||||
uint rgpSqttWaitIdle_ : 1; //!< Wait for idle after SQTT trace
|
||||
uint rgpSqttForceDisable_ : 1; //!< Disables SQTT
|
||||
uint splitSizeForWin7_ : 1; //!< DMA flush split size for Win 7
|
||||
uint reserved_ : 11;
|
||||
};
|
||||
uint value_;
|
||||
};
|
||||
|
||||
uint oclVersion_; //!< Reported OpenCL version support
|
||||
uint debugFlags_; //!< Debug GPU flags
|
||||
uint hwLDSSize_; //!< HW local data store size
|
||||
uint maxWorkGroupSize_; //!< Requested workgroup size for this device
|
||||
uint preferredWorkGroupSize_;//!< Requested preferred workgroup size for this device
|
||||
uint workloadSplitSize_; //!< Workload split size
|
||||
uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms
|
||||
uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms
|
||||
uint blitEngine_; //!< Blit engine type
|
||||
uint cacheLineSize_; //!< Cache line size in bytes
|
||||
uint cacheSize_; //!< L1 cache size in bytes
|
||||
uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings
|
||||
uint numDeviceEvents_; //!< The number of device events
|
||||
uint numWaitEvents_; //!< The number of wait events for device enqueue
|
||||
uint hostMemDirectAccess_; //!< Enables direct access to the host memory
|
||||
uint numScratchWavesPerCu_; //!< Maximum number of waves when scratch is enabled
|
||||
size_t xferBufSize_; //!< Transfer buffer size for image copy optimization
|
||||
size_t stagedXferSize_; //!< Staged buffer size
|
||||
size_t pinnedXferSize_; //!< Pinned buffer size for transfer
|
||||
size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
|
||||
size_t resourceCacheSize_; //!< Resource cache size in MB
|
||||
size_t numMemDependencies_; //!< The array size for memory dependencies tracking
|
||||
uint64_t maxAllocSize_; //!< Maximum single allocation size
|
||||
uint rgpSqttDispCount_; //!< The number of dispatches captured in SQTT
|
||||
uint maxCmdBuffers_; //!< Maximum number of command buffers allocated per queue
|
||||
uint oclVersion_; //!< Reported OpenCL version support
|
||||
uint debugFlags_; //!< Debug GPU flags
|
||||
uint hwLDSSize_; //!< HW local data store size
|
||||
uint maxWorkGroupSize_; //!< Requested workgroup size for this device
|
||||
uint preferredWorkGroupSize_; //!< Requested preferred workgroup size for this device
|
||||
uint workloadSplitSize_; //!< Workload split size
|
||||
uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms
|
||||
uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms
|
||||
uint blitEngine_; //!< Blit engine type
|
||||
uint cacheLineSize_; //!< Cache line size in bytes
|
||||
uint cacheSize_; //!< L1 cache size in bytes
|
||||
uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings
|
||||
uint numDeviceEvents_; //!< The number of device events
|
||||
uint numWaitEvents_; //!< The number of wait events for device enqueue
|
||||
uint hostMemDirectAccess_; //!< Enables direct access to the host memory
|
||||
uint numScratchWavesPerCu_; //!< Maximum number of waves when scratch is enabled
|
||||
size_t xferBufSize_; //!< Transfer buffer size for image copy optimization
|
||||
size_t stagedXferSize_; //!< Staged buffer size
|
||||
size_t pinnedXferSize_; //!< Pinned buffer size for transfer
|
||||
size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
|
||||
size_t resourceCacheSize_; //!< Resource cache size in MB
|
||||
size_t numMemDependencies_; //!< The array size for memory dependencies tracking
|
||||
uint64_t maxAllocSize_; //!< Maximum single allocation size
|
||||
uint rgpSqttDispCount_; //!< The number of dispatches captured in SQTT
|
||||
uint maxCmdBuffers_; //!< Maximum number of command buffers allocated per queue
|
||||
|
||||
uint64_t subAllocationMinSize_; //!< Minimum size allowed for suballocations
|
||||
uint64_t subAllocationMaxSize_; //!< Maximum size allowed with suballocations
|
||||
uint64_t subAllocationChunkSize_; //!< Chunk size for suballocations
|
||||
|
||||
uint64_t subAllocationMinSize_; //!< Minimum size allowed for suballocations
|
||||
uint64_t subAllocationMaxSize_; //!< Maximum size allowed with suballocations
|
||||
uint64_t subAllocationChunkSize_; //!< Chunk size for suballocations
|
||||
|
||||
amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler
|
||||
|
||||
//! Default constructor
|
||||
@@ -106,7 +106,7 @@ class Settings : public device::Settings {
|
||||
const Pal::GpuMemoryHeapProperties* heaps, //!< PAL heap settings
|
||||
const Pal::WorkStationCaps& wscaps, //!< PAL workstation settings
|
||||
bool reportAsOCL12Device = false //!< Report As OpenCL1.2 Device
|
||||
);
|
||||
);
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
@@ -119,4 +119,5 @@ class Settings : public device::Settings {
|
||||
void override();
|
||||
};
|
||||
|
||||
/*@}*/} // namespace pal
|
||||
/*@}*/ // namespace pal
|
||||
} // namespace pal
|
||||
|
||||
@@ -40,7 +40,7 @@ class TimeStamp : public amd::HeapObject {
|
||||
Pal::IGpuMemory* iMem, //!< Buffer with the timer values
|
||||
uint memOffset, //!< Offset in the buffer for the current TS
|
||||
address cpuAddr //!< CPU pointer for the values in memory
|
||||
);
|
||||
);
|
||||
|
||||
//! Default destructor
|
||||
~TimeStamp();
|
||||
@@ -114,4 +114,5 @@ class TimeStampCache : public amd::HeapObject {
|
||||
uint tsOffset_; //!< Active offset in the current mem object
|
||||
};
|
||||
|
||||
/*@}*/} // namespace pal
|
||||
/*@}*/ // namespace pal
|
||||
} // namespace pal
|
||||
|
||||
@@ -70,8 +70,7 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueTy
|
||||
if (qCreateInfo.engineType == Pal::EngineTypeExclusiveCompute) {
|
||||
if (it != gpu.dev().exclusiveComputeEnginesId().end()) {
|
||||
qCreateInfo.engineIndex = it->second;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
@@ -97,8 +96,8 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueTy
|
||||
}
|
||||
|
||||
size_t allocSize = qSize + max_command_buffers * (cmdSize + fSize);
|
||||
VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(gpu, palDev,
|
||||
residency_limit, max_command_buffers);
|
||||
VirtualGPU::Queue* queue =
|
||||
new (allocSize) VirtualGPU::Queue(gpu, palDev, residency_limit, max_command_buffers);
|
||||
if (queue != nullptr) {
|
||||
address addrQ = reinterpret_cast<address>(&queue[1]);
|
||||
// Create PAL queue object
|
||||
@@ -163,16 +162,16 @@ VirtualGPU::Queue::~Queue() {
|
||||
}
|
||||
}
|
||||
|
||||
Pal::Result VirtualGPU::Queue::UpdateAppPowerProfile()
|
||||
{
|
||||
std::wstring wsAppPathAndFileName = Device::appProfile()->wsAppPathAndFileName();
|
||||
Pal::Result VirtualGPU::Queue::UpdateAppPowerProfile() {
|
||||
std::wstring wsAppPathAndFileName = Device::appProfile()->wsAppPathAndFileName();
|
||||
|
||||
const wchar_t* wAppPathAndName = wsAppPathAndFileName.c_str();
|
||||
// Find the last occurrence of the '\\' character and extract the name of the application as wide char.
|
||||
const wchar_t* wAppNamePtr = wcsrchr(wAppPathAndName, '\\');
|
||||
const wchar_t* wAppName = wAppNamePtr ? wAppNamePtr + 1 : wAppPathAndName;
|
||||
const wchar_t* wAppPathAndName = wsAppPathAndFileName.c_str();
|
||||
// Find the last occurrence of the '\\' character and extract the name of the application as wide
|
||||
// char.
|
||||
const wchar_t* wAppNamePtr = wcsrchr(wAppPathAndName, '\\');
|
||||
const wchar_t* wAppName = wAppNamePtr ? wAppNamePtr + 1 : wAppPathAndName;
|
||||
|
||||
return iQueue_->UpdateAppPowerProfile(wAppName, wAppPathAndName);
|
||||
return iQueue_->UpdateAppPowerProfile(wAppName, wAppPathAndName);
|
||||
}
|
||||
|
||||
void VirtualGPU::Queue::addCmdMemRef(GpuMemoryReference* mem) {
|
||||
@@ -188,8 +187,7 @@ void VirtualGPU::Queue::addCmdMemRef(GpuMemoryReference* mem) {
|
||||
memRef.pGpuMemory = iMem;
|
||||
palMemRefs_.push_back(memRef);
|
||||
// Check SDI memory object
|
||||
if (iMem->Desc().flags.isExternPhys &&
|
||||
(sdiReferences_.find(iMem) == sdiReferences_.end())) {
|
||||
if (iMem->Desc().flags.isExternPhys && (sdiReferences_.find(iMem) == sdiReferences_.end())) {
|
||||
sdiReferences_.insert(iMem);
|
||||
palSdiRefs_.push_back(iMem);
|
||||
}
|
||||
@@ -268,8 +266,7 @@ bool VirtualGPU::Queue::flush() {
|
||||
// Submit command buffer to OS
|
||||
Pal::Result result;
|
||||
if (gpu_.rgpCaptureEna()) {
|
||||
result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit(
|
||||
iQueue_, cmdBufIdCurrent_, submitInfo);
|
||||
result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit(iQueue_, cmdBufIdCurrent_, submitInfo);
|
||||
} else {
|
||||
result = iQueue_->Submit(submitInfo);
|
||||
}
|
||||
@@ -383,28 +380,28 @@ void VirtualGPU::Queue::DumpMemoryReferences() const {
|
||||
if (dump.is_open()) {
|
||||
dump << start << " Queue: ";
|
||||
switch (iQueue_->Type()) {
|
||||
case Pal::QueueTypeCompute:
|
||||
dump << "Compute";
|
||||
break;
|
||||
case Pal::QueueTypeDma:
|
||||
dump << "SDMA";
|
||||
break;
|
||||
default:
|
||||
dump << "unknown";
|
||||
break;
|
||||
case Pal::QueueTypeCompute:
|
||||
dump << "Compute";
|
||||
break;
|
||||
case Pal::QueueTypeDma:
|
||||
dump << "SDMA";
|
||||
break;
|
||||
default:
|
||||
dump << "unknown";
|
||||
break;
|
||||
}
|
||||
dump << "\n"
|
||||
<< "Resident memory resources:\n";
|
||||
<< "Resident memory resources:\n";
|
||||
uint idx = 0;
|
||||
for (auto it : memReferences_) {
|
||||
dump << " " << idx << "\t[";
|
||||
dump.setf(std::ios::hex, std::ios::basefield);
|
||||
dump.setf(std::ios::showbase);
|
||||
dump << (it.first)->iMem()->Desc().gpuVirtAddr << ", "
|
||||
<< (it.first)->iMem()->Desc().gpuVirtAddr + (it.first)->iMem()->Desc().size;
|
||||
<< (it.first)->iMem()->Desc().gpuVirtAddr + (it.first)->iMem()->Desc().size;
|
||||
dump.setf(std::ios::dec);
|
||||
dump << "] CbId:" << it.second <<
|
||||
", Heap: " << (it.first)->iMem()->Desc().preferredHeap << "\n";
|
||||
dump << "] CbId:" << it.second << ", Heap: " << (it.first)->iMem()->Desc().preferredHeap
|
||||
<< "\n";
|
||||
idx++;
|
||||
}
|
||||
|
||||
@@ -414,8 +411,7 @@ void VirtualGPU::Queue::DumpMemoryReferences() const {
|
||||
for (size_t i = 0; i < signature.numParameters(); ++i) {
|
||||
const amd::KernelParameterDescriptor& desc = signature.at(i);
|
||||
// Find if the current argument is a memory object
|
||||
if ((desc.type_ == T_POINTER) &&
|
||||
(desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) {
|
||||
if ((desc.type_ == T_POINTER) && (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) {
|
||||
dump << " " << desc.name_ << ": " << std::endl;
|
||||
}
|
||||
}
|
||||
@@ -519,7 +515,7 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
|
||||
// note: The array growth shouldn't occur under the normal conditions,
|
||||
// but in a case when SVM path sends the amount of SVM ptrs over
|
||||
// the max size of kernel arguments
|
||||
MemoryState* ptr = new MemoryState[maxMemObjectsInQueue_ << 1];
|
||||
MemoryState* ptr = new MemoryState[maxMemObjectsInQueue_ << 1];
|
||||
if (nullptr == ptr) {
|
||||
numMemObjectsInQueue_ = 0;
|
||||
return;
|
||||
@@ -527,7 +523,7 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
|
||||
maxMemObjectsInQueue_ <<= 1;
|
||||
memcpy(ptr, memObjectsInQueue_, sizeof(MemoryState) * numMemObjectsInQueue_);
|
||||
delete[] memObjectsInQueue_;
|
||||
memObjectsInQueue_= ptr;
|
||||
memObjectsInQueue_ = ptr;
|
||||
}
|
||||
|
||||
// Adjust the number of active objects
|
||||
@@ -748,7 +744,6 @@ VirtualGPU::VirtualGPU(Device& device)
|
||||
maskGroups_(1),
|
||||
hsaQueueMem_(nullptr),
|
||||
cmdAllocator_(nullptr) {
|
||||
|
||||
// Note: Virtual GPU device creation must be a thread safe operation
|
||||
index_ = gpuDevice_.numOfVgpus_++;
|
||||
gpuDevice_.vgpus_.resize(gpuDevice_.numOfVgpus());
|
||||
@@ -780,8 +775,8 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
|
||||
createInfo.flags.autoMemoryReuse = false;
|
||||
createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
|
||||
createInfo.allocInfo[Pal::CommandDataAlloc].allocSize =
|
||||
createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize =
|
||||
VirtualGPU::Queue::MaxCommands * (320 + ((profiling) ? 96 : 0));
|
||||
createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize =
|
||||
VirtualGPU::Queue::MaxCommands * (320 + ((profiling) ? 96 : 0));
|
||||
|
||||
createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
|
||||
createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocSize = 64 * Ki;
|
||||
@@ -803,8 +798,9 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
|
||||
|
||||
const uint firstQueue = (dev().numComputeEngines() > 2) ? 1 : 0;
|
||||
uint idx = index() % (dev().numComputeEngines() - firstQueue);
|
||||
uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs ? 0 :
|
||||
(dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
|
||||
uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs
|
||||
? 0
|
||||
: (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
|
||||
uint max_cmd_buffers = dev().settings().maxCmdBuffers_;
|
||||
|
||||
if (dev().numComputeEngines()) {
|
||||
@@ -815,9 +811,9 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
|
||||
// hwRing_ should be set 0 if forced to have single scratch buffer
|
||||
hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx;
|
||||
|
||||
queues_[MainEngine] = Queue::Create(*this, Pal::QueueTypeCompute, idx + firstQueue,
|
||||
cmdAllocator_, rtCUs, priority,
|
||||
residency_limit, max_cmd_buffers);
|
||||
queues_[MainEngine] =
|
||||
Queue::Create(*this, Pal::QueueTypeCompute, idx + firstQueue, cmdAllocator_, rtCUs,
|
||||
priority, residency_limit, max_cmd_buffers);
|
||||
if (nullptr == queues_[MainEngine]) {
|
||||
return false;
|
||||
}
|
||||
@@ -832,20 +828,19 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
|
||||
sdma = 1;
|
||||
}
|
||||
|
||||
queues_[SdmaEngine] =
|
||||
Queue::Create(*this, Pal::QueueTypeDma, sdma, cmdAllocator_,
|
||||
amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal,
|
||||
residency_limit, max_cmd_buffers);
|
||||
queues_[SdmaEngine] = Queue::Create(
|
||||
*this, Pal::QueueTypeDma, sdma, cmdAllocator_, amd::CommandQueue::RealTimeDisabled,
|
||||
amd::CommandQueue::Priority::Normal, residency_limit, max_cmd_buffers);
|
||||
if (nullptr == queues_[SdmaEngine]) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
queues_[SdmaEngine] = Queue::Create(*this, Pal::QueueTypeCompute,
|
||||
idx, cmdAllocator_, rtCUs, amd::CommandQueue::Priority::Normal,
|
||||
residency_limit, max_cmd_buffers);
|
||||
if (nullptr == queues_[SdmaEngine]) {
|
||||
return false;
|
||||
}
|
||||
queues_[SdmaEngine] =
|
||||
Queue::Create(*this, Pal::QueueTypeCompute, idx, cmdAllocator_, rtCUs,
|
||||
amd::CommandQueue::Priority::Normal, residency_limit, max_cmd_buffers);
|
||||
if (nullptr == queues_[SdmaEngine]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Unimplemented();
|
||||
@@ -921,7 +916,8 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
|
||||
bool dbg_vmid = false;
|
||||
state_.rgpCaptureEnabled_ = true;
|
||||
dev().rgpCaptureMgr()->RegisterTimedQueue(2 * index(), queue(MainEngine).iQueue_, &dbg_vmid);
|
||||
dev().rgpCaptureMgr()->RegisterTimedQueue(2 * index() + 1, queue(SdmaEngine).iQueue_, &dbg_vmid);
|
||||
dev().rgpCaptureMgr()->RegisterTimedQueue(2 * index() + 1, queue(SdmaEngine).iQueue_,
|
||||
&dbg_vmid);
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -1511,99 +1507,99 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd) {
|
||||
void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) {
|
||||
bool unmapMip = false;
|
||||
amd::Image* amdImage;
|
||||
{
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
{
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
|
||||
pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());
|
||||
amd::Memory* owner = memory->owner();
|
||||
const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
|
||||
if (nullptr == writeMapInfo) {
|
||||
LogError("Unmap without map call");
|
||||
return;
|
||||
}
|
||||
profilingBegin(vcmd, true);
|
||||
|
||||
// Check if image is a mipmap and assign a saved view
|
||||
amdImage = owner->asImage();
|
||||
if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) &&
|
||||
(writeMapInfo->baseMip_ != nullptr)) {
|
||||
// Assign mip level view
|
||||
amdImage = writeMapInfo->baseMip_;
|
||||
// Clear unmap flags from the parent image
|
||||
memory->clearUnmapInfo(vcmd.mapPtr());
|
||||
memory = dev().getGpuMemory(amdImage);
|
||||
unmapMip = true;
|
||||
writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
|
||||
}
|
||||
|
||||
// We used host memory
|
||||
if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) {
|
||||
if (writeMapInfo->isUnmapWrite()) {
|
||||
// Target is the backing store, so sync
|
||||
owner->signalWrite(nullptr);
|
||||
memory->syncCacheFromHost(*this);
|
||||
pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());
|
||||
amd::Memory* owner = memory->owner();
|
||||
const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
|
||||
if (nullptr == writeMapInfo) {
|
||||
LogError("Unmap without map call");
|
||||
return;
|
||||
}
|
||||
// Remove memory from VA cache
|
||||
dev().removeVACache(memory);
|
||||
}
|
||||
// data check was added for persistent memory that failed to get aperture
|
||||
// and therefore are treated like a remote resource
|
||||
else if (memory->isPersistentDirectMap() && (memory->data() != nullptr)) {
|
||||
memory->unmap(this);
|
||||
} else if (memory->mapMemory() != nullptr) {
|
||||
if (writeMapInfo->isUnmapWrite()) {
|
||||
amd::Coord3D srcOrigin(0, 0, 0);
|
||||
// Target is a remote resource, so copy
|
||||
assert(memory->mapMemory() != nullptr);
|
||||
if (memory->desc().buffer_) {
|
||||
if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_,
|
||||
writeMapInfo->origin_, writeMapInfo->region_,
|
||||
writeMapInfo->isEntire())) {
|
||||
LogError("submitUnmapMemory() - copy failed");
|
||||
vcmd.setStatus(CL_OUT_OF_RESOURCES);
|
||||
}
|
||||
} else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
|
||||
Memory* memoryBuf = memory;
|
||||
amd::Coord3D origin(writeMapInfo->origin_[0]);
|
||||
amd::Coord3D size(writeMapInfo->region_[0]);
|
||||
size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize();
|
||||
origin.c[0] *= elemSize;
|
||||
size.c[0] *= elemSize;
|
||||
profilingBegin(vcmd, true);
|
||||
|
||||
amd::Memory* bufferFromImage = createBufferFromImage(vcmd.memory());
|
||||
if (nullptr == bufferFromImage) {
|
||||
LogError("We should not fail buffer creation from image_buffer!");
|
||||
// Check if image is a mipmap and assign a saved view
|
||||
amdImage = owner->asImage();
|
||||
if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) &&
|
||||
(writeMapInfo->baseMip_ != nullptr)) {
|
||||
// Assign mip level view
|
||||
amdImage = writeMapInfo->baseMip_;
|
||||
// Clear unmap flags from the parent image
|
||||
memory->clearUnmapInfo(vcmd.mapPtr());
|
||||
memory = dev().getGpuMemory(amdImage);
|
||||
unmapMip = true;
|
||||
writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
|
||||
}
|
||||
|
||||
// We used host memory
|
||||
if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) {
|
||||
if (writeMapInfo->isUnmapWrite()) {
|
||||
// Target is the backing store, so sync
|
||||
owner->signalWrite(nullptr);
|
||||
memory->syncCacheFromHost(*this);
|
||||
}
|
||||
// Remove memory from VA cache
|
||||
dev().removeVACache(memory);
|
||||
}
|
||||
// data check was added for persistent memory that failed to get aperture
|
||||
// and therefore are treated like a remote resource
|
||||
else if (memory->isPersistentDirectMap() && (memory->data() != nullptr)) {
|
||||
memory->unmap(this);
|
||||
} else if (memory->mapMemory() != nullptr) {
|
||||
if (writeMapInfo->isUnmapWrite()) {
|
||||
amd::Coord3D srcOrigin(0, 0, 0);
|
||||
// Target is a remote resource, so copy
|
||||
assert(memory->mapMemory() != nullptr);
|
||||
if (memory->desc().buffer_) {
|
||||
if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_,
|
||||
writeMapInfo->origin_, writeMapInfo->region_,
|
||||
writeMapInfo->isEntire())) {
|
||||
LogError("submitUnmapMemory() - copy failed");
|
||||
vcmd.setStatus(CL_OUT_OF_RESOURCES);
|
||||
}
|
||||
} else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
|
||||
Memory* memoryBuf = memory;
|
||||
amd::Coord3D origin(writeMapInfo->origin_[0]);
|
||||
amd::Coord3D size(writeMapInfo->region_[0]);
|
||||
size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize();
|
||||
origin.c[0] *= elemSize;
|
||||
size.c[0] *= elemSize;
|
||||
|
||||
amd::Memory* bufferFromImage = createBufferFromImage(vcmd.memory());
|
||||
if (nullptr == bufferFromImage) {
|
||||
LogError("We should not fail buffer creation from image_buffer!");
|
||||
} else {
|
||||
memoryBuf = dev().getGpuMemory(bufferFromImage);
|
||||
}
|
||||
if (!blitMgr().copyBuffer(*memory->mapMemory(), *memoryBuf, srcOrigin, origin, size,
|
||||
writeMapInfo->isEntire())) {
|
||||
LogError("submitUnmapMemory() - copy failed");
|
||||
vcmd.setStatus(CL_OUT_OF_RESOURCES);
|
||||
}
|
||||
if (nullptr != bufferFromImage) {
|
||||
bufferFromImage->release();
|
||||
}
|
||||
} else {
|
||||
memoryBuf = dev().getGpuMemory(bufferFromImage);
|
||||
}
|
||||
if (!blitMgr().copyBuffer(*memory->mapMemory(), *memoryBuf, srcOrigin, origin, size,
|
||||
writeMapInfo->isEntire())) {
|
||||
LogError("submitUnmapMemory() - copy failed");
|
||||
vcmd.setStatus(CL_OUT_OF_RESOURCES);
|
||||
}
|
||||
if (nullptr != bufferFromImage) {
|
||||
bufferFromImage->release();
|
||||
}
|
||||
} else {
|
||||
if (!blitMgr().copyBufferToImage(*memory->mapMemory(), *memory, srcOrigin,
|
||||
writeMapInfo->origin_, writeMapInfo->region_,
|
||||
writeMapInfo->isEntire())) {
|
||||
LogError("submitUnmapMemory() - copy failed");
|
||||
vcmd.setStatus(CL_OUT_OF_RESOURCES);
|
||||
if (!blitMgr().copyBufferToImage(*memory->mapMemory(), *memory, srcOrigin,
|
||||
writeMapInfo->origin_, writeMapInfo->region_,
|
||||
writeMapInfo->isEntire())) {
|
||||
LogError("submitUnmapMemory() - copy failed");
|
||||
vcmd.setStatus(CL_OUT_OF_RESOURCES);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LogError("Unhandled unmap!");
|
||||
vcmd.setStatus(CL_INVALID_VALUE);
|
||||
}
|
||||
} else {
|
||||
LogError("Unhandled unmap!");
|
||||
vcmd.setStatus(CL_INVALID_VALUE);
|
||||
|
||||
// Clear unmap flags
|
||||
memory->clearUnmapInfo(vcmd.mapPtr());
|
||||
|
||||
profilingEnd(vcmd);
|
||||
}
|
||||
|
||||
// Clear unmap flags
|
||||
memory->clearUnmapInfo(vcmd.mapPtr());
|
||||
|
||||
profilingEnd(vcmd);
|
||||
}
|
||||
// Release a view for a mipmap map
|
||||
if (unmapMip) {
|
||||
// Memory release should be outside of the execution lock,
|
||||
@@ -1700,9 +1696,9 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
|
||||
profilingBegin(cmd);
|
||||
|
||||
Memory* srcDevMem = static_cast<pal::Memory*>(
|
||||
cmd.source().getDeviceMemory(*cmd.source().getContext().devices()[0]));
|
||||
cmd.source().getDeviceMemory(*cmd.source().getContext().devices()[0]));
|
||||
Memory* dstDevMem = static_cast<pal::Memory*>(
|
||||
cmd.destination().getDeviceMemory(*cmd.destination().getContext().devices()[0]));
|
||||
cmd.destination().getDeviceMemory(*cmd.destination().getContext().devices()[0]));
|
||||
|
||||
bool p2pAllowed = false;
|
||||
#if 0
|
||||
@@ -1728,16 +1724,15 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
|
||||
amd::Coord3D dstOrigin(cmd.dstOrigin()[0]);
|
||||
|
||||
if (p2pAllowed) {
|
||||
result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin,
|
||||
size, cmd.isEntireMemory());
|
||||
}
|
||||
else {
|
||||
result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size,
|
||||
cmd.isEntireMemory());
|
||||
} else {
|
||||
amd::ScopedLock lock(dev().P2PStageOps());
|
||||
Memory* dstStgMem = static_cast<pal::Memory*>(
|
||||
dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0]));
|
||||
dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0]));
|
||||
Memory* srcStgMem = static_cast<pal::Memory*>(
|
||||
dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0]));
|
||||
|
||||
dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0]));
|
||||
|
||||
size_t copy_size = Device::kP2PStagingSize;
|
||||
size_t left_size = size[0];
|
||||
amd::Coord3D stageOffset(0);
|
||||
@@ -1750,11 +1745,11 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
|
||||
amd::Coord3D cpSize(copy_size);
|
||||
|
||||
// Perform 2 step transfer with staging buffer
|
||||
result &= dev().xferMgr().copyBuffer(
|
||||
*srcDevMem, *dstStgMem, srcOrigin, stageOffset, cpSize);
|
||||
result &=
|
||||
dev().xferMgr().copyBuffer(*srcDevMem, *dstStgMem, srcOrigin, stageOffset, cpSize);
|
||||
srcOrigin.c[0] += copy_size;
|
||||
result &= dstDevMem->dev().xferMgr().copyBuffer(
|
||||
*srcStgMem, *dstDevMem, stageOffset, dstOrigin, cpSize);
|
||||
result &= dstDevMem->dev().xferMgr().copyBuffer(*srcStgMem, *dstDevMem, stageOffset,
|
||||
dstOrigin, cpSize);
|
||||
dstOrigin.c[0] += copy_size;
|
||||
} while (left_size > 0);
|
||||
}
|
||||
@@ -1940,10 +1935,8 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQueue)
|
||||
{
|
||||
AmdAqlWrap* wraps =
|
||||
(AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]);
|
||||
void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQueue) {
|
||||
AmdAqlWrap* wraps = (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]);
|
||||
uint p = 0;
|
||||
for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) {
|
||||
if (wraps[i].state != 0) {
|
||||
@@ -1963,11 +1956,9 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
|
||||
print << "\twait_list: " << wraps[i].wait_list << "\n";
|
||||
print << "\twait_num: " << wraps[i].wait_num << "\n";
|
||||
uint offsEvents = wraps[i].wait_list - gpuDefQueue->virtualQueue_->vmAddress();
|
||||
size_t* events =
|
||||
reinterpret_cast<size_t*>(gpuDefQueue->virtualQueue_->data() + offsEvents);
|
||||
size_t* events = reinterpret_cast<size_t*>(gpuDefQueue->virtualQueue_->data() + offsEvents);
|
||||
for (j = 0; j < wraps[i].wait_num; ++j) {
|
||||
uint offs =
|
||||
static_cast<uint64_t>(events[j]) - gpuDefQueue->virtualQueue_->vmAddress();
|
||||
uint offs = static_cast<uint64_t>(events[j]) - gpuDefQueue->virtualQueue_->vmAddress();
|
||||
AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs);
|
||||
print << "Wait Event#: " << j << "\n";
|
||||
print << "\tState: " << eventD->state << "; Counter: " << eventD->counter << "\n";
|
||||
@@ -1980,8 +1971,8 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
|
||||
print << wraps[i].aql.grid_size_z << "]\n";
|
||||
|
||||
HSAILKernel* child = nullptr;
|
||||
for (auto it = hsaKernel.prog().kernels().begin();
|
||||
it != hsaKernel.prog().kernels().end(); ++it) {
|
||||
for (auto it = hsaKernel.prog().kernels().begin(); it != hsaKernel.prog().kernels().end();
|
||||
++it) {
|
||||
if (wraps[i].aql.kernel_object == static_cast<HSAILKernel*>(it->second)->gpuAqlCode()) {
|
||||
child = static_cast<HSAILKernel*>(it->second);
|
||||
}
|
||||
@@ -1995,7 +1986,7 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
|
||||
uint offsArg = kernarg_address - gpuDefQueue->virtualQueue_->vmAddress();
|
||||
address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
|
||||
print << "Kernel: " << child->name() << "\n";
|
||||
const amd::KernelSignature& signature = child->signature();
|
||||
const amd::KernelSignature& signature = child->signature();
|
||||
|
||||
// Check if runtime has to setup hidden arguments
|
||||
for (const auto it : signature.parameters()) {
|
||||
@@ -2033,7 +2024,7 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
|
||||
continue;
|
||||
}
|
||||
print << "\t" << it.name_ << ": ";
|
||||
for (int s = it.size_- 1; s >= 0; --s) {
|
||||
for (int s = it.size_ - 1; s >= 0; --s) {
|
||||
print.width(2);
|
||||
print.fill('0');
|
||||
print << static_cast<uint32_t>(argum[s]);
|
||||
@@ -2047,26 +2038,20 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::PreDeviceEnqueue(
|
||||
const amd::Kernel& kernel,
|
||||
const HSAILKernel& hsaKernel,
|
||||
VirtualGPU** gpuDefQueue,
|
||||
uint64_t* vmDefQueue)
|
||||
{
|
||||
bool VirtualGPU::PreDeviceEnqueue(const amd::Kernel& kernel, const HSAILKernel& hsaKernel,
|
||||
VirtualGPU** gpuDefQueue, uint64_t* vmDefQueue) {
|
||||
amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());
|
||||
if (nullptr == defQueue) {
|
||||
LogError("Default device queue wasn't allocated");
|
||||
return false;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
if (dev().settings().useDeviceQueue_) {
|
||||
*gpuDefQueue = static_cast<VirtualGPU*>(defQueue->vDev());
|
||||
if ((*gpuDefQueue)->hwRing() == hwRing()) {
|
||||
LogError("Can't submit the child kernels to the same HW ring as the host queue!");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
createVirtualQueue(defQueue->size());
|
||||
*gpuDefQueue = this;
|
||||
}
|
||||
@@ -2086,15 +2071,10 @@ bool VirtualGPU::PreDeviceEnqueue(
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::PostDeviceEnqueue(
|
||||
const amd::Kernel& kernel,
|
||||
const HSAILKernel& hsaKernel,
|
||||
VirtualGPU* gpuDefQueue,
|
||||
uint64_t vmDefQueue,
|
||||
uint64_t vmParentWrap,
|
||||
GpuEvent* gpuEvent)
|
||||
{
|
||||
uint32_t id = gpuEvent->id_;
|
||||
void VirtualGPU::PostDeviceEnqueue(const amd::Kernel& kernel, const HSAILKernel& hsaKernel,
|
||||
VirtualGPU* gpuDefQueue, uint64_t vmDefQueue,
|
||||
uint64_t vmParentWrap, GpuEvent* gpuEvent) {
|
||||
uint32_t id = gpuEvent->id_;
|
||||
amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());
|
||||
|
||||
// Make sure exculsive access to the device queue
|
||||
@@ -2110,16 +2090,16 @@ void VirtualGPU::PostDeviceEnqueue(
|
||||
// Add the termination handshake to the host queue
|
||||
eventBegin(MainEngine);
|
||||
iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
|
||||
vmParentWrap + offsetof(AmdAqlWrap, child_counter), 0,
|
||||
dev().settings().useDeviceQueue_);
|
||||
vmParentWrap + offsetof(AmdAqlWrap, child_counter), 0,
|
||||
dev().settings().useDeviceQueue_);
|
||||
eventEnd(MainEngine, *gpuEvent);
|
||||
}
|
||||
|
||||
// Get the global loop start before the scheduler
|
||||
Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
|
||||
static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr())
|
||||
.runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0,
|
||||
gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
|
||||
.runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0,
|
||||
gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
|
||||
const static bool FlushL2 = true;
|
||||
gpuDefQueue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue, FlushL2);
|
||||
|
||||
@@ -2127,8 +2107,7 @@ void VirtualGPU::PostDeviceEnqueue(
|
||||
//! @note DMA flush must not occur between patch and the scheduler
|
||||
Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
|
||||
// Program parameters for the scheduler
|
||||
SchedulerParam* param = reinterpret_cast<SchedulerParam*>(
|
||||
gpuDefQueue->schedParams_->data());
|
||||
SchedulerParam* param = reinterpret_cast<SchedulerParam*>(gpuDefQueue->schedParams_->data());
|
||||
param->signal = 1;
|
||||
// Scale clock to 1024 to avoid 64 bit div in the scheduler
|
||||
param->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_;
|
||||
@@ -2147,8 +2126,7 @@ void VirtualGPU::PostDeviceEnqueue(
|
||||
param->numMaxWaves = 32 * dev().info().maxComputeUnits_;
|
||||
param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_;
|
||||
addVmMemory(scratchBuf);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
param->numMaxWaves = 0;
|
||||
param->scratchSize = 0;
|
||||
param->scratch = 0;
|
||||
@@ -2162,8 +2140,8 @@ void VirtualGPU::PostDeviceEnqueue(
|
||||
Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress();
|
||||
gpuDefQueue->eventBegin(MainEngine);
|
||||
gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd(
|
||||
signalAddr, loopStart,
|
||||
gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
|
||||
signalAddr, loopStart,
|
||||
gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
|
||||
// Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call.
|
||||
// Thus TS command for profiling has to follow in the next CB.
|
||||
constexpr bool ForceSubmitFirst = true;
|
||||
@@ -2173,10 +2151,10 @@ void VirtualGPU::PostDeviceEnqueue(
|
||||
// Add the termination handshake to the host queue
|
||||
eventBegin(MainEngine);
|
||||
iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
|
||||
vmParentWrap + offsetof(AmdAqlWrap, child_counter),
|
||||
signalAddr, dev().settings().useDeviceQueue_);
|
||||
vmParentWrap + offsetof(AmdAqlWrap, child_counter), signalAddr,
|
||||
dev().settings().useDeviceQueue_);
|
||||
if (id != gpuEvent->id_) {
|
||||
LogError("Something is wrong. ID mismatch!\n");
|
||||
LogError("Something is wrong. ID mismatch!\n");
|
||||
}
|
||||
eventEnd(MainEngine, *gpuEvent);
|
||||
}
|
||||
@@ -2193,7 +2171,8 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
|
||||
profilingBegin(vcmd);
|
||||
|
||||
// Submit kernel to HW
|
||||
if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(), vcmd.sharedMemBytes())) {
|
||||
if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(),
|
||||
vcmd.sharedMemBytes())) {
|
||||
vcmd.setStatus(CL_INVALID_OPERATION);
|
||||
}
|
||||
|
||||
@@ -2203,10 +2182,9 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
|
||||
const_address parameters, bool nativeMem,
|
||||
amd::Event* enqueueEvent, uint32_t sharedMemBytes)
|
||||
{
|
||||
size_t newOffset[3] = { 0, 0, 0 };
|
||||
size_t newGlobalSize[3] = { 0, 0, 0 };
|
||||
amd::Event* enqueueEvent, uint32_t sharedMemBytes) {
|
||||
size_t newOffset[3] = {0, 0, 0};
|
||||
size_t newGlobalSize[3] = {0, 0, 0};
|
||||
|
||||
int dim = -1;
|
||||
int iteration = 1;
|
||||
@@ -2221,17 +2199,17 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
|
||||
// If RGP capturing is enabled, then start SQTT trace
|
||||
if (rgpCaptureEna()) {
|
||||
size_t newLocalSize[3] = { 1, 1, 1 };
|
||||
size_t newLocalSize[3] = {1, 1, 1};
|
||||
for (uint i = 0; i < sizes.dimensions(); i++) {
|
||||
if (sizes.local()[i] != 0) {
|
||||
newLocalSize[i] = sizes.local()[i];
|
||||
}
|
||||
}
|
||||
dev().rgpCaptureMgr()->PreDispatch(this, hsaKernel,
|
||||
// Report global size in workgroups, since that's the RGP trace semantics
|
||||
newGlobalSize[0] / newLocalSize[0],
|
||||
newGlobalSize[1] / newLocalSize[1],
|
||||
newGlobalSize[2] / newLocalSize[2]);
|
||||
dev().rgpCaptureMgr()->PreDispatch(
|
||||
this, hsaKernel,
|
||||
// Report global size in workgroups, since that's the RGP trace semantics
|
||||
newGlobalSize[0] / newLocalSize[0], newGlobalSize[1] / newLocalSize[1],
|
||||
newGlobalSize[2] / newLocalSize[2]);
|
||||
}
|
||||
|
||||
bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? true : false;
|
||||
@@ -2257,8 +2235,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
|
||||
// Check memory dependency and SVM objects
|
||||
if (!processMemObjectsHSA(kernel, parameters, nativeMem, ldsSize)) {
|
||||
LogError("Wrong memory objects!");
|
||||
return false;
|
||||
LogError("Wrong memory objects!");
|
||||
return false;
|
||||
}
|
||||
bool needFlush = false;
|
||||
// Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd
|
||||
@@ -2305,15 +2283,14 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
// an extra loop is required.
|
||||
const amd::KernelParameters& kernelParams = kernel.parameters();
|
||||
amd::Memory* const* memories =
|
||||
reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
|
||||
reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
|
||||
for (uint32_t i = 0; i < kernel.signature().numMemories(); ++i) {
|
||||
if (nativeMem) {
|
||||
Memory* gpuMem = reinterpret_cast<Memory* const*>(memories)[i];
|
||||
if (gpuMem != nullptr) {
|
||||
gpuMem->setBusy(*this, gpuEvent);
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
amd::Memory* mem = memories[i];
|
||||
if (mem != nullptr) {
|
||||
dev().getGpuMemory(mem)->setBusy(*this, gpuEvent);
|
||||
@@ -2325,7 +2302,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
uint64_t vmParentWrap = 0;
|
||||
// Program the kernel arguments for the GPU execution
|
||||
hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
|
||||
*this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap);
|
||||
*this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap);
|
||||
if (nullptr == aqlPkt) {
|
||||
LogError("Couldn't load kernel arguments");
|
||||
return false;
|
||||
@@ -2348,8 +2325,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
}
|
||||
dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
|
||||
dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
|
||||
dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ?
|
||||
enqueueEvent->profilingInfo().waves_ : 0;
|
||||
dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0;
|
||||
dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
|
||||
dispatchParam.workitemPrivateSegmentSize = hsaKernel.spillSegSize();
|
||||
dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
|
||||
@@ -2660,7 +2636,6 @@ void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) {
|
||||
eventEnd(MainEngine, gpuEvent);
|
||||
|
||||
} else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) {
|
||||
|
||||
EngineType activeEngineID = engineID_;
|
||||
engineID_ = static_cast<EngineType>(pGpuMemory->getGpuEvent(*this)->engineId_);
|
||||
|
||||
@@ -2669,8 +2644,8 @@ void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) {
|
||||
addBarrier(RgpSqqtBarrierReason::SignalSubmit, FlushL2);
|
||||
// Workarounds: We had systems where an extra delay was necessary.
|
||||
{
|
||||
// Flush CB associated with the DGMA buffer
|
||||
isDone(pGpuMemory->getGpuEvent(*this));
|
||||
// Flush CB associated with the DGMA buffer
|
||||
isDone(pGpuMemory->getGpuEvent(*this));
|
||||
}
|
||||
|
||||
eventBegin(engineID_);
|
||||
@@ -2711,10 +2686,11 @@ void VirtualGPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd
|
||||
pGpuMems[i] = pGpuMemory->iMem();
|
||||
}
|
||||
|
||||
dev().iDev()->AddGpuMemoryReferences(numObjects, pGpuMemRef, queues_[MainEngine]->iQueue_, Pal::GpuMemoryRefCantTrim);
|
||||
dev().iDev()->AddGpuMemoryReferences(numObjects, pGpuMemRef, queues_[MainEngine]->iQueue_,
|
||||
Pal::GpuMemoryRefCantTrim);
|
||||
dev().iDev()->InitBusAddressableGpuMemory(queues_[MainEngine]->iQueue_, numObjects, pGpuMems);
|
||||
if (numObjects != 0) {
|
||||
dev().iDev()->RemoveGpuMemoryReferences(numObjects, &pGpuMems[0], queues_[MainEngine]->iQueue_);
|
||||
dev().iDev()->RemoveGpuMemoryReferences(numObjects, &pGpuMems[0], queues_[MainEngine]->iQueue_);
|
||||
}
|
||||
|
||||
for (uint i = 0; i < numObjects; i++) {
|
||||
@@ -3104,8 +3080,8 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
|
||||
break;
|
||||
}
|
||||
// get svm non arugment information
|
||||
void* const* svmPtrArray = reinterpret_cast<void* const*>(
|
||||
params + kernelParams.getExecInfoOffset());
|
||||
void* const* svmPtrArray =
|
||||
reinterpret_cast<void* const*>(params + kernelParams.getExecInfoOffset());
|
||||
for (size_t i = 0; i < count; i++) {
|
||||
amd::Memory* memory = amd::MemObjMap::FindMemObj(svmPtrArray[i]);
|
||||
if (nullptr == memory) {
|
||||
@@ -3149,8 +3125,7 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
|
||||
bool srdResource = false;
|
||||
amd::Memory* const* memories =
|
||||
reinterpret_cast<amd::Memory* const*>(params + kernelParams.memoryObjOffset());
|
||||
const HSAILKernel& hsaKernel =
|
||||
static_cast<const HSAILKernel&>(*(kernel.getDeviceKernel(dev())));
|
||||
const HSAILKernel& hsaKernel = static_cast<const HSAILKernel&>(*(kernel.getDeviceKernel(dev())));
|
||||
const amd::KernelSignature& signature = kernel.signature();
|
||||
ldsAddress = hsaKernel.ldsSize();
|
||||
|
||||
@@ -3225,10 +3200,10 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
|
||||
addVmMemory(gpuMem);
|
||||
const void* globalAddress = *reinterpret_cast<const void* const*>(params + desc.offset_);
|
||||
LogPrintfInfo("!\targ%d: %s %s = ptr:%p obj:[%p-%p] threadId : %zx\n", index,
|
||||
desc.typeName_.c_str(), desc.name_.c_str(),
|
||||
globalAddress, reinterpret_cast<void*>(gpuMem->vmAddress()),
|
||||
reinterpret_cast<void*>(gpuMem->vmAddress() + gpuMem->size()),
|
||||
std::this_thread::get_id());
|
||||
desc.typeName_.c_str(), desc.name_.c_str(), globalAddress,
|
||||
reinterpret_cast<void*>(gpuMem->vmAddress()),
|
||||
reinterpret_cast<void*>(gpuMem->vmAddress() + gpuMem->size()),
|
||||
std::this_thread::get_id());
|
||||
|
||||
//! Check if compiler expects read/write.
|
||||
//! Note: SVM with subbuffers has an issue with tracking.
|
||||
@@ -3255,30 +3230,28 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
|
||||
}
|
||||
if (gpuMem->desc().isDoppTexture_) {
|
||||
addDoppRef(gpuMem, kernel.parameters().getExecNewVcop(),
|
||||
kernel.parameters().getExecPfpaVcop());
|
||||
kernel.parameters().getExecPfpaVcop());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (desc.type_ == T_VOID) {
|
||||
} else if (desc.type_ == T_VOID) {
|
||||
if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
|
||||
// Copy the current structure into CB1
|
||||
size_t gpuPtr = static_cast<size_t>(cb(1)->UploadDataToHw(
|
||||
params + desc.offset_, desc.size_));
|
||||
size_t gpuPtr =
|
||||
static_cast<size_t>(cb(1)->UploadDataToHw(params + desc.offset_, desc.size_));
|
||||
// Then use a pointer in aqlArgBuffer to CB1
|
||||
const auto it = hsaKernel.patch().find(desc.offset_);
|
||||
// Patch the GPU VA address in the original arguments
|
||||
WriteAqlArgAt(const_cast<address>(params), &gpuPtr, sizeof(size_t), it->second);
|
||||
addVmMemory(cb(1)->ActiveMemory());
|
||||
}
|
||||
}
|
||||
else if (desc.type_ == T_SAMPLER) {
|
||||
} else if (desc.type_ == T_SAMPLER) {
|
||||
srdResource = true;
|
||||
} else if (desc.type_ == T_QUEUE) {
|
||||
uint32_t index = desc.info_.arrayIndex_;
|
||||
const amd::DeviceQueue* queue = reinterpret_cast<amd::DeviceQueue* const*>(
|
||||
params + kernelParams.queueObjOffset())[index];
|
||||
const amd::DeviceQueue* queue =
|
||||
reinterpret_cast<amd::DeviceQueue* const*>(params + kernelParams.queueObjOffset())[index];
|
||||
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
|
||||
uint64_t vmQueue;
|
||||
if (dev().settings().useDeviceQueue_) {
|
||||
|
||||
@@ -51,17 +51,18 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
Queue(const Queue&) = delete;
|
||||
Queue& operator=(const Queue&) = delete;
|
||||
|
||||
static Queue* Create(const VirtualGPU& gpu, //!< OCL virtual GPU object
|
||||
Pal::QueueType queueType, //!< PAL queue type
|
||||
uint engineIdx, //!< Select particular engine index
|
||||
Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator
|
||||
uint rtCU, //!< The number of reserved CUs
|
||||
amd::CommandQueue::Priority priority, //!< Queue priority
|
||||
uint64_t residency_limit, //!< Enables residency limit
|
||||
uint max_command_buffers //!< Number of allocated command buffers
|
||||
);
|
||||
static Queue* Create(const VirtualGPU& gpu, //!< OCL virtual GPU object
|
||||
Pal::QueueType queueType, //!< PAL queue type
|
||||
uint engineIdx, //!< Select particular engine index
|
||||
Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator
|
||||
uint rtCU, //!< The number of reserved CUs
|
||||
amd::CommandQueue::Priority priority, //!< Queue priority
|
||||
uint64_t residency_limit, //!< Enables residency limit
|
||||
uint max_command_buffers //!< Number of allocated command buffers
|
||||
);
|
||||
|
||||
Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, uint max_command_buffers)
|
||||
Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit,
|
||||
uint max_command_buffers)
|
||||
: iQueue_(nullptr),
|
||||
iCmdBuffs_(max_command_buffers, nullptr),
|
||||
iCmdFences_(max_command_buffers, nullptr),
|
||||
@@ -75,8 +76,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
vlAlloc_(64 * Ki),
|
||||
residency_size_(0),
|
||||
residency_limit_(residency_limit),
|
||||
max_command_buffers_(max_command_buffers)
|
||||
{
|
||||
max_command_buffers_(max_command_buffers) {
|
||||
vlAlloc_.Init();
|
||||
}
|
||||
|
||||
@@ -100,8 +100,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
Pal::Result UpdateAppPowerProfile();
|
||||
|
||||
// ibReuse forces event wait without polling, to make sure event occured
|
||||
template <bool ibReuse>
|
||||
bool waifForFence(uint cbId) const {
|
||||
template <bool ibReuse> bool waifForFence(uint cbId) const {
|
||||
Pal::Result result = Pal::Result::Success;
|
||||
uint64_t start;
|
||||
uint64_t end;
|
||||
@@ -138,8 +137,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
//! Flushes the current command buffer to HW
|
||||
//! Returns ID associated with the submission
|
||||
template <bool avoidBarrierSubmit = false>
|
||||
uint submit(bool forceFlush);
|
||||
template <bool avoidBarrierSubmit = false> uint submit(bool forceFlush);
|
||||
|
||||
bool flush();
|
||||
|
||||
@@ -151,28 +149,28 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
uint cmdBufId() const { return cmdBufIdCurrent_; }
|
||||
|
||||
Pal::IQueue* iQueue_; //!< PAL queue object
|
||||
std::vector<Pal::ICmdBuffer*> iCmdBuffs_; //!< PAL command buffers
|
||||
std::vector<Pal::IFence*> iCmdFences_; //!< PAL fences, associated with CMD
|
||||
const amd::Kernel* last_kernel_; //!< Last submitted kernel
|
||||
Pal::IQueue* iQueue_; //!< PAL queue object
|
||||
std::vector<Pal::ICmdBuffer*> iCmdBuffs_; //!< PAL command buffers
|
||||
std::vector<Pal::IFence*> iCmdFences_; //!< PAL fences, associated with CMD
|
||||
const amd::Kernel* last_kernel_; //!< Last submitted kernel
|
||||
|
||||
private:
|
||||
private:
|
||||
void DumpMemoryReferences() const;
|
||||
const VirtualGPU& gpu_; //!< OCL virtual GPU object
|
||||
Pal::IDevice* iDev_; //!< PAL device
|
||||
uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions
|
||||
uint cmdBufIdCurrent_; //!< Current global command buffer ID
|
||||
uint cmbBufIdRetired_; //!< The last retired command buffer ID
|
||||
uint cmdCnt_; //!< Counter of commands
|
||||
const VirtualGPU& gpu_; //!< OCL virtual GPU object
|
||||
Pal::IDevice* iDev_; //!< PAL device
|
||||
uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions
|
||||
uint cmdBufIdCurrent_; //!< Current global command buffer ID
|
||||
uint cmbBufIdRetired_; //!< The last retired command buffer ID
|
||||
uint cmdCnt_; //!< Counter of commands
|
||||
std::unordered_map<GpuMemoryReference*, uint> memReferences_;
|
||||
Util::VirtualLinearAllocator vlAlloc_;
|
||||
std::vector<Pal::GpuMemoryRef> palMemRefs_;
|
||||
std::vector<Pal::IGpuMemory*> palMems_;
|
||||
std::vector<Pal::DoppRef> palDoppRefs_;
|
||||
std::set<Pal::IGpuMemory*> sdiReferences_;
|
||||
std::vector<const Pal::IGpuMemory*> palSdiRefs_;
|
||||
uint64_t residency_size_; //!< Resource residency size
|
||||
uint64_t residency_limit_; //!< Enables residency limit
|
||||
Util::VirtualLinearAllocator vlAlloc_;
|
||||
std::vector<Pal::GpuMemoryRef> palMemRefs_;
|
||||
std::vector<Pal::IGpuMemory*> palMems_;
|
||||
std::vector<Pal::DoppRef> palDoppRefs_;
|
||||
std::set<Pal::IGpuMemory*> sdiReferences_;
|
||||
std::vector<const Pal::IGpuMemory*> palSdiRefs_;
|
||||
uint64_t residency_size_; //!< Resource residency size
|
||||
uint64_t residency_limit_; //!< Enables residency limit
|
||||
uint max_command_buffers_;
|
||||
};
|
||||
|
||||
@@ -185,14 +183,14 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
CommandBatch(amd::Command* head, //!< Command batch head
|
||||
const GpuEvent* events, //!< HW events on all engines
|
||||
TimeStamp* lastTS //!< Last TS in command batch
|
||||
) {
|
||||
) {
|
||||
init(head, events, lastTS);
|
||||
}
|
||||
|
||||
void init(amd::Command* head, //!< Command batch head
|
||||
const GpuEvent* events, //!< HW events on all engines
|
||||
TimeStamp* lastTS //!< Last TS in command batch
|
||||
) {
|
||||
) {
|
||||
head_ = head;
|
||||
lastTS_ = lastTS;
|
||||
memcpy(&events_, events, AllEngines * sizeof(GpuEvent));
|
||||
@@ -202,11 +200,11 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//! The virtual GPU states
|
||||
union State {
|
||||
struct {
|
||||
uint profiling_ : 1; //!< Profiling is enabled
|
||||
uint forceWait_ : 1; //!< Forces wait in flush()
|
||||
uint profileEnabled_ : 1; //!< Profiling is enabled for WaveLimiter
|
||||
uint perfCounterEnabled_ : 1; //!< PerfCounter is enabled
|
||||
uint rgpCaptureEnabled_ : 1; //!< RGP capture is enabled in the runtime
|
||||
uint profiling_ : 1; //!< Profiling is enabled
|
||||
uint forceWait_ : 1; //!< Forces wait in flush()
|
||||
uint profileEnabled_ : 1; //!< Profiling is enabled for WaveLimiter
|
||||
uint perfCounterEnabled_ : 1; //!< PerfCounter is enabled
|
||||
uint rgpCaptureEnabled_ : 1; //!< RGP capture is enabled in the runtime
|
||||
};
|
||||
uint value_;
|
||||
State() : value_(0) {}
|
||||
@@ -259,13 +257,13 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
void findSplitSize(const Device& dev, //!< GPU device object
|
||||
uint64_t threads, //!< Total number of execution threads
|
||||
uint instructions //!< Number of ALU instructions
|
||||
);
|
||||
);
|
||||
|
||||
// Returns TRUE if DMA command buffer is ready for a flush
|
||||
bool isCbReady(VirtualGPU& gpu, //!< Virtual GPU object
|
||||
uint64_t threads, //!< Total number of execution threads
|
||||
uint instructions //!< Number of ALU instructions
|
||||
);
|
||||
);
|
||||
|
||||
// Returns dispatch split size
|
||||
uint dispatchSplitSize() const { return dispatchSplitSize_; }
|
||||
@@ -301,7 +299,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
bool nativeMem = true, //!< Native memory objects
|
||||
amd::Event* enqueueEvent = nullptr, //!< Event provided in the enqueue kernel command
|
||||
uint32_t sharedMemBytes = 0 //!< Shared memory size
|
||||
);
|
||||
);
|
||||
void submitNativeFn(amd::NativeFnCommand& vcmd);
|
||||
void submitFillMemory(amd::FillMemoryCommand& vcmd);
|
||||
void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
|
||||
@@ -331,20 +329,20 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//! Set the last known GPU event
|
||||
void setGpuEvent(GpuEvent gpuEvent, //!< GPU event for tracking
|
||||
bool flush = false //!< TRUE if flush is required
|
||||
);
|
||||
);
|
||||
|
||||
//! Flush DMA buffer on the specified engine
|
||||
void flushDMA(uint engineID //!< Engine ID for DMA flush
|
||||
);
|
||||
);
|
||||
|
||||
//! Wait for all engines on this Virtual GPU
|
||||
//! Returns TRUE if CPU didn't wait for GPU
|
||||
bool waitAllEngines(CommandBatch* cb = nullptr //!< Command batch
|
||||
);
|
||||
);
|
||||
|
||||
//! Waits for the latest GPU event with a lock to prevent multiple entries
|
||||
void waitEventLock(CommandBatch* cb //!< Command batch
|
||||
);
|
||||
);
|
||||
|
||||
//! Returns a resource associated with the constant buffer
|
||||
const ConstantBuffer* cb(uint idx) const { return constBufs_[idx]; }
|
||||
@@ -355,7 +353,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//! Start the command profiling
|
||||
void profilingBegin(amd::Command& command, //!< Command queue object
|
||||
bool drmProfiling = false //!< Measure DRM time
|
||||
);
|
||||
);
|
||||
|
||||
//! End the command profiling
|
||||
void profilingEnd(amd::Command& command);
|
||||
@@ -363,11 +361,11 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//! Collect the profiling results
|
||||
bool profilingCollectResults(CommandBatch* cb, //!< Command batch
|
||||
const amd::Event* waitingEvent //!< Waiting event
|
||||
);
|
||||
);
|
||||
|
||||
//! Adds a memory handle into the GSL memory array for Virtual Heap
|
||||
inline void addVmMemory(const Memory* memory //!< GPU memory object
|
||||
);
|
||||
);
|
||||
|
||||
//! Adds the last submitted kernel to the queue for tracking a possible hang
|
||||
inline void AddKernel(const amd::Kernel& kernel //!< AMD kernel object
|
||||
@@ -377,7 +375,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
void addDoppRef(const Memory* memory, //!< GPU memory object
|
||||
bool lastDoopCmd, //!< is the last submission for the pre-present primary
|
||||
bool pfpaDoppCmd //!< is a submission for the pre-present primary
|
||||
);
|
||||
);
|
||||
|
||||
//! Return xfer buffer for staging operations
|
||||
XferBuffer& xferWrite() { return writeBuffer_; }
|
||||
@@ -429,7 +427,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
//! Returns TRUE if virtual queue was successfully allocated
|
||||
bool createVirtualQueue(uint deviceQueueSize //!< Device queue size
|
||||
);
|
||||
);
|
||||
|
||||
EngineType engineID_; //!< Engine ID for this VirtualGPU
|
||||
|
||||
@@ -447,7 +445,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//! Returns queue, associated with VirtualGPU
|
||||
Queue& queue(EngineType id) const { return *queues_[id]; }
|
||||
|
||||
void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::Unknown, bool flushL2 = false) const {
|
||||
void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::Unknown,
|
||||
bool flushL2 = false) const {
|
||||
Pal::BarrierInfo barrier = {};
|
||||
barrier.pipePointWaitCount = 1;
|
||||
Pal::HwPipePoint point = Pal::HwPipePostCs;
|
||||
@@ -508,7 +507,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//! Returns TRUE if SDMA requires overlap synchronization
|
||||
bool validateSdmaOverlap(const Resource& src, //!< Source resource for SDMA transfer
|
||||
const Resource& dst //!< Destination resource for SDMA transfer
|
||||
);
|
||||
);
|
||||
|
||||
//! Checks if RGP capture is enabled
|
||||
bool rgpCaptureEna() const { return state_.rgpCaptureEnabled_; }
|
||||
@@ -519,7 +518,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//! Creates buffer object from image
|
||||
amd::Memory* createBufferFromImage(
|
||||
amd::Memory& amdImage //! The parent image object (untiled images only)
|
||||
);
|
||||
);
|
||||
|
||||
private:
|
||||
struct MemoryRange {
|
||||
@@ -537,14 +536,14 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//! Awaits a command batch with a waiting event
|
||||
bool awaitCompletion(CommandBatch* cb, //!< Command batch to wait for
|
||||
const amd::Event* waitingEvent = nullptr //!< A waiting event
|
||||
);
|
||||
);
|
||||
|
||||
//! Detects memory dependency for HSAIL kernels and flushes caches
|
||||
bool processMemObjectsHSA(const amd::Kernel& kernel, //!< AMD kernel object for execution
|
||||
const_address params, //!< Pointer to the param's store
|
||||
bool nativeMem, //!< Native memory objects
|
||||
size_t& ldsAddess //!< Returns LDS size, used in the kernel
|
||||
);
|
||||
size_t& ldsAddess //!< Returns LDS size, used in the kernel
|
||||
);
|
||||
|
||||
//! Common function for fill memory used by both svm Fill and non-svm fill
|
||||
bool fillMemory(cl_command_type type, //!< the command type
|
||||
@@ -553,7 +552,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
size_t patternSize, //!< pattern size
|
||||
const amd::Coord3D& origin, //!< memory origin
|
||||
const amd::Coord3D& size //!< memory size for filling
|
||||
);
|
||||
);
|
||||
|
||||
bool copyMemory(cl_command_type type, //!< the command type
|
||||
amd::Memory& srcMem, //!< source memory object
|
||||
@@ -564,35 +563,36 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
const amd::Coord3D& size, //!< copy size
|
||||
const amd::BufferRect& srcRect, //!< region of source for copy
|
||||
const amd::BufferRect& dstRect //!< region of destination for copy
|
||||
);
|
||||
);
|
||||
|
||||
void buildKernelInfo(const HSAILKernel& hsaKernel, //!< hsa kernel
|
||||
hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch
|
||||
HwDbgKernelInfo& kernelInfo, //!< kernel info for the dispatch
|
||||
amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command
|
||||
);
|
||||
);
|
||||
|
||||
void assignDebugTrapHandler(const DebugToolInfo& dbgSetting, //!< debug settings
|
||||
HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch
|
||||
);
|
||||
);
|
||||
|
||||
void PrintChildren(const HSAILKernel& hsaKernel, //!< The parent HSAIL kernel
|
||||
VirtualGPU* gpuDefQueue //!< Device queue for children execution
|
||||
);
|
||||
);
|
||||
|
||||
bool PreDeviceEnqueue(const amd::Kernel& kernel, //!< Parent amd kernel object
|
||||
const HSAILKernel& hsaKernel, //!< Parent HSAIL object
|
||||
VirtualGPU** gpuDefQueue, //!< [Return] GPU default queue
|
||||
uint64_t* vmDefQueue //!< [Return] VM handle to the virtual queue
|
||||
);
|
||||
bool PreDeviceEnqueue(const amd::Kernel& kernel, //!< Parent amd kernel object
|
||||
const HSAILKernel& hsaKernel, //!< Parent HSAIL object
|
||||
VirtualGPU** gpuDefQueue, //!< [Return] GPU default queue
|
||||
uint64_t* vmDefQueue //!< [Return] VM handle to the virtual queue
|
||||
);
|
||||
|
||||
void PostDeviceEnqueue(const amd::Kernel& kernel, //!< Parent amd kernel object
|
||||
const HSAILKernel& hsaKernel, //!< Parent HSAIL object
|
||||
VirtualGPU* gpuDefQueue, //!< GPU default queue
|
||||
uint64_t vmDefQueue, //!< VM handle to the virtual queue
|
||||
uint64_t vmParentWrap, //!< VM handle to the wrapped AQL packet location
|
||||
GpuEvent* gpuEvent //!< [Return] GPU event associated with the device enqueue
|
||||
);
|
||||
void PostDeviceEnqueue(
|
||||
const amd::Kernel& kernel, //!< Parent amd kernel object
|
||||
const HSAILKernel& hsaKernel, //!< Parent HSAIL object
|
||||
VirtualGPU* gpuDefQueue, //!< GPU default queue
|
||||
uint64_t vmDefQueue, //!< VM handle to the virtual queue
|
||||
uint64_t vmParentWrap, //!< VM handle to the wrapped AQL packet location
|
||||
GpuEvent* gpuEvent //!< [Return] GPU event associated with the device enqueue
|
||||
);
|
||||
|
||||
Device& gpuDevice_; //!< physical GPU device
|
||||
amd::Monitor execution_; //!< Lock to serialise access to all device objects
|
||||
@@ -605,11 +605,11 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
DmaFlushMgmt dmaFlushMgmt_; //!< DMA flush management
|
||||
|
||||
std::vector<amd::Memory*> pinnedMems_; //!< Pinned memory list
|
||||
std::vector<amd::Memory*> pinnedMems_; //!< Pinned memory list
|
||||
|
||||
ManagedBuffer managedBuffer_; //!< Managed write buffer
|
||||
constbufs_t constBufs_; //!< constant buffers
|
||||
XferBuffer writeBuffer_; //!< Transfer/staging buffer for uploads
|
||||
ManagedBuffer managedBuffer_; //!< Managed write buffer
|
||||
constbufs_t constBufs_; //!< constant buffers
|
||||
XferBuffer writeBuffer_; //!< Transfer/staging buffer for uploads
|
||||
|
||||
typedef std::queue<CommandBatch*> CommandBatchQueue;
|
||||
CommandBatchQueue cbQueue_; //!< Queue of command batches
|
||||
@@ -617,12 +617,12 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
uint hwRing_; //!< HW ring used on this virtual device
|
||||
|
||||
State state_; //!< virtual GPU current state
|
||||
State state_; //!< virtual GPU current state
|
||||
GpuEvent events_[AllEngines]; //!< Last known GPU events
|
||||
|
||||
uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps
|
||||
TimeStamp* lastTS_; //!< Last timestamp executed on Virtual GPU
|
||||
TimeStamp* profileTs_; //!< current profiling timestamp for command
|
||||
uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps
|
||||
TimeStamp* lastTS_; //!< Last timestamp executed on Virtual GPU
|
||||
TimeStamp* profileTs_; //!< current profiling timestamp for command
|
||||
|
||||
AmdVQueueHeader* vqHeader_; //!< Sysmem copy for virtual queue header
|
||||
Memory* virtualQueue_; //!< Virtual device queue
|
||||
@@ -645,8 +645,7 @@ inline void VirtualGPU::AddKernel(const amd::Kernel& kernel) const {
|
||||
queues_[MainEngine]->last_kernel_ = &kernel;
|
||||
}
|
||||
|
||||
template <bool avoidBarrierSubmit>
|
||||
uint VirtualGPU::Queue::submit(bool forceFlush) {
|
||||
template <bool avoidBarrierSubmit> uint VirtualGPU::Queue::submit(bool forceFlush) {
|
||||
cmdCnt_++;
|
||||
uint id = cmdBufIdCurrent_;
|
||||
bool flushCmd = ((cmdCnt_ > MaxCommands) || forceFlush) && !avoidBarrierSubmit;
|
||||
@@ -659,32 +658,30 @@ uint VirtualGPU::Queue::submit(bool forceFlush) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void WriteAqlArgAt(
|
||||
unsigned char* dst, //!< The write pointer to the buffer
|
||||
const T* src, //!< The source pointer
|
||||
uint size, //!< The size in bytes to copy
|
||||
size_t offset //!< The alignment to follow while writing to the buffer
|
||||
inline void WriteAqlArgAt(unsigned char* dst, //!< The write pointer to the buffer
|
||||
const T* src, //!< The source pointer
|
||||
uint size, //!< The size in bytes to copy
|
||||
size_t offset //!< The alignment to follow while writing to the buffer
|
||||
) {
|
||||
memcpy(dst + offset, src, size);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void WriteAqlArgAt(
|
||||
unsigned char* dst, //!< The write pointer to the buffer
|
||||
const uint32_t* src, //!< The source pointer
|
||||
uint size, //!< The size in bytes to copy
|
||||
size_t offset //!< The alignment to follow while writing to the buffer
|
||||
inline void WriteAqlArgAt(unsigned char* dst, //!< The write pointer to the buffer
|
||||
const uint32_t* src, //!< The source pointer
|
||||
uint size, //!< The size in bytes to copy
|
||||
size_t offset //!< The alignment to follow while writing to the buffer
|
||||
) {
|
||||
*(reinterpret_cast<uint32_t*>(dst + offset)) = *src;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void WriteAqlArgAt(
|
||||
unsigned char* dst, //!< The write pointer to the buffer
|
||||
const uint64_t* src, //!< The source pointer
|
||||
uint size, //!< The size in bytes to copy
|
||||
size_t offset //!< The alignment to follow while writing to the buffer
|
||||
inline void WriteAqlArgAt(unsigned char* dst, //!< The write pointer to the buffer
|
||||
const uint64_t* src, //!< The source pointer
|
||||
uint size, //!< The size in bytes to copy
|
||||
size_t offset //!< The alignment to follow while writing to the buffer
|
||||
) {
|
||||
*(reinterpret_cast<uint64_t*>(dst + offset)) = *src;
|
||||
}
|
||||
/*@}*/} // namespace pal
|
||||
/*@}*/ // namespace pal
|
||||
} // namespace pal
|
||||
|
||||
Ссылка в новой задаче
Block a user