P4 to Git Change 1780358 by gandryey@gera-win10 on 2019/05/08 18:46:22

SWDEV-79445 - OCL generic changes and code clean-up
	- Run the Google autoformat over the PAL backend. This will allow autoformat to be enabled in VS for future changes.
	- No functional changes

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palappprofile.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palappprofile.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#29 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.hpp#8 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.cpp#12 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.hpp#10 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcounters.cpp#20 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcounters.hpp#10 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugger.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugmanager.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldefs.hpp#52 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#133 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#37 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d10.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d11.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d9.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevicegl.cpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#78 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#28 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.hpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprintf.hpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#93 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#38 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#73 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#79 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#22 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltimestamp.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#132 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#60 edit


[ROCm/clr commit: 699a12bfa2]
Этот коммит содержится в:
foreman
2019-05-08 19:22:02 -04:00
родитель 9c9d74afaa
Коммит 5ea54a902a
33 изменённых файлов: 2119 добавлений и 2146 удалений
+3 -2
Просмотреть файл
@@ -11,8 +11,9 @@ namespace pal {
AppProfile::AppProfile()
: amd::AppProfile(), enableHighPerformanceState_(true), reportAsOCL12Device_(false) {
propertyDataMap_.insert({"HighPerfState", PropertyData(DataType_Boolean, &enableHighPerformanceState_)});
propertyDataMap_.insert(
{"HighPerfState", PropertyData(DataType_Boolean, &enableHighPerformanceState_)});
propertyDataMap_.insert({"OCL12Device", PropertyData(DataType_Boolean, &reportAsOCL12Device_)});
}
}
} // namespace pal
+1 -1
Просмотреть файл
@@ -20,4 +20,4 @@ class AppProfile : public amd::AppProfile {
bool enableHighPerformanceState_;
bool reportAsOCL12Device_;
};
}
} // namespace pal
+53 -53
Просмотреть файл
@@ -280,8 +280,8 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
amd::Coord3D copySize(tmpSize, 0, 0);
// Copy data into the temporary buffer, using CPU
if (!xferBuf.hostWrite(&gpu(), reinterpret_cast<const char*>(srcHost) + offset,
src, copySize, flags)) {
if (!xferBuf.hostWrite(&gpu(), reinterpret_cast<const char*>(srcHost) + offset, src, copySize,
flags)) {
return false;
}
@@ -296,7 +296,7 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
srcOffset += tmpSize;
if ((srcOffset + tmpSize) > gpu().xferWrite().MaxSize()) {
srcOffset = 0;
flags = 0;
flags = 0;
} else {
flags = Resource::NoWait;
}
@@ -310,7 +310,7 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
// Use host copy if memory has direct access or it's persistent
if (setup_.disableWriteBuffer_ ||
(gpuMem(dstMemory).isHostMemDirectAccess() &&
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
gpuMem(dstMemory).isPersistentDirectMap()) {
return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
} else {
@@ -335,7 +335,7 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
// Copy memory, using pinning
while (dstSize > 0) {
size_t tmpSize;
// If it's the first iteration, then readjust the copy size
// If it's the first iteration, then readjust the copy size
// to include alignment
if (first) {
pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment);
@@ -398,7 +398,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
// Use host copy if memory has direct access or it's persistent
if (setup_.disableWriteBufferRect_ ||
(dstMemory.isHostMemDirectAccess() &&
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
gpuMem(dstMemory).isPersistentDirectMap()) {
return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire);
} else {
@@ -586,8 +586,8 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
entire, rowPitch, slicePitch);
} else {
// Use PAL path for a transfer
result = gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin,
size, gpuMem(dstMemory));
result =
gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory));
// Check if a HostBlit transfer is required
if (completeOperation_ && !result) {
@@ -947,8 +947,8 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
void* param = kernel->parameters().values() + desc.offset_;
assert((desc.type_ == T_POINTER || value != NULL ||
(desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL)) &&
"not a valid local mem arg");
(desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL)) &&
"not a valid local mem arg");
uint32_t uint32_value = 0;
uint64_t uint64_value = 0;
@@ -957,14 +957,15 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
if (desc.type_ == T_POINTER && (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) {
if ((value == NULL) || (static_cast<const cl_mem*>(value) == NULL)) {
reinterpret_cast<Memory**>(kernel->parameters().values() +
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr;
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] =
nullptr;
} else {
// convert cl_mem to amd::Memory*, return false if invalid.
LP64_SWITCH(uint32_value, uint64_value) = static_cast<uintptr_t>((
*static_cast<Memory* const*>(value))->virtualAddress());
LP64_SWITCH(uint32_value, uint64_value) =
static_cast<uintptr_t>((*static_cast<Memory* const*>(value))->virtualAddress());
reinterpret_cast<Memory**>(kernel->parameters().values() +
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] =
*static_cast<Memory* const*>(value);
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] =
*static_cast<Memory* const*>(value);
// Note: Special case for image SRD, which is 64 bit always
if (LP64_SWITCH(true, false) &&
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject)) {
@@ -1018,8 +1019,8 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory,
bool releaseView = false;
bool result = false;
amd::Image::Format newFormat(gpuMem(dstMemory).desc().format_);
bool swapLayer = (dstView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
dev().settings().gfx10Plus_;
bool swapLayer =
(dstView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && dev().settings().gfx10Plus_;
// Find unsupported formats
for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
@@ -1078,10 +1079,10 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory,
// Swap the Y and Z components, apparently gfx10 HW expects
// layer in Z
if (swapLayer) {
globalWorkSize[2] = globalWorkSize[1];
globalWorkSize[1] = 1;
localWorkSize[2] = localWorkSize[1];
localWorkSize[1] = 1;
globalWorkSize[2] = globalWorkSize[1];
globalWorkSize[1] = 1;
localWorkSize[2] = localWorkSize[1];
localWorkSize[1] = 1;
}
} else {
globalWorkSize[0] = amd::alignUp(size[0], 8);
@@ -1114,10 +1115,10 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory,
cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0};
if (swapLayer) {
dstOrg[2] = dstOrg[1];
dstOrg[1] = 0;
copySize[2] = copySize[1];
copySize[1] = 1;
dstOrg[2] = dstOrg[1];
dstOrg[1] = 0;
copySize[2] = copySize[1];
copySize[1] = 1;
}
setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg);
@@ -1338,8 +1339,8 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory,
bool releaseView = false;
bool result = false;
amd::Image::Format newFormat(gpuMem(srcMemory).desc().format_);
bool swapLayer = (srcView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
dev().settings().gfx10Plus_;
bool swapLayer =
(srcView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && dev().settings().gfx10Plus_;
// Find unsupported formats
for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
@@ -1398,10 +1399,10 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory,
// Swap the Y and Z components, apparently gfx10 HW expects
// layer in Z
if (swapLayer) {
globalWorkSize[2] = globalWorkSize[1];
globalWorkSize[1] = 1;
localWorkSize[2] = localWorkSize[1];
localWorkSize[1] = 1;
globalWorkSize[2] = globalWorkSize[1];
globalWorkSize[1] = 1;
localWorkSize[2] = localWorkSize[1];
localWorkSize[1] = 1;
}
} else {
globalWorkSize[0] = amd::alignUp(size[0], 8);
@@ -1426,10 +1427,10 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory,
cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1], (cl_int)srcOrigin[2], 0};
cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0};
if (swapLayer) {
srcOrg[2] = srcOrg[1];
srcOrg[1] = 0;
copySize[2] = copySize[1];
copySize[1] = 1;
srcOrg[2] = srcOrg[1];
srcOrg[1] = 0;
copySize[2] = copySize[1];
copySize[1] = 1;
}
setArgument(kernels_[blitType], 4, sizeof(srcOrg), srcOrg);
uint32_t memFmtSize = gpuMem(srcMemory).elementSize();
@@ -1570,7 +1571,7 @@ bool KernelBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dst
// Program source origin
cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1], (cl_int)srcOrigin[2], 0};
if ((gpuMem(srcMemory).desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
dev().settings().gfx10Plus_) {
dev().settings().gfx10Plus_) {
srcOrg[3] = 1;
}
setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg);
@@ -1578,7 +1579,7 @@ bool KernelBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dst
// Program destination origin
cl_int dstOrg[4] = {(cl_int)dstOrigin[0], (cl_int)dstOrigin[1], (cl_int)dstOrigin[2], 0};
if ((gpuMem(dstMemory).desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
dev().settings().gfx10Plus_) {
dev().settings().gfx10Plus_) {
dstOrg[3] = 1;
}
setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg);
@@ -1700,16 +1701,15 @@ bool KernelBlitManager::writeImage(const void* srcHost, device::Memory& dstMemor
amdMemory = pinHostMemory(srcHost, pinSize, partial);
if (amdMemory == nullptr) {
// Force SW copy
result = HostBlitManager::writeImage(srcHost, dstMemory,
origin, size, rowPitch, slicePitch, entire);
result = HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch,
entire);
synchronize();
return result;
}
// Get device memory for this virtual device
srcMemory = dev().getGpuMemory(amdMemory);
pinned = true;
}
else {
} else {
srcMemory = &gpu().xferWrite().Acquire(pinSize);
srcMemory->hostWrite(&gpu(), srcHost, 0, pinSize, Resource::NoWait);
pinned = false;
@@ -1951,7 +1951,7 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
// Use host copy if memory has direct access or it's persistent
if (setup_.disableWriteBuffer_ ||
(gpuMem(dstMemory).isHostMemDirectAccess() &&
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
(gpuMem(dstMemory).memoryType() == Resource::Persistent)) {
result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
synchronize();
@@ -2002,7 +2002,7 @@ bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dst
// Use host copy if memory has direct access or it's persistent
if (setup_.disableWriteBufferRect_ ||
(gpuMem(dstMemory).isHostMemDirectAccess() &&
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
gpuMem(dstMemory).isPersistentDirectMap()) {
result = HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire);
synchronize();
@@ -2206,8 +2206,8 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
size_t localWorkSize[3];
Memory* memView = &gpuMem(memory);
amd::Image::Format newFormat(gpuMem(memory).owner()->asImage()->getImageFormat());
bool swapLayer = (memView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
dev().settings().gfx10Plus_;
bool swapLayer =
(memView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && dev().settings().gfx10Plus_;
// Program the kernels workload depending on the fill dimensions
fillType = FillImage;
@@ -2274,10 +2274,10 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
// Swap the Y and Z components, apparently gfx10 HW expects
// layer in Z
if (swapLayer) {
globalWorkSize[2] = globalWorkSize[1];
globalWorkSize[1] = 1;
localWorkSize[2] = localWorkSize[1];
localWorkSize[1] = 1;
globalWorkSize[2] = globalWorkSize[1];
globalWorkSize[1] = 1;
localWorkSize[2] = localWorkSize[1];
localWorkSize[1] = 1;
}
} else {
globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 8);
@@ -2297,10 +2297,10 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
cl_int fillOrigin[4] = {(cl_int)origin[0], (cl_int)origin[1], (cl_int)origin[2], 0};
cl_int fillSize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0};
if (swapLayer) {
fillOrigin[2] = fillOrigin[1];
fillOrigin[1] = 0;
fillSize[2] = fillSize[1];
fillSize[1] = 1;
fillOrigin[2] = fillOrigin[1];
fillOrigin[1] = 0;
fillSize[2] = fillSize[1];
fillSize[1] = 1;
}
setArgument(kernels_[fillType], 4, sizeof(fillOrigin), fillOrigin);
setArgument(kernels_[fillType], 5, sizeof(fillSize), fillSize);
+5 -4
Просмотреть файл
@@ -27,7 +27,7 @@ class DmaBlitManager : public device::HostBlitManager {
//! Constructor
DmaBlitManager(VirtualGPU& gpu, //!< Virtual GPU to be used for blits
Setup setup = Setup() //!< Specifies HW accelerated blits
);
);
//! Destructor
virtual ~DmaBlitManager() {}
@@ -211,7 +211,7 @@ class KernelBlitManager : public DmaBlitManager {
//! Constructor
KernelBlitManager(VirtualGPU& gpu, //!< Virtual GPU to be used for blits
Setup setup = Setup() //!< Specifies HW accelerated blits
);
);
//! Destructor
virtual ~KernelBlitManager();
@@ -382,7 +382,7 @@ class KernelBlitManager : public DmaBlitManager {
//! Creates a program for all blit operations
bool createProgram(Device& device //!< Device object
);
);
//! Creates a view memory object
Memory* createView(const Memory& parent, //!< Parent memory object
@@ -409,4 +409,5 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = {
"fillImage", "scheduler",
};
/*@}*/} // namespace pal
/*@}*/ // namespace pal
} // namespace pal
+17 -24
Просмотреть файл
@@ -11,12 +11,12 @@ namespace pal {
// ================================================================================================
ManagedBuffer::ManagedBuffer(VirtualGPU& gpu, uint32_t size)
: gpu_(gpu)
, pool_(MaxNumberOfBuffers)
, activeBuffer_(0)
, size_(size)
, wrtOffset_(0)
, wrtAddress_(nullptr) {}
: gpu_(gpu),
pool_(MaxNumberOfBuffers),
activeBuffer_(0),
size_(size),
wrtOffset_(0),
wrtAddress_(nullptr) {}
// ================================================================================================
void ManagedBuffer::release() {
@@ -40,8 +40,8 @@ bool ManagedBuffer::create(Resource::MemoryType type) {
pool_[i].buf->memRef()->gpu_ = &gpu_;
void* wrtAddress = pool_[i].buf->map(&gpu_);
if (wrtAddress == nullptr) {
LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_);
return false;
LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_);
return false;
}
// Make sure OCL touches every buffer in the queue to avoid delays on the first submit
uint dummy = 0;
@@ -94,15 +94,10 @@ void ManagedBuffer::pinGpuEvent() {
// ================================================================================================
ConstantBuffer::ConstantBuffer(ManagedBuffer& mbuf, uint32_t size)
: mbuf_(mbuf)
, sys_mem_copy_(nullptr)
, size_(size)
{}
: mbuf_(mbuf), sys_mem_copy_(nullptr), size_(size) {}
// ================================================================================================
ConstantBuffer::~ConstantBuffer() {
amd::AlignedMemory::deallocate(sys_mem_copy_);
}
ConstantBuffer::~ConstantBuffer() { amd::AlignedMemory::deallocate(sys_mem_copy_); }
// ================================================================================================
bool ConstantBuffer::Create() {
@@ -118,8 +113,8 @@ bool ConstantBuffer::Create() {
// ================================================================================================
uint64_t ConstantBuffer::UploadDataToHw(uint32_t size) const {
uint64_t vm_address;
address cpu_address = mbuf_.reserve(size, &vm_address);
uint64_t vm_address;
address cpu_address = mbuf_.reserve(size, &vm_address);
// Update memory with new CB data
memcpy(cpu_address, sys_mem_copy_, size);
return vm_address;
@@ -127,8 +122,8 @@ uint64_t ConstantBuffer::UploadDataToHw(uint32_t size) const {
// ================================================================================================
uint64_t ConstantBuffer::UploadDataToHw(const void* sysmem, uint32_t size) const {
uint64_t vm_address;
address cpu_address = mbuf_.reserve(size, &vm_address);
uint64_t vm_address;
address cpu_address = mbuf_.reserve(size, &vm_address);
// Update memory with new CB data
memcpy(cpu_address, sysmem, size);
return vm_address;
@@ -136,9 +131,7 @@ uint64_t ConstantBuffer::UploadDataToHw(const void* sysmem, uint32_t size) const
// ================================================================================================
XferBuffer::XferBuffer(const Device& device, ManagedBuffer& mbuf, uint32_t size)
: buffer_view_(device, size)
, mbuf_(mbuf)
, size_(size) {
: buffer_view_(device, size), mbuf_(mbuf), size_(size) {
// Create a view for access
Resource::ViewParams params = {};
params.gpu_ = &mbuf_.gpu();
@@ -151,9 +144,9 @@ XferBuffer::XferBuffer(const Device& device, ManagedBuffer& mbuf, uint32_t size)
// ================================================================================================
Memory& XferBuffer::Acquire(uint32_t size) {
uint64_t vm_address;
uint64_t vm_address;
// Reserve space in the managed buffer
address cpu_address = mbuf_.reserve(size, &vm_address);
address cpu_address = mbuf_.reserve(size, &vm_address);
// Update a view for access
buffer_view_.updateView(mbuf_.activeMemory(), vm_address - mbuf_.vmAddress(), size);
return buffer_view_;
+42 -39
Просмотреть файл
@@ -12,9 +12,9 @@ namespace pal {
class ManagedBuffer : public amd::EmbeddedObject {
public:
//! Constructor for the ConstBuffer class
ManagedBuffer(VirtualGPU& gpu, //!< Virtual GPU device object
uint32_t size //!< size of the managed buffers in bytes
);
ManagedBuffer(VirtualGPU& gpu, //!< Virtual GPU device object
uint32_t size //!< size of the managed buffers in bytes
);
~ManagedBuffer() {}
//! Creates the managed buffers
@@ -50,8 +50,8 @@ class ManagedBuffer : public amd::EmbeddedObject {
private:
struct TimeStampedBuffer {
Memory* buf;
GpuEvent events[AllEngines];
Memory* buf;
GpuEvent events[AllEngines];
};
//! The maximum number of the managed buffers
@@ -63,21 +63,21 @@ class ManagedBuffer : public amd::EmbeddedObject {
//! Disable operator=
ManagedBuffer& operator=(const ManagedBuffer&) = delete;
VirtualGPU& gpu_; //!< Virtual GPU object
std::vector<TimeStampedBuffer> pool_; //!< Buffers for management
uint32_t activeBuffer_; //!< Current active buffer
uint32_t size_; //!< Constant buffer size
uint32_t wrtOffset_; //!< Current write offset
address wrtAddress_; //!< Write address in CB
VirtualGPU& gpu_; //!< Virtual GPU object
std::vector<TimeStampedBuffer> pool_; //!< Buffers for management
uint32_t activeBuffer_; //!< Current active buffer
uint32_t size_; //!< Constant buffer size
uint32_t wrtOffset_; //!< Current write offset
address wrtAddress_; //!< Write address in CB
};
//! Constant buffer
class ConstantBuffer : public amd::HeapObject {
public:
public:
//! Constructor for the ConstBuffer class
ConstantBuffer(ManagedBuffer& mbuf, //!< Managed buffer
uint32_t size //!< Max size of the constant buffer
);
uint32_t size //!< Max size of the constant buffer
);
//! Destructor for the ConstBuffer class
~ConstantBuffer();
@@ -86,18 +86,18 @@ public:
bool Create();
/*! \brief Uploads current constant buffer data from sysMemCopy_ to HW
*
* \return GPU address for the uploaded data
*/
*
* \return GPU address for the uploaded data
*/
uint64_t UploadDataToHw(uint32_t size //!< real data size for upload
) const;
/*! \brief Uploads current constant buffer data from sysMemCopy_ to HW
*
* \return GPU address for the uploaded data
*/
*
* \return GPU address for the uploaded data
*/
uint64_t UploadDataToHw(const void* sysmem, //!< Pointer to the data for upload
uint32_t size //!< Real data size for upload
uint32_t size //!< Real data size for upload
) const;
//! Returns a pointer to the system memory copy for CB
@@ -106,52 +106,55 @@ public:
//! Returns active GPU buffer
Memory* ActiveMemory() const { return mbuf_.activeMemory(); }
private:
private:
//! Disable copy constructor
ConstantBuffer(const ConstantBuffer&) = delete;
//! Disable operator=
ConstantBuffer& operator=(const ConstantBuffer&) = delete;
ManagedBuffer& mbuf_; //!< Managed buffer on GPU
address sys_mem_copy_; //!< System memory copy
uint32_t size_; //!< Constant buffer size
ManagedBuffer& mbuf_; //!< Managed buffer on GPU
address sys_mem_copy_; //!< System memory copy
uint32_t size_; //!< Constant buffer size
};
//! Staging buffer
class XferBuffer : public amd::EmbeddedObject {
public:
public:
//! Constructor for the ConstBuffer class
XferBuffer(const Device& device, //!< Active GPU device
XferBuffer(const Device& device, //!< Active GPU device
ManagedBuffer& mbuf, //!< Managed buffer
uint32_t size //!< Maximum size of the transfer buffer
uint32_t size //!< Maximum size of the transfer buffer
);
//! Destructor for the ConstBuffer class
~XferBuffer() {}
/*! \brief Acquires free memory from the managed buffer
*
* \return GPU memory object associated with free memory
*/
Memory& Acquire(uint32_t size //!< data size for transfers
);
*
* \return GPU memory object associated with free memory
*/
Memory& Acquire(uint32_t size //!< data size for transfers
);
//! Releases memory object used in the staging transfer
void Release(Memory& mem //!< Memory object for release
) { buffer_view_.updateView(nullptr, 0, 0); }
) {
buffer_view_.updateView(nullptr, 0, 0);
}
size_t MaxSize() const { return static_cast<size_t>(size_); }
private:
private:
//! Disable copy constructor
XferBuffer(const XferBuffer&) = delete;
//! Disable operator=
XferBuffer& operator=(const XferBuffer&) = delete;
Memory buffer_view_; //!< Buffer view returned in the acquire
ManagedBuffer& mbuf_; //!< Managed buffer on GPU
uint32_t size_; //!< Mx staging buffer size
Memory buffer_view_; //!< Buffer view returned in the acquire
ManagedBuffer& mbuf_; //!< Managed buffer on GPU
uint32_t size_; //!< Mx staging buffer size
};
/*@}*/} // namespace pal
/*@}*/ // namespace pal
} // namespace pal
+6 -6
Просмотреть файл
@@ -676,12 +676,12 @@ void PerfCounter::convertInfo() {
break;
case Pal::GfxIpLevel::GfxIp10:
case Pal::GfxIpLevel::GfxIp10_1:
if (info_.blockIndex_ < gfx10BlockIdPal.size()) {
auto p = gfx10BlockIdPal[info_.blockIndex_];
info_.blockIndex_ = std::get<0>(p);
info_.counterIndex_ = std::get<1>(p);
}
break;
if (info_.blockIndex_ < gfx10BlockIdPal.size()) {
auto p = gfx10BlockIdPal[info_.blockIndex_];
info_.blockIndex_ = std::get<0>(p);
info_.counterIndex_ = std::get<1>(p);
}
break;
default:
Unimplemented();
break;
+1 -2
Просмотреть файл
@@ -84,8 +84,7 @@ class PerfCounter : public device::PerfCounter {
cl_uint blockIndex, //!< HW block index
cl_uint counterIndex, //!< Counter index within the block
cl_uint eventIndex) //!< Event index for profiling
: gpuDevice_(device),
palRef_(palRef) {
: gpuDevice_(device), palRef_(palRef) {
info_.blockIndex_ = blockIndex;
info_.counterIndex_ = counterIndex;
info_.eventIndex_ = eventIndex;
+4 -4
Просмотреть файл
@@ -98,10 +98,10 @@ struct HwDebugWaveAddr {
};
/*! \brief Kernel code information
*
* This structure contains the pointer of mapped kernel code for host access
* and its size (in bytes)
*/
*
* This structure contains the pointer of mapped kernel code for host access
* and its size (in bytes)
*/
struct AqlCodeInfo {
amd_kernel_code_t* aqlCode_; //! pointer of AQL code to allow host access
uint32_t aqlCodeSize_; //! size of AQL code
+1 -1
Просмотреть файл
@@ -143,7 +143,7 @@ void GpuDebugManager::unregisterDebugger() {
void GpuDebugManager::flushCache(uint32_t mask) {
HwDbgGpuCacheMask cacheMask(mask);
//device()->xferQueue()->flushCuCaches(cacheMask);
// device()->xferQueue()->flushCuCaches(cacheMask);
}
+100 -74
Просмотреть файл
@@ -47,9 +47,9 @@ struct GpuEvent {
static constexpr uint32_t InvalidID = ((1 << 30) - 1);
struct {
uint32_t id_ : 30; ///< Actual event id
uint32_t modified_ : 1; ///< Resource associated with the event was modified
uint32_t engineId_ : 1; ///< Type of the id
uint32_t id_ : 30; ///< Actual event id
uint32_t modified_ : 1; ///< Resource associated with the event was modified
uint32_t engineId_ : 1; ///< Type of the id
};
//! GPU event default constructor
GpuEvent() : id_(InvalidID), modified_(false), engineId_(MainEngine) {}
@@ -63,8 +63,11 @@ struct GpuEvent {
void invalidate() { id_ = InvalidID; }
// Overwrite default assign operator to preserve modified_ field
GpuEvent& operator=(const GpuEvent& evt)
{ id_ = evt.id_; engineId_ = evt.engineId_; return *this; }
GpuEvent& operator=(const GpuEvent& evt) {
id_ = evt.id_;
engineId_ = evt.engineId_;
return *this;
}
};
/*! \addtogroup PAL
@@ -113,87 +116,110 @@ const static uint HsaSamplerObjectAlignment = 16;
const static uint DeviceQueueMaskSize = 32;
struct AMDDeviceInfo {
const char* targetName_; //!< Target name
const char* machineTarget_; //!< Machine target
const char* machineTargetLC_;//!< Machine target for LC
uint simdPerCU_; //!< Number of SIMDs per CU
uint simdWidth_; //!< Number of workitems processed per SIMD
uint simdInstructionWidth_; //!< Number of instructions processed per SIMD
uint memChannelBankWidth_; //!< Memory channel bank width
uint localMemSizePerCU_; //!< Local memory size per CU
uint localMemBanks_; //!< Number of banks of local memory
uint gfxipVersionLC_; //!< The core engine GFXIP version for LC
uint gfxipVersion_; //!< The core engine GFXIP version
bool xnackEnabled_; //!< Enable XNACK feature
const char* targetName_; //!< Target name
const char* machineTarget_; //!< Machine target
const char* machineTargetLC_; //!< Machine target for LC
uint simdPerCU_; //!< Number of SIMDs per CU
uint simdWidth_; //!< Number of workitems processed per SIMD
uint simdInstructionWidth_; //!< Number of instructions processed per SIMD
uint memChannelBankWidth_; //!< Memory channel bank width
uint localMemSizePerCU_; //!< Local memory size per CU
uint localMemBanks_; //!< Number of banks of local memory
uint gfxipVersionLC_; //!< The core engine GFXIP version for LC
uint gfxipVersion_; //!< The core engine GFXIP version
bool xnackEnabled_; //!< Enable XNACK feature
};
static const AMDDeviceInfo DeviceInfo[] = {
/* Unknown */ {"", "unknown", "", 4, 16, 1, 256, 64 * Ki, 32, 0, 0, false},
/* Tahiti */ {"", "tahiti", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
/* Pitcairn */ {"", "pitcairn", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
/* Capeverde */ {"", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false},
/* Oland */ {"", "oland", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
/* Hainan */ {"", "hainan", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
/* Unknown */ {"", "unknown", "", 4, 16, 1, 256, 64 * Ki, 32, 0, 0, false},
/* Tahiti */ {"", "tahiti", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
/* Pitcairn */ {"", "pitcairn", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
/* Capeverde */ {"", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false},
/* Oland */ {"", "oland", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
/* Hainan */ {"", "hainan", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
/* Bonaire */ {"Bonaire", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false},
/* Hawaii */ {"Hawaii", "hawaii", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
/* Hawaii */ {"", "grenada", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
/* Hawaii */ {"", "maui", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
/* Bonaire */ {"Bonaire", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false},
/* Hawaii */ {"Hawaii", "hawaii", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
/* Hawaii */ {"", "grenada", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
/* Hawaii */ {"", "maui", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
/* Kalindi */ {"Kalindi", "kalindi", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false},
/* Godavari */ {"Mullins", "mullins", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false},
/* Spectre */ {"Spectre", "spectre", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
/* Spooky */ {"Spooky", "spooky", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
/* Kalindi */ {"Kalindi", "kalindi", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false},
/* Godavari */ {"Mullins", "mullins", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false},
/* Spectre */ {"Spectre", "spectre", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
/* Spooky */ {"Spooky", "spooky", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
/* Carrizo */ {"Carrizo", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801,false},
/* Bristol */ {"Bristol Ridge", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801, false},
/* Stoney */ {"Stoney", "stoney", "", 4, 16, 1, 256, 64 * Ki, 32, 810, 810, false},
/* Carrizo */ {"Carrizo", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801, false},
/* Bristol */ {"Bristol Ridge", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801, false},
/* Stoney */ {"Stoney", "stoney", "", 4, 16, 1, 256, 64 * Ki, 32, 810, 810, false},
/* Iceland */ {"Iceland", "iceland", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false},
/* Tonga */ {"Tonga", "tonga", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false},
/* Fiji */ {"Fiji", "fiji", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
/* Ellesmere */ {"Ellesmere", "ellesmere", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
/* Baffin */ {"Baffin", "baffin", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
/* Lexa */ {"gfx804", "gfx804", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
/* Iceland */ {"Iceland", "iceland", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false},
/* Tonga */ {"Tonga", "tonga", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false},
/* Fiji */ {"Fiji", "fiji", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
/* Ellesmere */
{"Ellesmere", "ellesmere", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
/* Baffin */ {"Baffin", "baffin", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
/* Lexa */ {"gfx804", "gfx804", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
};
// Ordering as per AsicRevision# in //depot/stg/pal/inc/core/palDevice.h and
// http://confluence.amd.com/pages/viewpage.action?spaceKey=ASLC&title=AMDGPU+Target+Names
// Entries come in (base, XNACK) pairs. NullDevice::create() and Device::create() pick one with
//   ((asicRevision % AsicRevision::Vega10) << 1) | xnack
// so pairs must never be dropped or reordered, and every device appears exactly once.
// NOTE(review): the trailing columns look like the two gfxip version numbers followed by the
// XNACK flag - confirm the exact field order against the AMDDeviceInfo declaration.
static const AMDDeviceInfo Gfx9PlusSubDeviceInfo[] = {
    /* Vega10 */ {"gfx900", "gfx900", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 900, 900, false},
    /* Vega10 XNACK */
    {LIGHTNING_SWITCH("gfx900", "gfx901"), "gfx901", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 900, 901,
     true},
    /* Vega12 */ {"gfx904", "gfx904", "gfx904", 4, 16, 1, 256, 64 * Ki, 32, 904, 904, false},
    /* Vega12 XNACK */
    {LIGHTNING_SWITCH("gfx904", "gfx905"), "gfx905", "gfx904", 4, 16, 1, 256, 64 * Ki, 32, 904, 905,
     true},
    /* Vega20 */ {"gfx906", "gfx906", "gfx906", 4, 16, 1, 256, 64 * Ki, 32, 906, 906, false},
    /* Vega20 XNACK */
    {LIGHTNING_SWITCH("gfx906", "gfx907"), "gfx907", "gfx906", 4, 16, 1, 256, 64 * Ki, 32, 906, 907,
     true},
    /* Raven */ {"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
    /* Raven XNACK */
    {LIGHTNING_SWITCH("gfx902", "gfx903"), "gfx903", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 903,
     true},
    /* Raven2 */ {"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
    /* Raven2 XNACK */
    {LIGHTNING_SWITCH("gfx902", "gfx903"), "gfx903", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 903,
     true},
    /* Renoir */ {"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
    /* Renoir XNACK */
    {LIGHTNING_SWITCH("gfx902", "gfx903"), "gfx903", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 903,
     true},
    /* Navi10_A0 */
    {"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, false},
    /* Navi10_A0 XNACK */
    {"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, true},
    /* Navi10 */
    {"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, false},
    /* Navi10 XNACK */
    {"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, true},
    /* Navi10Lite */
    {"gfx1000", "gfx1000", "gfx1000", 2, 32, 1, 256, 64 * Ki, 32, 1000, 1000, false},
    /* Navi10Lite XNACK */
    {"gfx1000", "gfx1000", "gfx1000", 2, 32, 1, 256, 64 * Ki, 32, 1000, 1000, true},
    /* Navi12 */
    {"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, false},
    /* Navi12 XNACK */
    {"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, true},
    /* Navi12Lite */
    {"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, false},
    /* Navi12Lite XNACK */
    {"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, true},
    /* Navi14 */
    {"gfx1012", "gfx1012", "gfx1012", 2, 32, 1, 256, 64 * Ki, 32, 1012, 1012, false},
    /* Navi14 XNACK */
    {"gfx1012", "gfx1012", "gfx1012", 2, 32, 1, 256, 64 * Ki, 32, 1012, 1012, true},
    /* UnknownDevice3 */
    {"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, false},
    /* UnknownDevice3 XNACK */
    {"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, true},
    /* UnknownDevice2 */
    {"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, false},
    /* UnknownDevice2 XNACK */
    {"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, true},
};
+203 -207
View file
@@ -53,15 +53,14 @@ void PalDeviceUnload() { pal::Device::tearDown(); }
namespace pal {
// Out-of-class storage for the static data members of NullDevice and Device.
// Each member is defined exactly once; a second definition is a redefinition error.
Util::GenericAllocator NullDevice::allocator_;
char* Device::platformObj_;
Pal::IPlatform* Device::platform_;

NullDevice::Compiler* NullDevice::compiler_;
AppProfile Device::appProfile_;
// Builds an offline device placeholder: no IP level selected and no HW info table entry
// attached yet (both are assigned later by create()).
NullDevice::NullDevice() : amd::Device(), ipLevel_(Pal::GfxIpLevel::None), hwInfo_(nullptr) {}
bool NullDevice::init() {
std::vector<Device*> devices;
@@ -89,8 +88,8 @@ bool NullDevice::init() {
driverVersion = static_cast<amd::Device*>(devices[i])->info().driverVersion_;
if (driverVersion.find("PAL") != std::string::npos) {
if (static_cast<NullDevice*>(devices[i])->asicRevision() == revision) {
foundActive = true;
break;
foundActive = true;
break;
}
}
}
@@ -109,132 +108,130 @@ bool NullDevice::init() {
}
}
}
#endif // defined(WITH_COMPILER_LIB)
#endif // defined(WITH_COMPILER_LIB)
// Loop through all supported devices and create each of them
for (uint id = 0;
id < sizeof(Gfx9PlusSubDeviceInfo)/sizeof(AMDDeviceInfo); ++id) {
bool foundActive = false;
bool foundDuplicate = false;
uint gfxipVersion = IS_LIGHTNING ? pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_ :
pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_;
for (uint id = 0; id < sizeof(Gfx9PlusSubDeviceInfo) / sizeof(AMDDeviceInfo); ++id) {
bool foundActive = false;
bool foundDuplicate = false;
uint gfxipVersion = IS_LIGHTNING ? pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_
: pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_;
if (pal::Gfx9PlusSubDeviceInfo[id].targetName_[0] == '\0') {
continue;
}
if (pal::Gfx9PlusSubDeviceInfo[id].targetName_[0] == '\0') {
continue;
}
// Loop through all active PAL devices and see if we match one
for (uint i = 0; i < devices.size(); ++i) {
driverVersion = static_cast<amd::Device*>(devices[i])->info().driverVersion_;
if (driverVersion.find("PAL") != std::string::npos) {
gfxipVersion = devices[i]->settings().useLightning_ ?
pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_ :
pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_;
uint gfxIpCurrent = devices[i]->settings().useLightning_ ?
static_cast<NullDevice*>(devices[i])->hwInfo()->gfxipVersionLC_ :
static_cast<NullDevice*>(devices[i])->hwInfo()->gfxipVersion_;
if (gfxIpCurrent == gfxipVersion) {
foundActive = true;
break;
}
// Loop through all active PAL devices and see if we match one
for (uint i = 0; i < devices.size(); ++i) {
driverVersion = static_cast<amd::Device*>(devices[i])->info().driverVersion_;
if (driverVersion.find("PAL") != std::string::npos) {
gfxipVersion = devices[i]->settings().useLightning_
? pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_
: pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_;
uint gfxIpCurrent = devices[i]->settings().useLightning_
? static_cast<NullDevice*>(devices[i])->hwInfo()->gfxipVersionLC_
: static_cast<NullDevice*>(devices[i])->hwInfo()->gfxipVersion_;
if (gfxIpCurrent == gfxipVersion) {
foundActive = true;
break;
}
}
}
// Don't report an offline device if it's active
if (foundActive) {
continue;
// Don't report an offline device if it's active
if (foundActive) {
continue;
}
// Loop through all previous devices in the Gfx9PlusSubDeviceInfo list
// and compare them with the current entry to see if the current entry
// was listed previously in the Gfx9PlusSubDeviceInfo, if so, then it
// means the current entry already has been added in the offline device list
for (uint j = 0; j < id; ++j) {
if (pal::Gfx9PlusSubDeviceInfo[j].targetName_[0] == '\0') {
continue;
}
// Loop through all previous devices in the Gfx9PlusSubDeviceInfo list
// and compare them with the current entry to see if the current entry
// was listed previously in the Gfx9PlusSubDeviceInfo, if so, then it
// means the current entry already has been added in the offline device list
for (uint j = 0; j < id; ++j) {
if (pal::Gfx9PlusSubDeviceInfo[j].targetName_[0] == '\0') {
continue;
}
if (strcmp(pal::Gfx9PlusSubDeviceInfo[j].targetName_,
pal::Gfx9PlusSubDeviceInfo[id].targetName_) == 0) {
foundDuplicate = true;
break;
}
if (strcmp(pal::Gfx9PlusSubDeviceInfo[j].targetName_,
pal::Gfx9PlusSubDeviceInfo[id].targetName_) == 0) {
foundDuplicate = true;
break;
}
}
// Don't report an offline device twice
if (foundDuplicate) {
continue;
}
// Don't report an offline device twice
if (foundDuplicate) {
continue;
}
Pal::GfxIpLevel ipLevel = Pal::GfxIpLevel::_None;
uint ipLevelMajor = round(gfxipVersion / 100);
uint ipLevelMinor = round(gfxipVersion / 10 % 10);
switch (ipLevelMajor) {
Pal::GfxIpLevel ipLevel = Pal::GfxIpLevel::_None;
uint ipLevelMajor = round(gfxipVersion / 100);
uint ipLevelMinor = round(gfxipVersion / 10 % 10);
switch (ipLevelMajor) {
case 9:
ipLevel = Pal::GfxIpLevel::GfxIp9;
break;
ipLevel = Pal::GfxIpLevel::GfxIp9;
break;
case 10:
switch (ipLevelMinor) {
case 0:
ipLevel = Pal::GfxIpLevel::GfxIp10;
break;
case 1:
ipLevel = Pal::GfxIpLevel::GfxIp10_1;
break;
case 2:
ipLevel = Pal::GfxIpLevel::GfxIp10_2;
break;
case 3:
ipLevel = Pal::GfxIpLevel::GfxIp10_3;
break;
case 0:
ipLevel = Pal::GfxIpLevel::GfxIp10;
break;
case 1:
ipLevel = Pal::GfxIpLevel::GfxIp10_1;
break;
case 2:
ipLevel = Pal::GfxIpLevel::GfxIp10_2;
break;
case 3:
ipLevel = Pal::GfxIpLevel::GfxIp10_3;
break;
}
}
}
Pal::AsicRevision revision = Pal::AsicRevision::Unknown;
uint xNACKSupported = pal::Gfx9PlusSubDeviceInfo[id].xnackEnabled_ ? 1 : 0;
Pal::AsicRevision revision = Pal::AsicRevision::Unknown;
uint xNACKSupported = pal::Gfx9PlusSubDeviceInfo[id].xnackEnabled_ ? 1 : 0;
switch (gfxipVersion) {
switch (gfxipVersion) {
case 901:
case 900:
revision = Pal::AsicRevision::Vega10;
break;
revision = Pal::AsicRevision::Vega10;
break;
case 903:
case 902:
revision = Pal::AsicRevision::Raven;
break;
revision = Pal::AsicRevision::Raven;
break;
case 905:
case 904:
revision = Pal::AsicRevision::Vega12;
break;
revision = Pal::AsicRevision::Vega12;
break;
case 907:
case 906:
revision = Pal::AsicRevision::Vega20;
break;
revision = Pal::AsicRevision::Vega20;
break;
case 1000:
revision = Pal::AsicRevision::Navi10Lite;
break;
revision = Pal::AsicRevision::Navi10Lite;
break;
case 1010:
revision = Pal::AsicRevision::Navi10;
break;
revision = Pal::AsicRevision::Navi10;
break;
case 1011:
revision = Pal::AsicRevision::Navi12;
break;
revision = Pal::AsicRevision::Navi12;
break;
case 1012:
revision = Pal::AsicRevision::Navi14;
break;
revision = Pal::AsicRevision::Navi14;
break;
case 1030:
ShouldNotReachHere();
break;
}
ShouldNotReachHere();
break;
}
NullDevice* dev = new NullDevice();
if (nullptr != dev) {
if (!dev->create(revision, ipLevel, xNACKSupported)) {
delete dev;
}
else {
dev->registerDevice();
}
NullDevice* dev = new NullDevice();
if (nullptr != dev) {
if (!dev->create(revision, ipLevel, xNACKSupported)) {
delete dev;
} else {
dev->registerDevice();
}
}
}
return true;
@@ -257,10 +254,10 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel,
if ((GPU_ENABLE_PAL == 1) && (ipLevel == Pal::GfxIpLevel::_None)) {
hwInfo_ = &DeviceInfo[static_cast<uint>(asicRevision)];
} else if (ipLevel >= Pal::GfxIpLevel::GfxIp9) {
subtarget = (static_cast<uint>(asicRevision_) %
static_cast<uint>(Pal::AsicRevision::Vega10))
<< 1 | xNACKSupported;
hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget];
subtarget = (static_cast<uint>(asicRevision_) % static_cast<uint>(Pal::AsicRevision::Vega10))
<< 1 |
xNACKSupported;
hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget];
} else {
return false;
@@ -271,8 +268,7 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel,
// Report 512MB for all offline devices
Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount];
heaps[Pal::GpuHeapLocal].heapSize =
heaps[Pal::GpuHeapLocal].physicalHeapSize = 512 * Mi;
heaps[Pal::GpuHeapLocal].heapSize = heaps[Pal::GpuHeapLocal].physicalHeapSize = 512 * Mi;
Pal::WorkStationCaps wscaps = {};
@@ -295,7 +291,7 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel,
info_.wavefrontWidth_ = settings().enableWave32Mode_ ? 32 : 64;
if (settings().useLightning_) {
#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
#if defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY)
// create compilation object with cache support
int gfxipMajor = hwInfo_->gfxipVersionLC_ / 100;
int gfxipMinor = hwInfo_->gfxipVersionLC_ / 10 % 10;
@@ -323,16 +319,16 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel,
cacheCompilation_.reset(compObj);
#endif
} else {
#if defined(WITH_COMPILER_LIB)
#if defined(WITH_COMPILER_LIB)
const char* library = getenv("HSA_COMPILER_LIBRARY");
aclCompilerOptions opts = { sizeof(aclCompilerOptions_0_8),
library,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
AMD_OCL_SC_LIB };
aclCompilerOptions opts = {sizeof(aclCompilerOptions_0_8),
library,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
AMD_OCL_SC_LIB};
// Initialize the compiler handle
acl_error error;
compiler_ = aclCompilerInit(&opts, &error);
@@ -370,9 +366,9 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
info_.maxWorkItemDimensions_ = 3;
info_.maxComputeUnits_ = settings().enableWgpMode_ ?
palProp.gfxipProperties.shaderCore.numAvailableCus / 2 :
palProp.gfxipProperties.shaderCore.numAvailableCus;
info_.maxComputeUnits_ = settings().enableWgpMode_
? palProp.gfxipProperties.shaderCore.numAvailableCus / 2
: palProp.gfxipProperties.shaderCore.numAvailableCus;
info_.numberOfShaderEngines = palProp.gfxipProperties.shaderCore.numShaderEngines;
@@ -427,7 +423,8 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
if (GPU_ADD_HBCC_SIZE) {
localRAM = heaps[Pal::GpuHeapLocal].heapSize + heaps[Pal::GpuHeapInvisible].heapSize;
} else {
localRAM = heaps[Pal::GpuHeapLocal].physicalHeapSize + heaps[Pal::GpuHeapInvisible].physicalHeapSize;
localRAM =
heaps[Pal::GpuHeapLocal].physicalHeapSize + heaps[Pal::GpuHeapInvisible].physicalHeapSize;
}
info_.globalMemSize_ = (static_cast<cl_ulong>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
@@ -445,10 +442,10 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
// Find the largest heap form FB memory
if (GPU_ADD_HBCC_SIZE) {
info_.maxMemAllocSize_ = std::max(cl_ulong(heaps[Pal::GpuHeapLocal].heapSize),
cl_ulong(heaps[Pal::GpuHeapInvisible].heapSize));
cl_ulong(heaps[Pal::GpuHeapInvisible].heapSize));
} else {
info_.maxMemAllocSize_ = std::max(cl_ulong(heaps[Pal::GpuHeapLocal].physicalHeapSize),
cl_ulong(heaps[Pal::GpuHeapInvisible].physicalHeapSize));
cl_ulong(heaps[Pal::GpuHeapInvisible].physicalHeapSize));
}
#if defined(ATI_OS_WIN)
@@ -561,7 +558,7 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
::strcpy(info_.vendor_, "Advanced Micro Devices, Inc.");
::snprintf(info_.driverVersion_, sizeof(info_.driverVersion_) - 1, AMD_BUILD_STRING " (PAL%s)",
settings().useLightning_ ? ",LC" : ",HSAIL");
settings().useLightning_ ? ",LC" : ",HSAIL");
info_.profile_ = "FULL_PROFILE";
if (settings().oclVersion_ >= OpenCL20) {
@@ -640,15 +637,16 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
info_.cuPerShaderArray_ = palProp.gfxipProperties.shaderCore.numCusPerShaderArray;
info_.simdWidth_ = hwInfo()->simdWidth_;
info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_;
info_.wavefrontWidth_ = settings().enableWave32Mode_ ? 32:
palProp.gfxipProperties.shaderCore.nativeWavefrontSize;
info_.wavefrontWidth_ =
settings().enableWave32Mode_ ? 32 : palProp.gfxipProperties.shaderCore.nativeWavefrontSize;
info_.availableSGPRs_ = palProp.gfxipProperties.shaderCore.numAvailableSgprs;
info_.globalMemChannelBanks_ = 4;
info_.globalMemChannelBankWidth_ = hwInfo()->memChannelBankWidth_;
info_.localMemSizePerCU_ = hwInfo()->localMemSizePerCU_;
info_.localMemBanks_ = hwInfo()->localMemBanks_;
info_.gfxipVersion_ = settings().useLightning_ ? hwInfo()->gfxipVersionLC_ : hwInfo()->gfxipVersion_;
info_.gfxipVersion_ =
settings().useLightning_ ? hwInfo()->gfxipVersionLC_ : hwInfo()->gfxipVersion_;
info_.timeStampFrequency_ = 1000000;
info_.numAsyncQueues_ = numComputeRings;
@@ -661,7 +659,7 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
info_.pcieDeviceId_ = palProp.deviceId;
info_.pcieRevisionId_ = palProp.revisionId;
info_.maxThreadsPerCU_ = info_.wavefrontWidth_ * hwInfo()->simdPerCU_ *
palProp.gfxipProperties.shaderCore.numWavefrontsPerSimd;
palProp.gfxipProperties.shaderCore.numWavefrontsPerSimd;
}
}
@@ -789,8 +787,7 @@ Device::Device()
globalScratchBuf_(nullptr),
srdManager_(nullptr),
resourceList_(nullptr),
rgpCaptureMgr_(nullptr)
{}
rgpCaptureMgr_(nullptr) {}
Device::~Device() {
// remove the HW debug manager
@@ -803,8 +800,8 @@ Device::~Device() {
}
if (glb_ctx_ != nullptr) {
glb_ctx_->release();
glb_ctx_ = nullptr;
glb_ctx_->release();
glb_ctx_ = nullptr;
}
delete srdManager_;
@@ -878,19 +875,21 @@ bool Device::create(Pal::IDevice* device) {
ipLevel_ = properties().gfxLevel;
asicRevision_ = properties().revision;
// XNACK flag should be set for PageMigration | IOMMUv2 Support
uint isXNACKSupported = static_cast<uint>(properties_.gpuMemoryProperties.flags.pageMigrationEnabled
|| properties_.gpuMemoryProperties.flags.iommuv2Support);
// XNACK flag should be set for PageMigration | IOMMUv2 Support
uint isXNACKSupported =
static_cast<uint>(properties_.gpuMemoryProperties.flags.pageMigrationEnabled ||
properties_.gpuMemoryProperties.flags.iommuv2Support);
uint subtarget = isXNACKSupported;
// Update HW info for the device
if ((GPU_ENABLE_PAL == 1) && (properties().revision <= Pal::AsicRevision::Polaris12)) {
hwInfo_ = &DeviceInfo[static_cast<uint>(properties().revision)];
} else if (ipLevel_ >= Pal::GfxIpLevel::GfxIp9) {
// For compiler sub targets
subtarget = (static_cast<uint>(asicRevision_) % static_cast<uint>(Pal::AsicRevision::Vega10)) << 1 |
subtarget;
hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget];
// For compiler sub targets
subtarget = (static_cast<uint>(asicRevision_) % static_cast<uint>(Pal::AsicRevision::Vega10))
<< 1 |
subtarget;
hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget];
} else {
return false;
}
@@ -995,7 +994,7 @@ bool Device::create(Pal::IDevice* device) {
}
if (settings().useLightning_) {
#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
#if defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY)
// create compilation object with cache support
int gfxipMajor = hwInfo()->gfxipVersionLC_ / 100;
int gfxipMinor = hwInfo()->gfxipVersionLC_ / 10 % 10;
@@ -1013,7 +1012,7 @@ bool Device::create(Pal::IDevice* device) {
}
amd::CacheCompilation* compObj = new amd::CacheCompilation(
cacheTarget.str(), "_pal", OCL_CODE_CACHE_ENABLE, OCL_CODE_CACHE_RESET);
cacheTarget.str(), "_pal", OCL_CODE_CACHE_ENABLE, OCL_CODE_CACHE_RESET);
if (!compObj) {
LogError("Unable to create cache compilation object!");
return false;
@@ -1021,18 +1020,17 @@ bool Device::create(Pal::IDevice* device) {
cacheCompilation_.reset(compObj);
#endif
}
else {
#if defined(WITH_COMPILER_LIB)
} else {
#if defined(WITH_COMPILER_LIB)
const char* library = getenv("HSA_COMPILER_LIBRARY");
aclCompilerOptions opts = { sizeof(aclCompilerOptions_0_8),
library,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
AMD_OCL_SC_LIB };
aclCompilerOptions opts = {sizeof(aclCompilerOptions_0_8),
library,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
AMD_OCL_SC_LIB};
// Initialize the compiler handle
acl_error error;
compiler_ = aclCompilerInit(&opts, &error);
@@ -1056,7 +1054,7 @@ bool Device::create(Pal::IDevice* device) {
if ((glb_ctx_ == nullptr) && (gNumDevices > 1) && (device == gDeviceList[gNumDevices - 1])) {
std::vector<amd::Device*> devices;
uint32_t numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, true);
uint32_t numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, true);
// Add all PAL devices
for (uint32_t i = gStartDevice; i < numDevices; ++i) {
devices.push_back(amd::Device::devices()[i]);
@@ -1070,8 +1068,8 @@ bool Device::create(Pal::IDevice* device) {
if (glb_ctx_ == nullptr) {
return false;
}
amd::Buffer* buf =
new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
amd::Buffer* buf =
new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
if ((buf != nullptr) && buf->create()) {
p2p_stage_ = buf;
} else {
@@ -1086,11 +1084,8 @@ bool Device::create(Pal::IDevice* device) {
// =====================================================================================================================
// Master function that handles developer callbacks from PAL.
void PAL_STDCALL Device::PalDeveloperCallback(
void* pPrivateData,
const Pal::uint32 deviceIndex,
Pal::Developer::CallbackType type,
void* pCbData) {
void PAL_STDCALL Device::PalDeveloperCallback(void* pPrivateData, const Pal::uint32 deviceIndex,
Pal::Developer::CallbackType type, void* pCbData) {
Device* device = static_cast<Device*>(pPrivateData);
const auto& barrier = *static_cast<const Pal::Developer::BarrierData*>(pCbData);
@@ -1099,7 +1094,7 @@ void PAL_STDCALL Device::PalDeveloperCallback(
VirtualGPU* gpu = nullptr;
if (pBarrierData->pCmdBuffer != nullptr) {
// Find which queue the current command buffer belongs
for (const auto& it: device->vgpus()) {
for (const auto& it : device->vgpus()) {
if (it->isActiveCmd(pBarrierData->pCmdBuffer)) {
gpu = it;
break;
@@ -1112,18 +1107,18 @@ void PAL_STDCALL Device::PalDeveloperCallback(
}
switch (type) {
case Pal::Developer::CallbackType::BarrierBegin:
device->rgpCaptureMgr()->WriteBarrierStartMarker(gpu, barrier);
break;
case Pal::Developer::CallbackType::BarrierEnd:
device->rgpCaptureMgr()->WriteBarrierEndMarker(gpu, barrier);
break;
case Pal::Developer::CallbackType::ImageBarrier:
assert(false);
break;
case Pal::Developer::CallbackType::DrawDispatch:
case Pal::Developer::CallbackType::BarrierBegin:
device->rgpCaptureMgr()->WriteBarrierStartMarker(gpu, barrier);
break;
default:
case Pal::Developer::CallbackType::BarrierEnd:
device->rgpCaptureMgr()->WriteBarrierEndMarker(gpu, barrier);
break;
case Pal::Developer::CallbackType::ImageBarrier:
assert(false);
break;
case Pal::Developer::CallbackType::DrawDispatch:
break;
default:
break;
}
}
@@ -1136,15 +1131,16 @@ bool Device::initializeHeapResources() {
// Request all compute engines
finalizeInfo.requestedEngineCounts[Pal::EngineTypeCompute].engines =
((1 << numComputeEngines_) - 1);
for (const auto& it: exclusiveComputeEnginesId_) {
for (const auto& it : exclusiveComputeEnginesId_) {
// Request real time compute engines
finalizeInfo.requestedEngineCounts[Pal::EngineTypeExclusiveCompute].engines |= (1 << it.second);
finalizeInfo.requestedEngineCounts[Pal::EngineTypeExclusiveCompute].engines |=
(1 << it.second);
}
// Request all SDMA engines
finalizeInfo.requestedEngineCounts[Pal::EngineTypeDma].engines = (1 << numDmaEngines_) - 1;
if (iDev()->Finalize(finalizeInfo) != Pal::Result::Success) {
return false;
return false;
}
heapInitComplete_ = true;
@@ -1201,7 +1197,8 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
if (queue != nullptr) {
profiling = queue->properties().test(CL_QUEUE_PROFILING_ENABLE);
if (queue->asHostQueue() != nullptr) {
bool interopQueue = (0 != (queue->context().info().flags_ &
bool interopQueue = (0 !=
(queue->context().info().flags_ &
(amd::Context::GLDeviceKhr | amd::Context::D3D10DeviceKhr |
amd::Context::D3D11DeviceKhr)));
rtCUs = queue->rtCUs();
@@ -1233,8 +1230,7 @@ device::Program* Device::createProgram(amd::option::Options* options) {
device::Program* program;
if (settings().useLightning_) {
program = new LightningProgram(*this);
}
else {
} else {
program = new HSAILProgram(*this);
}
if (program == nullptr) {
@@ -1249,9 +1245,7 @@ typedef std::unordered_map<int, bool> requestedDevices_t;
//! Parses the requested list of devices to be exposed to the user.
static void parseRequestedDeviceList(const char* requestedDeviceList,
requestedDevices_t& requestedDevices,
uint32_t numDevices) {
requestedDevices_t& requestedDevices, uint32_t numDevices) {
char* pch = strtok(const_cast<char*>(requestedDeviceList), ",");
while (pch != nullptr) {
bool deviceIdValid = true;
@@ -1263,8 +1257,7 @@ static void parseRequestedDeviceList(const char* requestedDeviceList,
break;
}
}
if (currentDeviceIndex < 0 ||
static_cast<uint32_t>(currentDeviceIndex) >= numDevices) {
if (currentDeviceIndex < 0 || static_cast<uint32_t>(currentDeviceIndex) >= numDevices) {
deviceIdValid = false;
}
// Get next token.
@@ -1310,9 +1303,9 @@ bool Device::init() {
// Count up all the devices in the system.
platform_->EnumerateDevices(&gNumDevices, &gDeviceList[0]);
const char* requestedDeviceList = amd::IS_HIP ? ((HIP_VISIBLE_DEVICES[0] != '\0') ?
HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES)
: GPU_DEVICE_ORDINAL;
const char* requestedDeviceList = amd::IS_HIP
? ((HIP_VISIBLE_DEVICES[0] != '\0') ? HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES)
: GPU_DEVICE_ORDINAL;
if (requestedDeviceList[0] != '\0') {
useDeviceList = true;
@@ -1465,8 +1458,8 @@ pal::Memory* Device::createBuffer(amd::Memory& owner, bool directAccess) const {
if (result) {
// Disallow permanent map for Win7 only, since OS will move buffer to sysmem
if (IS_LINUX ||
// Or Win10
(properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs == false)) {
// Or Win10
(properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs == false)) {
void* address = gpuMemory->map(nullptr);
CondLog(address == nullptr, "PAL failed lock of persistent memory!");
}
@@ -1697,9 +1690,9 @@ device::Memory* Device::createMemory(amd::Memory& owner) const {
(memory->memoryType() != Resource::ExternalPhysical) &&
((owner.getHostMem() != nullptr) ||
((nullptr != owner.parent()) && (owner.getHostMem() != nullptr)))) {
bool ok = memory->pinSystemMemory(owner.getHostMem(), (owner.getHostMemRef()->size())
? owner.getHostMemRef()->size()
: owner.getSize());
bool ok = memory->pinSystemMemory(
owner.getHostMem(),
(owner.getHostMemRef()->size()) ? owner.getHostMemRef()->size() : owner.getSize());
//! \note: Ignore the pinning result for now
}
@@ -1720,9 +1713,9 @@ bool Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler)
device::Memory* Device::createView(amd::Memory& owner, const device::Memory& parent) const {
assert((owner.asImage() != nullptr) && "View supports images only");
const amd::Image& image = *owner.asImage();
pal::Memory* gpuImage = new pal::Image(
*this, owner, image.getWidth(), image.getHeight(), image.getDepth(),
image.getImageFormat(), image.getType(), image.getMipLevels());
pal::Memory* gpuImage =
new pal::Image(*this, owner, image.getWidth(), image.getHeight(), image.getDepth(),
image.getImageFormat(), image.getType(), image.getMipLevels());
// Create resource
if (nullptr != gpuImage) {
@@ -1827,19 +1820,18 @@ bool Device::globalFreeMemory(size_t* freeMemory) const {
Pal::gpusize invisible = allocedMem[Pal::GpuHeapInvisible] - resourceCache().lclCacheSize();
// Fill free memory info
freeMemory[TotalFreeMemory] = static_cast<size_t>((info().globalMemSize_ -
(local + invisible)) / Ki);
freeMemory[TotalFreeMemory] =
static_cast<size_t>((info().globalMemSize_ - (local + invisible)) / Ki);
if (invisible >= heaps_[Pal::GpuHeapInvisible].heapSize) {
invisible = 0;
}
else {
} else {
invisible = heaps_[Pal::GpuHeapInvisible].heapSize - invisible;
}
freeMemory[LargestFreeBlock] = static_cast<size_t>(invisible) / Ki;
if (settings().apuSystem_) {
Pal::gpusize sysMem = allocedMem[Pal::GpuHeapGartCacheable] + allocedMem[Pal::GpuHeapGartUswc] -
resourceCache().cacheSize() + resourceCache().lclCacheSize();
resourceCache().cacheSize() + resourceCache().lclCacheSize();
sysMem /= Ki;
if (sysMem >= freeMemory[TotalFreeMemory]) {
freeMemory[TotalFreeMemory] = 0;
@@ -1945,8 +1937,7 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu) {
amd::ScopedLock lk(scratchAlloc_);
uint sb = vgpu->hwRing();
static const uint WaveSizeLimit = ((1 << 21) - 256);
const uint threadSizeLimit =
WaveSizeLimit / info().wavefrontWidth_;
const uint threadSizeLimit = WaveSizeLimit / info().wavefrontWidth_;
if (regNum > threadSizeLimit) {
LogError("Requested private memory is bigger than HW supports!");
regNum = threadSizeLimit;
@@ -1968,9 +1959,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu) {
// Calculate the size of the scratch buffer for a queue
uint32_t numTotalCUs = info().maxComputeUnits_;
uint32_t numMaxWaves = settings().numScratchWavesPerCu_ * numTotalCUs;
scratchBuf->size_ =
static_cast<uint64_t>(info().wavefrontWidth_) *
scratchBuf->regNum_ * numMaxWaves * sizeof(uint32_t);
scratchBuf->size_ = static_cast<uint64_t>(info().wavefrontWidth_) * scratchBuf->regNum_ *
numMaxWaves * sizeof(uint32_t);
scratchBuf->size_ = std::min(scratchBuf->size_, info().maxMemAllocSize_);
scratchBuf->size_ = std::min(scratchBuf->size_, uint64_t(3 * Gi));
// Note: Generic address space setup in HW requires 64KB alignment for scratch
@@ -2280,7 +2270,7 @@ void Device::SrdManager::freeSrdSlot(uint64_t addr) {
// Keeps the per-heap allocation counter in sync with an allocation or a release.
//   heap - PAL heap whose counter is adjusted
//   size - number of bytes allocated or released
//   free - true subtracts `size` from the counter, false adds it
void Device::updateAllocedMemory(Pal::GpuHeap heap, Pal::gpusize size, bool free) const {
  if (free) {
    allocedMem[heap] -= size;
  } else {
    allocedMem[heap] += size;
  }
}
@@ -2337,12 +2327,18 @@ cl_int Device::hwDebugManagerInit(amd::Context* context, uintptr_t messageStorag
return status;
}
bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
bool result = false;
Pal::SetClockModeInput setClockMode = {};
Pal::DeviceClockMode palClockMode = static_cast<Pal::DeviceClockMode>(setClockModeInput.clock_mode);
Pal::DeviceClockMode palClockMode =
static_cast<Pal::DeviceClockMode>(setClockModeInput.clock_mode);
setClockMode.clockMode = palClockMode;
result = (Pal::Result::Success == (iDev()->SetClockMode(setClockMode, reinterpret_cast<Pal::SetClockModeOutput*>(pSetClockModeOutput))))? true : false;
result = (Pal::Result::Success ==
(iDev()->SetClockMode(setClockMode,
reinterpret_cast<Pal::SetClockModeOutput*>(pSetClockModeOutput))))
? true
: false;
return result;
}
+50 -48
Просмотреть файл
@@ -49,7 +49,7 @@ class NullDevice : public amd::Device {
bool create(Pal::AsicRevision asicRevision, //!< GPU ASIC revision
Pal::GfxIpLevel ipLevel, //!< GPU ip level
uint xNACKSupported = 0 //!< GPU xNACKSupported
);
);
//! Instantiate a new virtual device
virtual device::VirtualDevice* createVirtualDevice(amd::CommandQueue* queue = NULL) {
@@ -111,11 +111,14 @@ class NullDevice : public amd::Device {
virtual void svmFree(void* ptr) const { return; }
void* Alloc(const Util::AllocInfo& allocInfo) { return allocator_.Alloc(allocInfo); }
//! Releases memory through the generic PAL allocator
void Free(const Util::FreeInfo& freeInfo) { allocator_.Free(freeInfo); }
//! Clock mode stub: ignores both arguments and always reports success
virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
                          cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
  return true;
}
protected:
static Util::GenericAllocator allocator_; //!< Generic memory allocator in PAL
static Util::GenericAllocator allocator_; //!< Generic memory allocator in PAL
Pal::AsicRevision asicRevision_; //!< ASIC revision
Pal::GfxIpLevel ipLevel_; //!< Device IP level
@@ -127,7 +130,7 @@ class NullDevice : public amd::Device {
size_t maxTextureSize, //!< Maximum texture size supported in HW
uint numComputeRings, //!< Number of compute rings
uint numExclusiveComputeRings //!< Number of exclusive compute rings
);
);
};
//! Forward declarations
@@ -148,26 +151,22 @@ class ThreadTrace;
#ifndef CL_FILTER_NONE
#define CL_FILTER_NONE 0x1142
#endif
//! Kinds of exclusive compute queue; used as the key of the exclusive compute engine map
enum class ExclusiveQueueType : uint32_t { RealTime0 = 0, RealTime1, Medium };
class Sampler : public device::Sampler {
public:
//! Constructor
Sampler(const Device& dev) : dev_(dev) {}
Sampler(const Device& dev) : dev_(dev) {}
//! Default destructor for the device memory object
virtual ~Sampler();
//! Creates a device sampler from the OCL sampler state
bool create(uint32_t oclSamplerState //!< OCL sampler state
);
);
//! Creates a device sampler from the OCL sampler state
bool create(const amd::Sampler& owner //!< AMD sampler object
);
);
private:
//! Disable default copy constructor
@@ -216,7 +215,7 @@ class Device : public NullDevice {
//! Releases transfer buffer
void release(VirtualGPU& gpu, //!< Virtual GPU object used with the buffer
Memory& buffer //!< Transfer buffer for release
);
);
//! Returns the buffer's size for transfer
size_t bufSize() const { return bufSize_; }
@@ -308,7 +307,7 @@ class Device : public NullDevice {
//! Initialise a device (i.e. all parts of the constructor that could
//! potentially fail)
bool create(Pal::IDevice* device //!< PAL device interface object
);
);
//! Destructor for the physical GPU device
virtual ~Device();
@@ -346,7 +345,8 @@ class Device : public NullDevice {
virtual bool validateKernel(const amd::Kernel& kernel, //!< AMD kernel object
const device::VirtualDevice* vdev);
virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput);
virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
cl_set_device_clock_mode_output_amd* pSetClockModeOutput);
//! Retrieves information about free memory on a GPU device
virtual bool globalFreeMemory(size_t* freeMemory) const;
@@ -398,9 +398,10 @@ class Device : public NullDevice {
//! Returns the number of available exclusive compute engines
uint numExclusiveComputeEngines() const {
  // The map holds one entry per exclusive engine; narrow its size_t count explicitly
  return static_cast<uint>(exclusiveComputeEnginesId_.size());
}
//! Returns the map of available exclusive compute rings with the engine index
const std::map<ExclusiveQueueType, uint32_t>& exclusiveComputeEnginesId() const {
  return exclusiveComputeEnginesId_;
}
//! Returns the number of available DMA engines
uint numDMAEngines() const { return numDmaEngines_; }
@@ -526,11 +527,8 @@ class Device : public NullDevice {
}
private:
static void PAL_STDCALL PalDeveloperCallback(
void* pPrivateData,
const Pal::uint32 deviceIndex,
Pal::Developer::CallbackType type,
void* pCbData);
static void PAL_STDCALL PalDeveloperCallback(void* pPrivateData, const Pal::uint32 deviceIndex,
Pal::Developer::CallbackType type, void* pCbData);
//! Disable copy constructor
Device(const Device&);
@@ -554,36 +552,37 @@ class Device : public NullDevice {
//! Allocates/reallocates the scratch buffer, according to the usage
bool allocScratch(uint regNum, //!< Number of the scratch registers
const VirtualGPU* vgpu //!< Virtual GPU for the allocation
);
);
//! Interop for D3D devices
bool associateD3D11Device(void* d3d11Device //!< void* is of type ID3D11Device*
);
);
bool associateD3D10Device(void* d3d10Device //!< void* is of type ID3D10Device*
);
);
bool associateD3D9Device(void* d3d9Device //!< void* is of type IDirect3DDevice9*
);
);
//! Interop for GL device
bool glAssociate(void* GLplatformContext, void* GLdeviceContext) const;
bool glDissociate(void* GLplatformContext, void* GLdeviceContext) const;
static char* platformObj_; //!< Memory allocated for PAL platform object
static Pal::IPlatform* platform_; //!< Pointer to the PAL platform object
static char* platformObj_; //!< Memory allocated for PAL platform object
static Pal::IPlatform* platform_; //!< Pointer to the PAL platform object
amd::Context* context_; //!< A dummy context for internal allocations
mutable amd::Monitor lockAsyncOps_; //!< Lock to serialise all async ops on this device
amd::Context* context_; //!< A dummy context for internal allocations
mutable amd::Monitor lockAsyncOps_; //!< Lock to serialise all async ops on this device
//! Lock to serialise all async ops on initialization heap operation
mutable amd::Monitor lockForInitHeap_;
mutable amd::Monitor lockPAL_; //!< Lock to serialise PAL access
mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access
mutable amd::Monitor scratchAlloc_; //!< Lock to serialise scratch allocation
mutable amd::Monitor mapCacheOps_; //!< Lock to serialise cache for the map resources
mutable amd::Monitor lockResourceOps_; //!< Lock to serialise resource access
XferBuffers* xferRead_; //!< Transfer buffers read
std::vector<amd::Memory*>* mapCache_; //!< Map cache info structure
ResourceCache* resourceCache_; //!< Resource cache
uint numComputeEngines_; //!< The number of available compute engines
std::map<ExclusiveQueueType, uint32_t> exclusiveComputeEnginesId_;//!< The number of available compute engines
mutable amd::Monitor lockForInitHeap_;
mutable amd::Monitor lockPAL_; //!< Lock to serialise PAL access
mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access
mutable amd::Monitor scratchAlloc_; //!< Lock to serialise scratch allocation
mutable amd::Monitor mapCacheOps_; //!< Lock to serialise cache for the map resources
mutable amd::Monitor lockResourceOps_; //!< Lock to serialise resource access
XferBuffers* xferRead_; //!< Transfer buffers read
std::vector<amd::Memory*>* mapCache_; //!< Map cache info structure
ResourceCache* resourceCache_; //!< Resource cache
uint numComputeEngines_; //!< The number of available compute engines
std::map<ExclusiveQueueType, uint32_t>
exclusiveComputeEnginesId_; //!< Available exclusive compute rings with the engine index
uint numDmaEngines_; //!< The number of available DMA engines
bool heapInitComplete_; //!< Keep track of initialization status of heap resources
VirtualGPU* xferQueue_; //!< Transfer queue
@@ -594,10 +593,13 @@ class Device : public NullDevice {
mutable bool freeCPUMem_; //!< flag to mark GPU free SVM CPU mem
Pal::DeviceProperties properties_; //!< PAL device properties
Pal::IDevice* device_; //!< PAL device object
mutable std::atomic<Pal::gpusize> allocedMem[Pal::GpuHeap::GpuHeapCount]; //!< Free memory counter
std::unordered_set<Resource*>* resourceList_; //!< Active resource list
RgpCaptureMgr* rgpCaptureMgr_; //!< RGP capture manager
Pal::GpuMemoryHeapProperties heaps_[Pal::GpuHeapCount]; //!< Information about heaps, returned from PAL
mutable std::atomic<Pal::gpusize>
allocedMem[Pal::GpuHeap::GpuHeapCount]; //!< Allocated memory per heap, see updateAllocedMemory()
std::unordered_set<Resource*>* resourceList_; //!< Active resource list
RgpCaptureMgr* rgpCaptureMgr_; //!< RGP capture manager
Pal::GpuMemoryHeapProperties
heaps_[Pal::GpuHeapCount]; //!< Information about heaps, returned from PAL
};
/*@}*/} // namespace pal
/*@}*/ // namespace pal
} // namespace pal
+9 -9
Просмотреть файл
@@ -3,19 +3,19 @@
#if defined(ATI_OS_LINUX)
namespace pal {
bool Device::associateD3D10Device(void* d3d10Device) { return false; }
} // pal
} // namespace pal
#else // !ATI_OS_LINUX
#include <D3D10_1.h>
/**************************************************************************************************************
* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
* This means OCL client spec will need to change to include headers directly from the DXX perforce
*tree.
* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
* without notification. So it is safe to use a local copy of the relevant DXX extension interface
*classes.
**************************************************************************************************************/
* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
* This means OCL client spec will need to change to include headers directly from the DXX perforce
*tree.
* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
* without notification. So it is safe to use a local copy of the relevant DXX extension interface
*classes.
**************************************************************************************************************/
#include "DxxOpenCLInteropExt.h"
namespace pal {
@@ -127,6 +127,6 @@ bool Device::associateD3D10Device(void* d3d10Device) {
return canInteroperate;
}
} // pal
} // namespace pal
#endif // !ATI_OS_LINUX
+9 -9
Просмотреть файл
@@ -3,19 +3,19 @@
#if defined(ATI_OS_LINUX)
namespace pal {
bool Device::associateD3D11Device(void* d3d11Device) { return false; }
}
} // namespace pal
#else // !ATI_OS_LINUX
#include <D3D11.h>
/**************************************************************************************************************
* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
* This means OCL client spec will need to change to include headers directly from the DXX perforce
*tree.
* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
* without notification. So it is safe to use a local copy of the relevant DXX extension interface
*classes.
**************************************************************************************************************/
* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
* This means OCL client spec will need to change to include headers directly from the DXX perforce
*tree.
* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
* without notification. So it is safe to use a local copy of the relevant DXX extension interface
*classes.
**************************************************************************************************************/
#include "DxxOpenCLInteropExt.h"
namespace pal {
@@ -128,6 +128,6 @@ bool Device::associateD3D11Device(void* d3d11Device) {
return canInteroperate;
}
} // pal
} // namespace pal
#endif // !ATI_OS_LINUX
+9 -9
Просмотреть файл
@@ -3,20 +3,20 @@
#if defined(ATI_OS_LINUX)
namespace pal {
bool Device::associateD3D9Device(void* d3dDevice) { return false; }
}
} // namespace pal
#else // !ATI_OS_LINUX
#include <d3d9.h>
#include <dxgi.h>
/**************************************************************************************************************
* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
* This means OCL client spec will need to change to include headers directly from the DXX perforce
*tree.
* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
* without notification. So it is safe to use a local copy of the relevant DXX extension interface
*classes.
**************************************************************************************************************/
* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
* This means OCL client spec will need to change to include headers directly from the DXX perforce
*tree.
* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
* without notification. So it is safe to use a local copy of the relevant DXX extension interface
*classes.
**************************************************************************************************************/
#include "DxxOpenCLInteropExt.h"
namespace pal {
@@ -44,5 +44,5 @@ bool Device::associateD3D9Device(void* d3d9Device) {
return canInteroperate;
}
} // pal
} // namespace pal
#endif // !ATI_OS_LINUX
File diff suppressed because it is too large. Load diff
+131 -165
Просмотреть файл
@@ -32,34 +32,27 @@
#include "protocols/rgpServer.h"
#include "protocols/driverControlServer.h"
namespace pal
{
namespace pal {
// ================================================================================================
// Stores the device reference and the platform's developer driver server, resets the
// dispatch counters and tracing flags, and zero-fills the trace state.
RgpCaptureMgr::RgpCaptureMgr(Pal::IPlatform* platform, const Device& device)
    : device_(device),
      dev_driver_server_(platform->GetDevDriverServer()),
      user_event_(nullptr),
      num_prep_disp_(0),
      // Read the limit through the constructor argument so the value does not depend on
      // device_ being declared (and therefore initialized) before max_sqtt_disp_
      max_sqtt_disp_(device.settings().rgpSqttDispCount_),
      trace_gpu_mem_limit_(0),
      global_disp_count_(1),  // Must start from 1 according to RGP spec
      trace_enabled_(false),
      inst_tracing_enabled_(false) {
  // NOTE(review): memset assumes trace_ is trivially copyable - confirm against its definition
  memset(&trace_, 0, sizeof(trace_));
}
// ================================================================================================
// Releases the tracing resources through DestroyRGPTracing()
RgpCaptureMgr::~RgpCaptureMgr() { DestroyRGPTracing(); }
// ================================================================================================
// Creates the GPU Open Developer Mode manager class.
RgpCaptureMgr* RgpCaptureMgr::Create(Pal::IPlatform* platform, const Device& device)
{
RgpCaptureMgr* RgpCaptureMgr::Create(Pal::IPlatform* platform, const Device& device) {
RgpCaptureMgr* mgr = new RgpCaptureMgr(platform, device);
if (mgr != nullptr && !mgr->Init(platform)) {
@@ -71,8 +64,7 @@ RgpCaptureMgr* RgpCaptureMgr::Create(Pal::IPlatform* platform, const Device& dev
}
// ================================================================================================
bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
{
bool RgpCaptureMgr::Init(Pal::IPlatform* platform) {
if (dev_driver_server_ == nullptr) {
return false;
}
@@ -105,13 +97,11 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
const uint32_t api_version = settings.oclVersion_;
trace_.gpa_session_ = new GpuUtil::GpaSession(
platform,
device_.iDev(),
api_version >> 4, // OCL API version major
api_version & 0xf, // OCL API version minor
RgpSqttInstrumentationSpecVersion,
RgpSqttInstrumentationApiVersion);
trace_.gpa_session_ = new GpuUtil::GpaSession(platform, device_.iDev(),
api_version >> 4, // OCL API version major
api_version & 0xf, // OCL API version minor
RgpSqttInstrumentationSpecVersion,
RgpSqttInstrumentationApiVersion);
if (trace_.gpa_session_ == nullptr) {
result = false;
@@ -119,7 +109,7 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
}
// Initialize the GPA session
if (result && (trace_.gpa_session_->Init() != Pal::Result::Success)) {
if (result && (trace_.gpa_session_->Init() != Pal::Result::Success)) {
result = false;
}
@@ -133,9 +123,9 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
if (!result) {
// If we've failed to initialize tracing, permanently disable traces
if (rgp_server_ != nullptr) {
rgp_server_->DisableTraces();
rgp_server_->DisableTraces();
trace_enabled_ = false;
trace_enabled_ = false;
}
// Clean up if we failed
@@ -150,9 +140,8 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
// ================================================================================================
// This function finds out all the queues in the device that we have to synchronize for RGP-traced
// frames and initializes resources for them.
bool RgpCaptureMgr::RegisterTimedQueue(
uint32_t queue_id, Pal::IQueue* iQueue, bool* debug_vmid) const
{
bool RgpCaptureMgr::RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue,
bool* debug_vmid) const {
bool result = true;
// Get the OS context handle for this queue (this is a thing that RGP needs on DX clients;
@@ -166,8 +155,8 @@ bool RgpCaptureMgr::RegisterTimedQueue(
*debug_vmid = kernelContextInfo.flags.hasDebugVmid;
// Register the queue with the GPA session class for timed queue operation support.
if (trace_.gpa_session_->RegisterTimedQueue(iQueue, queue_id,
kernelContextInfo.contextIdentifier) != Pal::Result::Success) {
if (trace_.gpa_session_->RegisterTimedQueue(
iQueue, queue_id, kernelContextInfo.contextIdentifier) != Pal::Result::Success) {
result = false;
}
@@ -175,11 +164,8 @@ bool RgpCaptureMgr::RegisterTimedQueue(
}
// ================================================================================================
Pal::Result RgpCaptureMgr::TimedQueueSubmit(
Pal::IQueue* queue,
uint64_t cmdId,
const Pal::SubmitInfo& submitInfo) const
{
Pal::Result RgpCaptureMgr::TimedQueueSubmit(Pal::IQueue* queue, uint64_t cmdId,
const Pal::SubmitInfo& submitInfo) const {
// Fill in extra meta-data information to associate the API command buffer data with
// the generated timing information.
GpuUtil::TimedSubmitInfo timedSubmitInfo = {};
@@ -205,8 +191,7 @@ Pal::Result RgpCaptureMgr::TimedQueueSubmit(
// Called during initial device enumeration prior to calling Pal::IDevice::CommitSettingsAndInit().
//
// This finalizes the developer driver manager.
void RgpCaptureMgr::Finalize()
{
void RgpCaptureMgr::Finalize() {
// Figure out if the gfxip supports tracing. We decide tracing if there is at least one
// enumerated GPU that can support tracing. Since we don't yet know if that GPU will be
// picked as the target of an eventual VkDevice, this check is imperfect.
@@ -215,8 +200,8 @@ void RgpCaptureMgr::Finalize()
bool hw_support_tracing = false;
if ((rgp_server_->EnableTraces() == DevDriver::Result::Success)) {
if (GpuSupportsTracing(device_.properties(), device_.settings())) {
hw_support_tracing = true;
if (GpuSupportsTracing(device_.properties(), device_.settings())) {
hw_support_tracing = true;
}
}
@@ -234,20 +219,18 @@ void RgpCaptureMgr::Finalize()
// ================================================================================================
// Waits for the driver to be resumed if it's currently paused.
void RgpCaptureMgr::WaitForDriverResume() {
  auto* pDriverControlServer = dev_driver_server_->GetDriverControlServer();

  // The control server is only verified by this assert before it is dereferenced below
  assert(pDriverControlServer != nullptr);

  pDriverControlServer->WaitForDriverResume();
}
// ================================================================================================
// Called before a swap chain presents. This signals a frame-end boundary and
// is used to coordinate RGP trace start/stop.
void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu)
{
void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu) {
if (rgp_server_->TracesEnabled()) {
// If there's currently a trace running, submit the trace-end command buffer
if (trace_.status_ == TraceStatus::Running) {
@@ -257,8 +240,7 @@ void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu)
Pal::Result res = EndRGPHardwareTrace(gpu);
if (Pal::Result::ErrorIncompatibleQueue == res) {
// continue until we find the right queue...
}
else if (Pal::Result::Success == res) {
} else if (Pal::Result::Success == res) {
trace_.sqtt_disp_count_ = 0;
} else {
FinishRGPTrace(gpu, true);
@@ -272,43 +254,42 @@ void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu)
// Currently nothing in the PresentInfo struct is used for inserting a timed present marker.
GpuUtil::TimedQueuePresentInfo timedPresentInfo = {};
//Pal::Result result = trace_.gpa_session_->TimedQueuePresent(pPalQueue, timedPresentInfo);
//assert(result == Pal::Result::Success);
// Pal::Result result = trace_.gpa_session_->TimedQueuePresent(pPalQueue, timedPresentInfo);
// assert(result == Pal::Result::Success);
}
}
}
// ================================================================================================
Pal::Result RgpCaptureMgr::CheckForTraceResults()
{
Pal::Result RgpCaptureMgr::CheckForTraceResults() {
assert(trace_.status_ == TraceStatus::WaitingForResults);
Pal::Result result = Pal::Result::NotReady;
// Check if trace results are ready
if (trace_.gpa_session_->IsReady() && // GPA session is ready
(trace_.begin_queue_->isDone(&trace_.end_event_))) // "Trace end" cmdbuf has retired
if (trace_.gpa_session_->IsReady() && // GPA session is ready
(trace_.begin_queue_->isDone(&trace_.end_event_))) // "Trace end" cmdbuf has retired
{
bool success = false;
// Fetch required trace data size from GPA session
size_t traceDataSize = 0;
void* pTraceData = nullptr;
void* pTraceData = nullptr;
trace_.gpa_session_->GetResults(trace_.gpa_sample_id_, &traceDataSize, nullptr);
// Allocate memory for trace data
if (traceDataSize > 0) {
pTraceData = amd::AlignedMemory::allocate(traceDataSize, 256);
pTraceData = amd::AlignedMemory::allocate(traceDataSize, 256);
}
if (pTraceData != nullptr) {
// Get trace data from GPA session
if (trace_.gpa_session_->GetResults(trace_.gpa_sample_id_, &traceDataSize, pTraceData) ==
Pal::Result::Success) {
Pal::Result::Success) {
// Transmit trace data to anyone who's listening
auto devResult = rgp_server_->WriteTraceData(
static_cast<Pal::uint8*>(pTraceData), traceDataSize);
auto devResult =
rgp_server_->WriteTraceData(static_cast<Pal::uint8*>(pTraceData), traceDataSize);
success = (devResult == DevDriver::Result::Success);
}
@@ -317,7 +298,7 @@ Pal::Result RgpCaptureMgr::CheckForTraceResults()
}
if (success) {
result = Pal::Result::Success;
result = Pal::Result::Success;
}
}
@@ -327,9 +308,8 @@ Pal::Result RgpCaptureMgr::CheckForTraceResults()
// ================================================================================================
// Called after a swap chain presents. This signals a (next) frame-begin boundary and is
// used to coordinate RGP trace start/stop.
void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
size_t x, size_t y, size_t z)
{
void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, size_t x, size_t y,
size_t z) {
// Wait for the driver to be resumed in case it's been paused.
WaitForDriverResume();
@@ -347,8 +327,7 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
}
}
}
}
else if (trace_.status_ == TraceStatus::Preparing) {
} else if (trace_.status_ == TraceStatus::Preparing) {
// Wait some number of "preparation frames" before starting the trace in order to get enough
// timer samples to sync CPU/GPU clock domains.
trace_.prepared_disp_count_++;
@@ -370,7 +349,7 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
// Check if we're ending a trace waiting for SQTT to turn off.
// If SQTT has turned off, end the trace
else if (trace_.status_ == TraceStatus::WaitingForSqtt) {
Pal::Result result = Pal::Result::Success;
Pal::Result result = Pal::Result::Success;
if (trace_.begin_queue_->isDone(&trace_.end_sqtt_event_)) {
result = EndRGPTrace(gpu);
@@ -401,14 +380,17 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
RgpSqttMarkerEventType apiEvent = RgpSqttMarkerEventType::CmdNDRangeKernel;
if (kernel.prog().isInternal()) {
constexpr RgpSqttMarkerEventType ApiEvents[KernelBlitManager::BlitTotal] = {
RgpSqttMarkerEventType::CmdCopyImage, RgpSqttMarkerEventType::CmdCopyImage,
RgpSqttMarkerEventType::CmdCopyImageToBuffer,
RgpSqttMarkerEventType::CmdCopyBufferToImage,
RgpSqttMarkerEventType::CmdCopyBuffer, RgpSqttMarkerEventType::CmdCopyBuffer,
RgpSqttMarkerEventType::CmdCopyBuffer, RgpSqttMarkerEventType::CmdCopyBuffer,
RgpSqttMarkerEventType::CmdFillBuffer, RgpSqttMarkerEventType::CmdFillImage,
RgpSqttMarkerEventType::CmdScheduler
};
RgpSqttMarkerEventType::CmdCopyImage,
RgpSqttMarkerEventType::CmdCopyImage,
RgpSqttMarkerEventType::CmdCopyImageToBuffer,
RgpSqttMarkerEventType::CmdCopyBufferToImage,
RgpSqttMarkerEventType::CmdCopyBuffer,
RgpSqttMarkerEventType::CmdCopyBuffer,
RgpSqttMarkerEventType::CmdCopyBuffer,
RgpSqttMarkerEventType::CmdCopyBuffer,
RgpSqttMarkerEventType::CmdFillBuffer,
RgpSqttMarkerEventType::CmdFillImage,
RgpSqttMarkerEventType::CmdScheduler};
for (uint i = 0; i < KernelBlitManager::BlitTotal; ++i) {
if (kernel.name().compare(BlitName[i]) == 0) {
apiEvent = ApiEvents[i];
@@ -418,8 +400,8 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
}
WriteUserEventMarker(gpu, RgpSqttMarkerUserEventObjectName, kernel.name());
// Write dispatch marker
WriteEventWithDimsMarker(gpu, apiEvent,
static_cast<uint32_t>(x), static_cast<uint32_t>(y), static_cast<uint32_t>(z));
WriteEventWithDimsMarker(gpu, apiEvent, static_cast<uint32_t>(x), static_cast<uint32_t>(y),
static_cast<uint32_t>(z));
}
}
@@ -428,11 +410,11 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
// ================================================================================================
// This function starts preparing for an RGP trace. Preparation involves some N frames of
// lead-up time during which timing samples are accumulated to synchronize CPU and GPU clock domains.
// lead-up time during which timing samples are accumulated to synchronize CPU and GPU clock
// domains.
//
// This function transitions from the Idle state to the Preparing state.
Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu)
{
Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu) {
assert(trace_.status_ == TraceStatus::Idle);
// We can only trace using a single device at a time currently, so recreate RGP trace
@@ -441,32 +423,32 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu)
const auto traceParameters = rgp_server_->QueryTraceParameters();
num_prep_disp_ = traceParameters.captureStartIndex;
num_prep_disp_ = traceParameters.captureStartIndex;
uint32_t capture_disp = traceParameters.captureStopIndex - traceParameters.captureStartIndex;
// Validate if the captured dispatches are in the range
if ((capture_disp > 0) && (capture_disp < max_sqtt_disp_)) {
max_sqtt_disp_ = capture_disp;
}
trace_gpu_mem_limit_ = traceParameters.gpuMemoryLimitInMb * 1024 * 1024;
trace_gpu_mem_limit_ = traceParameters.gpuMemoryLimitInMb * 1024 * 1024;
inst_tracing_enabled_ = traceParameters.flags.enableInstructionTokens;
// Notify the RGP server that we are starting a trace
if (rgp_server_->BeginTrace() != DevDriver::Result::Success) {
result = Pal::Result::ErrorUnknown;
result = Pal::Result::ErrorUnknown;
}
// Tell the GPA session class we're starting a trace
if (result == Pal::Result::Success) {
GpuUtil::GpaSessionBeginInfo info = {};
info.flags.enableQueueTiming = true;// trace_.queueTimingEnabled;
info.flags.enableQueueTiming = true; // trace_.queueTimingEnabled;
result = trace_.gpa_session_->Begin(info);
}
trace_.prepared_disp_count_ = 0;
trace_.sqtt_disp_count_ = 0;
trace_.sqtt_disp_count_ = 0;
// Sample the timing clocks prior to starting a trace.
if (result == Pal::Result::Success) {
@@ -476,7 +458,7 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu)
if (result == Pal::Result::Success) {
// Remember which queue started the trace
trace_.prepare_queue_ = gpu;
trace_.begin_queue_ = nullptr;
trace_.begin_queue_ = nullptr;
trace_.status_ = TraceStatus::Preparing;
} else {
@@ -497,8 +479,7 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu)
// the "begin trace" information command buffer.
//
// This function transitions from the Preparing state to the Running state.
Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu)
{
Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu) {
assert(trace_.status_ == TraceStatus::Preparing);
assert(trace_enabled_);
@@ -526,8 +507,8 @@ Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu)
// Fill GPU commands
gpu->eventBegin(MainEngine);
trace_.gpa_sample_id_ = trace_.gpa_session_->BeginSample(
gpu->queue(MainEngine).iCmd(), sampleConfig);
trace_.gpa_sample_id_ =
trace_.gpa_session_->BeginSample(gpu->queue(MainEngine).iCmd(), sampleConfig);
gpu->eventEnd(MainEngine, trace_.begin_sqtt_event_);
}
@@ -540,7 +521,7 @@ Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu)
// Make the trace active and remember which queue started it
if (result == Pal::Result::Success) {
trace_.status_ = TraceStatus::Running;
trace_.status_ = TraceStatus::Running;
trace_.begin_queue_ = gpu;
}
@@ -551,8 +532,7 @@ Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu)
// This function submits the command buffer to stop SQTT tracing. Full tracing still continues.
//
// This function transitions from the Running state to the WaitingForSqtt state.
Pal::Result RgpCaptureMgr::EndRGPHardwareTrace(VirtualGPU* gpu)
{
Pal::Result RgpCaptureMgr::EndRGPHardwareTrace(VirtualGPU* gpu) {
assert(trace_.status_ == TraceStatus::Running);
Pal::Result result = Pal::Result::Success;
@@ -593,8 +573,7 @@ Pal::Result RgpCaptureMgr::EndRGPHardwareTrace(VirtualGPU* gpu)
// This function ends a running RGP trace.
//
// This function transitions from the WaitingForSqtt state to WaitingForResults state.
Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu)
{
Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu) {
assert(trace_.status_ == TraceStatus::WaitingForSqtt);
Pal::Result result = Pal::Result::Success;
@@ -629,8 +608,7 @@ Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu)
// ================================================================================================
// This function resets and possibly cancels a currently active (between begin/end) RGP trace.
// It frees any dependent resources.
void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted)
{
void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted) {
if (trace_.prepare_queue_ == nullptr) {
return;
}
@@ -654,26 +632,25 @@ void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted)
// Reset tracing state to idle
trace_.prepared_disp_count_ = 0;
trace_.sqtt_disp_count_ = 0;
trace_.gpa_sample_id_ = 0;
trace_.status_ = TraceStatus::Idle;
trace_.prepare_queue_ = nullptr;
trace_.begin_queue_ = nullptr;
trace_.sqtt_disp_count_ = 0;
trace_.gpa_sample_id_ = 0;
trace_.status_ = TraceStatus::Idle;
trace_.prepare_queue_ = nullptr;
trace_.begin_queue_ = nullptr;
}
// ================================================================================================
// Destroys device-persistent RGP resources
void RgpCaptureMgr::DestroyRGPTracing()
{
void RgpCaptureMgr::DestroyRGPTracing() {
if (trace_.status_ != TraceStatus::Idle) {
FinishRGPTrace(nullptr, true);
FinishRGPTrace(nullptr, true);
}
delete user_event_;
// Destroy the GPA session
if (trace_.gpa_session_ != nullptr) {
//Util::Destructor(trace_.gpa_session_);
// Util::Destructor(trace_.gpa_session_);
delete trace_.gpa_session_;
trace_.gpa_session_ = nullptr;
}
@@ -683,18 +660,15 @@ void RgpCaptureMgr::DestroyRGPTracing()
// ================================================================================================
// Returns true if the given device properties/settings support tracing.
//
// props    - PAL device properties; only gfxipProperties.flags.supportRgpTraces is read.
// settings - Runtime settings; only rgpSqttForceDisable_ is read.
//
// Tracing is reported as supported only when the supportRgpTraces flag is set and the
// rgpSqttForceDisable_ setting is not.
bool RgpCaptureMgr::GpuSupportsTracing(
const Pal::DeviceProperties& props,
const Settings& settings)
{
bool RgpCaptureMgr::GpuSupportsTracing(const Pal::DeviceProperties& props,
const Settings& settings) {
return props.gfxipProperties.flags.supportRgpTraces && !settings.rgpSqttForceDisable_;
}
// ================================================================================================
// Called when a new device is created. This will preallocate reusable RGP trace resources
// for that device.
void RgpCaptureMgr::PostDeviceCreate()
{
void RgpCaptureMgr::PostDeviceCreate() {
amd::ScopedLock traceLock(&trace_mutex_);
auto* pDriverControlServer = dev_driver_server_->GetDriverControlServer();
@@ -714,8 +688,7 @@ void RgpCaptureMgr::PostDeviceCreate()
// ================================================================================================
// Called prior to a device's being destroyed. This will free persistent RGP trace resources for
// that device.
void RgpCaptureMgr::PreDeviceDestroy()
{
void RgpCaptureMgr::PreDeviceDestroy() {
amd::ScopedLock traceLock(&trace_mutex_);
// If we are idle, we can re-initialize trace resources based on the new device.
if (trace_.status_ == TraceStatus::Idle) {
@@ -725,9 +698,8 @@ void RgpCaptureMgr::PreDeviceDestroy()
// ================================================================================================
// Sets up an Event marker's basic data.
RgpSqttMarkerEvent RgpCaptureMgr::BuildEventMarker(
const VirtualGPU* gpu, RgpSqttMarkerEventType api_type) const
{
RgpSqttMarkerEvent RgpCaptureMgr::BuildEventMarker(const VirtualGPU* gpu,
RgpSqttMarkerEventType api_type) const {
RgpSqttMarkerEvent marker = {};
marker.identifier = RgpSqttMarkerIdentifierEvent;
@@ -739,24 +711,19 @@ RgpSqttMarkerEvent RgpCaptureMgr::BuildEventMarker(
}
// ================================================================================================
// Inserts a raw RGP trace marker through the command interface of the given GPU's MainEngine
// queue (gpu->queue(MainEngine).iCmd()).
//
// gpu       - Virtual GPU whose MainEngine queue receives the marker.
// data      - Pointer to the marker payload.
// data_size - Payload size in bytes. It is asserted to be a non-zero multiple of
//             sizeof(uint32_t), because the marker call takes the size as a dword count.
void RgpCaptureMgr::WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const
{
void RgpCaptureMgr::WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const {
// The payload must be made of whole dwords ...
assert((data_size % sizeof(uint32_t)) == 0);
// ... and contain at least one of them.
assert((data_size / sizeof(uint32_t)) > 0);
// Convert the byte size into the dword count passed to CmdInsertRgpTraceMarker.
gpu->queue(MainEngine).iCmd()->CmdInsertRgpTraceMarker(
static_cast<uint32_t>(data_size / sizeof(uint32_t)), data);
gpu->queue(MainEngine)
.iCmd()
->CmdInsertRgpTraceMarker(static_cast<uint32_t>(data_size / sizeof(uint32_t)), data);
}
// ================================================================================================
// Inserts an RGP pre-dispatch marker
void RgpCaptureMgr::WriteEventWithDimsMarker(
const VirtualGPU* gpu,
RgpSqttMarkerEventType apiType,
uint32_t x,
uint32_t y,
uint32_t z) const
{
void RgpCaptureMgr::WriteEventWithDimsMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType apiType,
uint32_t x, uint32_t y, uint32_t z) const {
assert(apiType != RgpSqttMarkerEventType::Invalid);
RgpSqttMarkerEventWithDims eventWithDims = {};
@@ -771,26 +738,24 @@ void RgpCaptureMgr::WriteEventWithDimsMarker(
}
// ================================================================================================
// Writes a "Barrier Start" marker for the given GPU while a trace is running.
//
// gpu  - Virtual GPU that receives the marker (forwarded to WriteMarker).
// data - PAL barrier data; only data.reason is read here.
//
// Nothing is written unless the RGP server reports traces as enabled and the trace status is
// Running. Note that this check happens before trace_mutex_ is taken.
void RgpCaptureMgr::WriteBarrierStartMarker(
const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const
{
void RgpCaptureMgr::WriteBarrierStartMarker(const VirtualGPU* gpu,
const Pal::Developer::BarrierData& data) const {
if (rgp_server_->TracesEnabled() && (trace_.status_ == TraceStatus::Running)) {
amd::ScopedLock traceLock(&trace_mutex_);
RgpSqttMarkerBarrierStart marker = {};
marker.identifier = RgpSqttMarkerIdentifierBarrierStart;
// The command buffer ID comes from the queue that began the trace (trace_.begin_queue_),
// not from the gpu argument.
marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();
// dword02 is the raw view of the second dword; the reason is stored through it first and the
// 1-bit internal field, which shares that dword, is set afterwards.
marker.dword02 = data.reason;
marker.internal = true;
marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();
marker.dword02 = data.reason;
marker.internal = true;
WriteMarker(gpu, &marker, sizeof(marker));
}
}
// ================================================================================================
void RgpCaptureMgr::WriteBarrierEndMarker(
const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const
{
void RgpCaptureMgr::WriteBarrierEndMarker(const VirtualGPU* gpu,
const Pal::Developer::BarrierData& data) const {
if (rgp_server_->TracesEnabled() && (trace_.status_ == TraceStatus::Running)) {
amd::ScopedLock traceLock(&trace_mutex_);
// Copy the operations part and include the same data from previous markers
@@ -799,28 +764,28 @@ void RgpCaptureMgr::WriteBarrierEndMarker(
auto operations = data.operations;
operations.pipelineStalls.u16All |= 0;
operations.caches.u16All |= 0;
operations.caches.u16All |= 0;
RgpSqttMarkerBarrierEnd marker = {};
marker.identifier = RgpSqttMarkerIdentifierBarrierEnd;
marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();
marker.identifier = RgpSqttMarkerIdentifierBarrierEnd;
marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();
marker.waitOnEopTs = operations.pipelineStalls.waitOnEopTsBottomOfPipe;
marker.vsPartialFlush = operations.pipelineStalls.vsPartialFlush;
marker.psPartialFlush = operations.pipelineStalls.psPartialFlush;
marker.csPartialFlush = operations.pipelineStalls.csPartialFlush;
marker.pfpSyncMe = operations.pipelineStalls.pfpSyncMe;
marker.syncCpDma = operations.pipelineStalls.syncCpDma;
marker.invalTcp = operations.caches.invalTcp;
marker.invalSqI = operations.caches.invalSqI$;
marker.invalSqK = operations.caches.invalSqK$;
marker.flushTcc = operations.caches.flushTcc;
marker.invalTcc = operations.caches.invalTcc;
marker.flushCb = operations.caches.flushCb;
marker.invalCb = operations.caches.invalCb;
marker.flushDb = operations.caches.flushDb;
marker.invalDb = operations.caches.invalDb;
marker.waitOnEopTs = operations.pipelineStalls.waitOnEopTsBottomOfPipe;
marker.vsPartialFlush = operations.pipelineStalls.vsPartialFlush;
marker.psPartialFlush = operations.pipelineStalls.psPartialFlush;
marker.csPartialFlush = operations.pipelineStalls.csPartialFlush;
marker.pfpSyncMe = operations.pipelineStalls.pfpSyncMe;
marker.syncCpDma = operations.pipelineStalls.syncCpDma;
marker.invalTcp = operations.caches.invalTcp;
marker.invalSqI = operations.caches.invalSqI$;
marker.invalSqK = operations.caches.invalSqK$;
marker.flushTcc = operations.caches.flushTcc;
marker.invalTcc = operations.caches.invalTcc;
marker.flushCb = operations.caches.flushCb;
marker.invalCb = operations.caches.invalCb;
marker.flushDb = operations.caches.flushDb;
marker.invalDb = operations.caches.invalDb;
marker.numLayoutTransitions = 0;
@@ -830,9 +795,9 @@ void RgpCaptureMgr::WriteBarrierEndMarker(
// ================================================================================================
// Inserts a user event string marker
void RgpCaptureMgr::WriteUserEventMarker(
const VirtualGPU* gpu, RgpSqttMarkerUserEventType eventType, const std::string& name) const
{
void RgpCaptureMgr::WriteUserEventMarker(const VirtualGPU* gpu,
RgpSqttMarkerUserEventType eventType,
const std::string& name) const {
memset(user_event_, 0, sizeof(RgpSqttMarkerUserEventWithString));
user_event_->header.identifier = RgpSqttMarkerIdentifierUserEvent;
@@ -841,7 +806,8 @@ void RgpCaptureMgr::WriteUserEventMarker(
size_t markerSize = sizeof(user_event_->header);
if ((eventType != RgpSqttMarkerUserEventPop)) {
size_t strLength = std::min(name.size(), RgpSqttMaxUserEventStringLengthInDwords * sizeof(uint32_t));
size_t strLength =
std::min(name.size(), RgpSqttMaxUserEventStringLengthInDwords * sizeof(uint32_t));
for (uint32_t charIdx = 0; charIdx < strLength; ++charIdx) {
uint32_t c = static_cast<uint32_t>(name[charIdx]);
user_event_->stringData[charIdx / 4] |= (c << (8 * (charIdx % 4)));
@@ -859,4 +825,4 @@ void RgpCaptureMgr::WriteUserEventMarker(
}
}; // namespace vk
}; // namespace pal
+140 -170
View file
@@ -34,42 +34,36 @@
#include "gpuopen.h"
// PAL forward declarations
namespace Pal
{
class ICmdBuffer;
class IFence;
class IQueueSemaphore;
namespace Pal {
class ICmdBuffer;
class IFence;
class IQueueSemaphore;
struct PalPublicSettings;
}
} // namespace Pal
// GpuUtil forward declarations
namespace GpuUtil
{
namespace GpuUtil {
class GpaSession;
};
// GPUOpen forward declarations
namespace DevDriver
{
namespace DevDriver {
class DevDriverServer;
class IMsgChannel;
struct MessageBuffer;
namespace DriverControlProtocol
{
namespace DriverControlProtocol {
enum struct DeviceClockMode : uint32_t;
class HandlerServer;
}
} // namespace DriverControlProtocol
namespace SettingsProtocol
{
namespace SettingsProtocol {
class HandlerServer;
}
}
} // namespace DevDriver
namespace pal
{
namespace pal {
class Settings;
class Device;
class VirtualGPU;
@@ -77,8 +71,7 @@ class HSAILKernel;
// ================================================================================================
// RgpSqttMarkerIdentifier - Identifiers for RGP SQ thread-tracing markers (Table 1)
enum RgpSqttMarkerIdentifier : uint32_t
{
enum RgpSqttMarkerIdentifier : uint32_t {
RgpSqttMarkerIdentifierEvent = 0x0,
RgpSqttMarkerIdentifierCbStart = 0x1,
RgpSqttMarkerIdentifierCbEnd = 0x2,
@@ -98,8 +91,7 @@ enum RgpSqttMarkerIdentifier : uint32_t
};
// ================================================================================================
enum class RgpSqttMarkerEventType : uint32_t
{
enum class RgpSqttMarkerEventType : uint32_t {
CmdNDRangeKernel = 0,
CmdScheduler = 1,
CmdCopyBuffer = 2,
@@ -114,8 +106,7 @@ enum class RgpSqttMarkerEventType : uint32_t
};
// ================================================================================================
enum class RgpSqqtBarrierReason : uint32_t
{
enum class RgpSqqtBarrierReason : uint32_t {
Invalid = 0,
MemDependency = 0xC0000000,
ProfilingControl = 0xC0000001,
@@ -125,129 +116,116 @@ enum class RgpSqqtBarrierReason : uint32_t
};
// ================================================================================================
// RgpSqttMarkerEvent - "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker.
// RgpSqttMarkerEvent - "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker.
// These are generated ahead of draws or dispatches for commands that trigger generation of waves
// i.e. draws/dispatches (Table 4).
//
// The marker consists of three unions, each overlaying named fields on one raw uint32_t
// (dword01..dword03). The bit-field widths in the first union (4 + 3 + 24 + 1) and in the
// second union (20 + 4 + 4 + 4) each add up to 32 bits.
struct RgpSqttMarkerEvent
{
union
{
struct
{
uint32_t identifier : 4; // Identifier for this marker
uint32_t extDwords : 3; // Number of extra dwords following this marker
uint32_t apiType : 24; // The API type for this command
uint32_t hasThreadDims : 1; // Whether thread dimensions are included
struct RgpSqttMarkerEvent {
union {
struct {
uint32_t identifier : 4; // Identifier for this marker
uint32_t extDwords : 3; // Number of extra dwords following this marker
uint32_t apiType : 24; // The API type for this command
uint32_t hasThreadDims : 1; // Whether thread dimensions are included
};
uint32_t dword01; // The first dword
uint32_t dword01; // The first dword
};
union
{
// Some information about the vertex/instance/draw register indices. These values are not
union {
// Some information about the vertex/instance/draw register indices. These values are not
// always valid because they are not available for one reason or another:
//
// - If vertex offset index or instance offset index are not (together) valid, they are both
// equal to 0
// - If draw index is not valid, it is equal to the vertex offset index
struct
{
uint32_t cbID : 20; // Command buffer ID for this marker
struct {
uint32_t cbID : 20; // Command buffer ID for this marker
uint32_t vertexOffsetRegIdx : 4; // SPI userdata register index for the first vertex offset
uint32_t instanceOffsetRegIdx : 4; // SPI userdata register index for the first instance offset
uint32_t drawIndexRegIdx : 4; // SPI userdata register index for the draw index (multi draw indirect)
uint32_t
instanceOffsetRegIdx : 4; // SPI userdata register index for the first instance offset
uint32_t drawIndexRegIdx : 4; // SPI userdata register index for the draw index (multi draw
// indirect)
};
uint32_t dword02; // The second dword
uint32_t dword02; // The second dword
};
union
{
uint32_t cmdID; // Command index within the command buffer
uint32_t dword03; // The third dword
union {
uint32_t cmdID; // Command index within the command buffer
uint32_t dword03; // The third dword
};
};
// ================================================================================================
// RgpSqttMarkerEventWithDims - Per-dispatch specific marker where workgroup dims are included.
// It is an RgpSqttMarkerEvent followed by three extra dwords holding the X/Y/Z counts.
struct RgpSqttMarkerEventWithDims
{
RgpSqttMarkerEvent event; // Per-draw/dispatch marker. API type should be Dispatch, hasThreadDims = 1
uint32_t threadX; // Work group count in X
uint32_t threadY; // Work group count in Y
uint32_t threadZ; // Work group count in Z
struct RgpSqttMarkerEventWithDims {
RgpSqttMarkerEvent
event; // Per-draw/dispatch marker. API type should be Dispatch, hasThreadDims = 1
uint32_t threadX; // Work group count in X
uint32_t threadY; // Work group count in Y
uint32_t threadZ; // Work group count in Z
};
// ================================================================================================
// RgpSqttMarkerBarrierStart - "Barrier Start" RGP SQTT instrumentation marker (Table 5)
//
// Two dwords, each a union of named bit-fields and a raw uint32_t view (dword01 / dword02).
// The bit-field widths add up to 32 in both dwords (4 + 3 + 20 + 5 and 31 + 1).
struct RgpSqttMarkerBarrierStart
{
union
{
struct
{
struct RgpSqttMarkerBarrierStart {
union {
struct {
uint32_t identifier : 4; // Identifier for this marker
uint32_t extDwords : 3; // Number of extra dwords following this marker
uint32_t cbId : 20; // Command buffer ID within queue
uint32_t reserved : 5; // Reserved
};
uint32_t dword01; // The first dword
uint32_t dword01; // The first dword
};
union
{
struct
{
union {
struct {
uint32_t driverReason : 31; // Barrier reason (WriteBarrierStartMarker stores data.reason via dword02)
uint32_t internal: 1; // Set to 1 by WriteBarrierStartMarker
uint32_t internal : 1; // Set to 1 by WriteBarrierStartMarker
};
uint32_t dword02; // The second dword
uint32_t dword02; // The second dword
};
};
// ================================================================================================
// RgpSqttMarkerBarrierEnd - "Barrier End" RGP SQTT instrumentation marker (Table 6)
//
// Two dwords, each a union of named bit-fields and a raw uint32_t view (dword01 / dword02).
// The first dword carries the marker header plus pipeline-stall flags; the second carries the
// remaining stall flag, the cache flush/invalidate flags and the layout transition count.
// The bit-field widths add up to 32 in both dwords.
struct RgpSqttMarkerBarrierEnd
{
union
{
struct
{
uint32_t identifier : 4; // Identifier for this marker
uint32_t extDwords : 3; // Number of extra dwords following this marker
uint32_t cbId : 20; // Command buffer ID within queue
uint32_t waitOnEopTs : 1; // Issued EOP_TS VGT event followed by a WAIT_REG_MEM for that timestamp
// to be written. Quintessential full pipeline stall.
struct RgpSqttMarkerBarrierEnd {
union {
struct {
uint32_t identifier : 4; // Identifier for this marker
uint32_t extDwords : 3; // Number of extra dwords following this marker
uint32_t cbId : 20; // Command buffer ID within queue
uint32_t waitOnEopTs : 1; // Issued EOP_TS VGT event followed by a WAIT_REG_MEM for that
// timestamp to be written. Quintessential full pipeline stall.
uint32_t vsPartialFlush : 1; // Stall at ME waiting for all prior VS waves to complete.
uint32_t psPartialFlush : 1; // Stall at ME waiting for all prior PS waves to complete.
uint32_t csPartialFlush : 1; // Stall at ME waiting for all prior CS waves to complete.
uint32_t pfpSyncMe : 1; // Stall PFP until ME is at same point in command stream.
uint32_t pfpSyncMe : 1; // Stall PFP until ME is at same point in command stream.
};
uint32_t dword01; // The first dword
uint32_t dword01; // The first dword
};
union
{
struct
{
uint32_t syncCpDma : 1; // Issue dummy CP-DMA command to confirm all prior CP-DMAs have completed.
union {
struct {
uint32_t
syncCpDma : 1; // Issue dummy CP-DMA command to confirm all prior CP-DMAs have completed.
uint32_t invalTcp : 1; // Invalidate the L1 vector caches.
uint32_t invalSqI : 1; // Invalidate the SQ instruction caches
uint32_t invalSqK : 1; // Invalidate the SQ constant caches (i.e. L1 scalar caches)
uint32_t flushTcc : 1; // Flush L2
uint32_t invalTcc : 1; // Invalidate L2
uint32_t flushCb : 1; // Flush CB caches (including DCC, cmask, fmask)
uint32_t invalCb : 1; // Invalidate CB caches (including DCC, cmask, fmask)
uint32_t flushDb : 1; // Flush DB caches (including htile)
uint32_t invalDb : 1; // Invalidate DB caches (including htile)
uint32_t numLayoutTransitions : 16; // Number of layout transitions following this packet
uint32_t reserved : 6; // Reserved for future expansion. Always 0
uint32_t flushCb : 1; // Flush CB caches (including DCC, cmask, fmask)
uint32_t invalCb : 1; // Invalidate CB caches (including DCC, cmask, fmask)
uint32_t flushDb : 1; // Flush DB caches (including htile)
uint32_t invalDb : 1; // Invalidate DB caches (including htile)
uint32_t numLayoutTransitions : 16; // Number of layout transitions following this packet
uint32_t reserved : 6; // Reserved for future expansion. Always 0
};
uint32_t dword02; // The second dword
uint32_t dword02; // The second dword
};
};
@@ -255,33 +233,31 @@ struct RgpSqttMarkerBarrierEnd
constexpr uint32_t RgpSqttInstrumentationSpecVersion = 1;
// RGP SQTT Instrumentation Specification version for Vulkan-specific tables
constexpr uint32_t RgpSqttInstrumentationApiVersion = 0;
constexpr uint32_t RgpSqttInstrumentationApiVersion = 0;
// RgpSqttMarkerUserEventType - Data types used in RGP SQ thread-tracing markers for a user event
enum RgpSqttMarkerUserEventType : uint32_t
{
RgpSqttMarkerUserEventTrigger = 0x0,
RgpSqttMarkerUserEventPop = 0x1, // WriteUserEventMarker attaches no string for this type
RgpSqttMarkerUserEventPush = 0x2,
RgpSqttMarkerUserEventObjectName = 0x3,
RgpSqttMarkerUserEventReserved1 = 0x4,
RgpSqttMarkerUserEventReserved2 = 0x5,
RgpSqttMarkerUserEventReserved3 = 0x6,
RgpSqttMarkerUserEventReserved4 = 0x7,
// RgpSqttMarkerUserEventType - Data types used in RGP SQ thread-tracing markers for a user
// event
enum RgpSqttMarkerUserEventType : uint32_t {
RgpSqttMarkerUserEventTrigger = 0x0,
RgpSqttMarkerUserEventPop = 0x1, // WriteUserEventMarker attaches no string for this type
RgpSqttMarkerUserEventPush = 0x2,
RgpSqttMarkerUserEventObjectName = 0x3,
RgpSqttMarkerUserEventReserved1 = 0x4,
RgpSqttMarkerUserEventReserved2 = 0x5,
RgpSqttMarkerUserEventReserved3 = 0x6,
RgpSqttMarkerUserEventReserved4 = 0x7,
};
// RgpSqttMarkerUserEvent - RGP SQ thread-tracing marker for a user event.
// A single dword: named bit-fields (4 + 8 + 8 + 12 = 32 bits) overlaid on the raw dword01 view.
union RgpSqttMarkerUserEvent
{
struct
{
uint32_t identifier : 4; // Identifier for this marker
uint32_t extDwords : 8; // Number of extra dwords following this marker
uint32_t dataType : 8; // The type for this marker
uint32_t reserved : 12; // reserved
};
union RgpSqttMarkerUserEvent {
struct {
uint32_t identifier : 4; // Identifier for this marker
uint32_t extDwords : 8; // Number of extra dwords following this marker
uint32_t dataType : 8; // The type for this marker
uint32_t reserved : 12; // reserved
};
uint32_t dword01; // The first dword
uint32_t dword01; // The first dword
};
constexpr uint32_t RgpSqttMarkerUserEventWordCount = 1;
@@ -289,21 +265,20 @@ constexpr uint32_t RgpSqttMarkerUserEventWordCount = 1;
// The maximum length of a user event marker string, measured in dwords
static constexpr size_t RgpSqttMaxUserEventStringLengthInDwords = 1024;
// RgpSqttMarkerUserEventWithString - RGP SQ thread-tracing marker for a user event with a string (push and trigger data types)
struct RgpSqttMarkerUserEventWithString
{
RgpSqttMarkerUserEvent header;
// RgpSqttMarkerUserEventWithString - RGP SQ thread-tracing marker for a user event with a
// string (push and trigger data types)
struct RgpSqttMarkerUserEventWithString {
RgpSqttMarkerUserEvent header; // Marker header dword (identifier, extDwords, dataType)
uint32_t stringLength; // Length of the string (in characters)
uint32_t stringData[RgpSqttMaxUserEventStringLengthInDwords]; // String data in UTF-8 format
uint32_t stringLength; // Length of the string (in characters)
// String data in UTF-8 format. WriteUserEventMarker packs four characters per dword:
// character i goes into stringData[i / 4], shifted left by 8 * (i % 4) bits.
uint32_t stringData[RgpSqttMaxUserEventStringLengthInDwords]; // String data in UTF-8 format
};
// ================================================================================================
// This class provides functionality to interact with the GPU Open Developer Mode message passing
// service and the rest of the driver.
class RgpCaptureMgr
{
public:
class RgpCaptureMgr {
public:
~RgpCaptureMgr();
static RgpCaptureMgr* Create(Pal::IPlatform* platform, const Device& device);
@@ -321,45 +296,42 @@ public:
bool IsQueueTimingActive() const;
void WriteBarrierStartMarker(
const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const;
void WriteBarrierEndMarker(
const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const;
void WriteBarrierStartMarker(const VirtualGPU* gpu,
const Pal::Developer::BarrierData& data) const;
void WriteBarrierEndMarker(const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const;
bool RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue, bool* debug_vmid) const;
Pal::Result TimedQueueSubmit(
Pal::IQueue* queue, uint64_t cmdId, const Pal::SubmitInfo& submitInfo) const;
Pal::Result TimedQueueSubmit(Pal::IQueue* queue, uint64_t cmdId,
const Pal::SubmitInfo& submitInfo) const;
private:
private:
// Steps that an RGP trace goes through
enum class TraceStatus
{
Idle = 0, // No active trace and none requested
Preparing, // A trace has been requested but is not active yet because we are
// currently sampling timing information over some number of lead frames.
Running, // SQTT and queue timing is currently active for all command buffer submits.
WaitingForSqtt, // Entered from Running by EndRGPHardwareTrace; left by EndRGPTrace.
WaitingForResults // Tracing is no longer active, but all results are not yet ready.
enum class TraceStatus {
Idle = 0, // No active trace and none requested
Preparing, // A trace has been requested but is not active yet because we are
// currently sampling timing information over some number of lead frames.
Running, // SQTT and queue timing is currently active for all command buffer submits.
WaitingForSqtt, // Entered from Running by EndRGPHardwareTrace; left by EndRGPTrace.
WaitingForResults // Tracing is no longer active, but all results are not yet ready.
};
// All per-device state to support RGP tracing
struct TraceState
{
TraceStatus status_; // Current trace status (idle, running, etc.)
struct TraceState {
TraceStatus status_; // Current trace status (idle, running, etc.)
// NOTE(review): the three event comments below were identical in the original ("trace-end
// cmdbuf retires") and look copy-pasted. Only begin_sqtt_event_ can be described from the
// visible code; confirm the other two against EndRGPHardwareTrace / EndRGPTrace.
GpuEvent begin_sqtt_event_; // Event ended on MainEngine right after the GPA BeginSample in BeginRGPTrace
GpuEvent end_sqtt_event_; // Presumably signaled when the SQTT-stop cmdbuf retires - confirm
GpuEvent end_event_; // Event that is signaled when a trace-end cmdbuf retires
GpuEvent begin_sqtt_event_; // Event ended on MainEngine right after the GPA BeginSample in BeginRGPTrace
GpuEvent end_sqtt_event_; // Presumably signaled when the SQTT-stop cmdbuf retires - confirm
GpuEvent end_event_; // Event that is signaled when a trace-end cmdbuf retires
VirtualGPU* prepare_queue_; // The queue that triggered the full start of a trace
VirtualGPU* begin_queue_; // The queue that triggered starting SQTT
VirtualGPU* prepare_queue_; // The queue that triggered the full start of a trace
VirtualGPU* begin_queue_; // The queue that triggered starting SQTT
GpuUtil::GpaSession* gpa_session_; // GPA session helper object for building RGP data
uint32_t gpa_sample_id_; // Sample ID associated with the current trace
bool queue_timing_; // Queue timing is enabled
GpuUtil::GpaSession* gpa_session_; // GPA session helper object for building RGP data
uint32_t gpa_sample_id_; // Sample ID associated with the current trace
bool queue_timing_; // Queue timing is enabled
uint32_t prepared_disp_count_; // Number of dispatches counted while preparing for a trace
uint32_t sqtt_disp_count_; // Number of dispatches counted while SQTT tracing is active
mutable uint32_t current_event_id_; // Current event ID
uint32_t prepared_disp_count_; // Number of dispatches counted while preparing for a trace
uint32_t sqtt_disp_count_; // Number of dispatches counted while SQTT tracing is active
mutable uint32_t current_event_id_; // Current event ID
};
RgpCaptureMgr(Pal::IPlatform* platform, const Device& device);
@@ -374,25 +346,25 @@ private:
static bool GpuSupportsTracing(const Pal::DeviceProperties& props, const Settings& settings);
RgpSqttMarkerEvent BuildEventMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType api_type) const;
void WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const;
void WriteEventWithDimsMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType apiType,
uint32_t x, uint32_t y, uint32_t z) const;
void WriteEventWithDimsMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType apiType, uint32_t x,
uint32_t y, uint32_t z) const;
void WriteUserEventMarker(const VirtualGPU* gpu, RgpSqttMarkerUserEventType eventType,
const std::string& name) const;
const std::string& name) const;
const Device& device_;
const Device& device_;
DevDriver::DevDriverServer* dev_driver_server_;
DevDriver::RGPProtocol::RGPServer* rgp_server_;
mutable amd::Monitor trace_mutex_;
TraceState trace_;
mutable amd::Monitor trace_mutex_;
TraceState trace_;
RgpSqttMarkerUserEventWithString* user_event_;
uint32_t num_prep_disp_;
uint32_t max_sqtt_disp_; // Maximum number of the dispatches allowed in the trace
uint32_t trace_gpu_mem_limit_;
uint32_t global_disp_count_;
uint32_t num_prep_disp_;
uint32_t max_sqtt_disp_; // Maximum number of the dispatches allowed in the trace
uint32_t trace_gpu_mem_limit_;
uint32_t global_disp_count_;
bool trace_enabled_; // True if tracing is currently enabled (master flag)
bool inst_tracing_enabled_; // Enable instruction-level SQTT tokens
bool trace_enabled_; // True if tracing is currently enabled (master flag)
bool inst_tracing_enabled_; // Enable instruction-level SQTT tokens
PAL_DISALLOW_DEFAULT_CTOR(RgpCaptureMgr);
PAL_DISALLOW_COPY_AND_ASSIGN(RgpCaptureMgr);
@@ -400,11 +372,9 @@ private:
// ================================================================================================
// Returns true if queue operations are currently being timed by RGP traces.
//
// That is the case when trace_.queue_timing_ is set and the trace status is Running, Preparing
// or WaitingForSqtt. trace_ is read here without taking trace_mutex_.
inline bool RgpCaptureMgr::IsQueueTimingActive() const
{
inline bool RgpCaptureMgr::IsQueueTimingActive() const {
return (trace_.queue_timing_ &&
(trace_.status_ == TraceStatus::Running ||
trace_.status_ == TraceStatus::Preparing ||
(trace_.status_ == TraceStatus::Running || trace_.status_ == TraceStatus::Preparing ||
trace_.status_ == TraceStatus::WaitingForSqtt));
}
};
}; // namespace pal
+42 -56
View file
@@ -27,11 +27,9 @@ typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD;
namespace pal {
void HSAILKernel::setWorkGroupInfo(const uint32_t privateSegmentSize,
const uint32_t groupSegmentSize,
const uint16_t numSGPRs,
const uint32_t groupSegmentSize, const uint16_t numSGPRs,
const uint16_t numVGPRs) {
workGroupInfo_.scratchRegs_ =
amd::alignUp(privateSegmentSize, 16) / sizeof(uint);
workGroupInfo_.scratchRegs_ = amd::alignUp(privateSegmentSize, 16) / sizeof(uint);
workGroupInfo_.privateMemSize_ = privateSegmentSize;
workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = groupSegmentSize;
workGroupInfo_.usedSGPRs_ = numSGPRs;
@@ -63,13 +61,13 @@ bool HSAILKernel::setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t
}
// Copy code object of this kernel from the program CPU segment
memcpy(akc, reinterpret_cast<void*>(prog().findHostKernelAddress(code_)), sizeof(amd_kernel_code_t));
memcpy(akc, reinterpret_cast<void*>(prog().findHostKernelAddress(code_)),
sizeof(amd_kernel_code_t));
return true;
}
bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
amd_kernel_code_t* akc = &akc_;
if (!setKernelCode(sym, akc)) {
@@ -77,18 +75,16 @@ bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
}
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,
reinterpret_cast<void*>(&codeSize_))) {
reinterpret_cast<void*>(&codeSize_))) {
return false;
}
// Set up the workgroup info
setWorkGroupInfo(akc->workitem_private_segment_byte_size,
akc->workgroup_group_segment_byte_size,
akc->wavefront_sgpr_count,
akc->workitem_vgpr_count);
// Set up the workgroup info
setWorkGroupInfo(akc->workitem_private_segment_byte_size, akc->workgroup_group_segment_byte_size,
akc->wavefront_sgpr_count, akc->workitem_vgpr_count);
workgroupGroupSegmentByteSize_ = workGroupInfo_.usedLDSSize_;
kernargSegmentByteSize_ = akc->kernarg_segment_byte_size;
kernargSegmentByteSize_ = akc->kernarg_segment_byte_size;
spillSegmentByteSize_ = amd::alignUp(workGroupInfo_.privateMemSize_, sizeof(uint32_t));
return true;
@@ -102,16 +98,14 @@ HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compi
codeSize_(0),
workgroupGroupSegmentByteSize_(0),
kernargSegmentByteSize_(0),
spillSegmentByteSize_(0)
{
spillSegmentByteSize_(0) {
flags_.hsa_ = true;
}
// Destructor; the body is empty, so no explicit cleanup is performed here.
HSAILKernel::~HSAILKernel() {
}
HSAILKernel::~HSAILKernel() {}
bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
#if defined(WITH_COMPILER_LIB)
#if defined(WITH_COMPILER_LIB)
acl_error error = ACL_SUCCESS;
std::string openClKernelName = openclMangledName(name());
flags_.internalKernel_ =
@@ -274,12 +268,14 @@ const HSAILProgram& HSAILKernel::prog() const {
return reinterpret_cast<const HSAILProgram&>(prog_);
}
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
const_address parameters, size_t ldsAddress, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel,
const amd::NDRangeContainer& sizes,
const_address parameters,
size_t ldsAddress, uint64_t vmDefQueue,
uint64_t* vmParentWrap) const {
uint64_t argList;
address aqlArgBuf = gpu.managedBuffer().reserve(
argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t), &argList);
argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t), &argList);
gpu.addVmMemory(gpu.managedBuffer().activeMemory());
if (dynamicParallelism()) {
@@ -307,8 +303,8 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
break;
case amd::KernelParameterDescriptor::HiddenGlobalOffsetY:
if (sizes.dimensions() >= 2) {
offset = sizes.offset()[1];
WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
offset = sizes.offset()[1];
WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
}
break;
case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ:
@@ -322,8 +318,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
// and printf buffer was allocated
(gpu.printfDbgHSA().dbgBuffer() != nullptr)) {
// and set the fourth argument as the printf_buffer pointer
size_t bufferPtr = static_cast<size_t>(gpu.printfDbgHSA().
dbgBuffer()->vmAddress());
size_t bufferPtr = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
gpu.addVmMemory(gpu.printfDbgHSA().dbgBuffer());
WriteAqlArgAt(const_cast<address>(parameters), &bufferPtr, it.size_, it.offset_);
}
@@ -346,11 +341,11 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
// Note: In a case of structs the size won't match,
// since HSAIL compiler expects a reference...
assert(argsBufferSize() <= signature.paramsSize() &&
"A mismatch of sizes of arguments between compiler and runtime!");
"A mismatch of sizes of arguments between compiler and runtime!");
//hsa_kernel_dispatch_packet_t disp;
hsa_kernel_dispatch_packet_t* hsaDisp = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
gpu.cb(0)->SysMemCopy());
// hsa_kernel_dispatch_packet_t disp;
hsa_kernel_dispatch_packet_t* hsaDisp =
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(gpu.cb(0)->SysMemCopy());
amd::NDRange local(sizes.local());
const amd::NDRange& global = sizes.global();
@@ -359,10 +354,10 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
FindLocalWorkSize(sizes.dimensions(), sizes.global(), local);
constexpr uint16_t kDispatchPacketHeader =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
hsaDisp->header = kDispatchPacketHeader;
hsaDisp->setup = sizes.dimensions();
@@ -387,7 +382,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
memcpy(aqlArgBuf + argsBufferSize(), hsaDisp, sizeof(hsa_kernel_dispatch_packet_t));
if (AMD_HSA_BITS_GET(akc_.kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
gpu.addVmMemory(gpu.hsaQueueMem());
}
@@ -407,7 +402,7 @@ static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const s
}
return nullptr;
}
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
#if defined(USE_COMGR_LIBRARY)
bool LightningKernel::init() {
@@ -419,7 +414,7 @@ bool LightningKernel::init() {
return false;
}
KernelMD kernelMD;
KernelMD kernelMD;
if (!GetAttrCodePropMetadata(*kernelMetaNode, &kernelMD)) {
return false;
}
@@ -427,8 +422,8 @@ bool LightningKernel::init() {
symbolName_ = (codeObjectVer() == 2) ? name() : kernelMD.mSymbolName;
workgroupGroupSegmentByteSize_ = kernelMD.mCodeProps.mGroupSegmentFixedSize;
spillSegmentByteSize_ = amd::alignUp(kernelMD.mCodeProps.mPrivateSegmentFixedSize,
sizeof(uint32_t));
spillSegmentByteSize_ =
amd::alignUp(kernelMD.mCodeProps.mPrivateSegmentFixedSize, sizeof(uint32_t));
kernargSegmentByteSize_ = kernelMD.mCodeProps.mKernargSegmentSize;
// Copy codeobject of this kernel from the program CPU segment
@@ -451,7 +446,7 @@ bool LightningKernel::init() {
// Get the runtime handle symbol GPU address
rth_symbol = prog().GetSymbol(const_cast<char*>(kernelMD.mAttrs.mRuntimeHandle.c_str()),
const_cast<hsa_agent_t*>(&agent));
const_cast<hsa_agent_t*>(&agent));
uint64_t symbol_address;
rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);
@@ -461,19 +456,14 @@ bool LightningKernel::init() {
uint64_t kernel_object = gpuAqlCode();
VirtualGPU* gpu = codeSegGpu.dev().xferQueue();
const struct RuntimeHandle runtime_handle = {
gpuAqlCode(),
spillSegSize(),
ldsSize()
};
const struct RuntimeHandle runtime_handle = {gpuAqlCode(), spillSegSize(), ldsSize()};
codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true);
}
// Set up the workgroup info
setWorkGroupInfo(kernelMD.mCodeProps.mPrivateSegmentFixedSize,
kernelMD.mCodeProps.mGroupSegmentFixedSize,
kernelMD.mCodeProps.mNumSGPRs,
kernelMD.mCodeProps.mGroupSegmentFixedSize, kernelMD.mCodeProps.mNumSGPRs,
kernelMD.mCodeProps.mNumVGPRs);
// Copy wavefront size
@@ -499,10 +489,10 @@ bool LightningKernel::init() {
return true;
}
#endif // defined(USE_COMGR_LIBRARY)
#endif // defined(USE_COMGR_LIBRARY)
bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
#if defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY)
flags_.internalKernel_ =
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
@@ -545,7 +535,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
// Get the runtime handle symbol GPU address
rth_symbol = prog().GetSymbol(const_cast<char*>(kernelMD->mAttrs.mRuntimeHandle.c_str()),
const_cast<hsa_agent_t*>(&agent));
const_cast<hsa_agent_t*>(&agent));
uint64_t symbol_address;
rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);
@@ -554,11 +544,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
uint64_t offset = symbol_address - codeSegGpu.vmAddress();
VirtualGPU* gpu = codeSegGpu.dev().xferQueue();
const struct RuntimeHandle runtime_handle = {
gpuAqlCode(),
spillSegSize(),
ldsSize()
};
const struct RuntimeHandle runtime_handle = {gpuAqlCode(), spillSegSize(), ldsSize()};
codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true);
}
@@ -584,7 +570,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
waveLimiter_.enable();
*/
#endif // defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
#endif // defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
return true;
}
+17 -19
Просмотреть файл
@@ -20,14 +20,14 @@ namespace amd {
namespace hsa {
namespace loader {
class Symbol;
} // loader
} // namespace loader
namespace code {
namespace Kernel {
class Metadata;
} // Kernel
} // code
} // hsa
} // amd
} // namespace Kernel
} // namespace code
} // namespace hsa
} // namespace amd
//! \namespace pal PAL Device Implementation
namespace pal {
@@ -43,7 +43,6 @@ class LightningProgram;
*/
class HSAILKernel : public device::Kernel {
public:
HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions);
virtual ~HSAILKernel();
@@ -106,21 +105,19 @@ class HSAILKernel : public device::Kernel {
bool setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc);
//! Set up the workgroup info based on the kernel metadata
void setWorkGroupInfo(const uint32_t privateSegmentSize,
const uint32_t groupSegmentSize,
const uint16_t numSGPRs,
const uint16_t numVGPRs);
void setWorkGroupInfo(const uint32_t privateSegmentSize, const uint32_t groupSegmentSize,
const uint16_t numSGPRs, const uint16_t numVGPRs);
std::string compileOptions_; //!< compile used for finalizing this kernel
amd_kernel_code_t akc_; //!< AQL kernel code on CPU
uint index_; //!< Kernel index in the program
std::string compileOptions_; //!< compile used for finalizing this kernel
amd_kernel_code_t akc_; //!< AQL kernel code on CPU
uint index_; //!< Kernel index in the program
uint64_t code_; //!< GPU memory pointer to the kernel
size_t codeSize_; //!< Size of ISA code
uint64_t code_; //!< GPU memory pointer to the kernel
size_t codeSize_; //!< Size of ISA code
uint32_t workgroupGroupSegmentByteSize_; //!< LDS size used in the kernel
uint32_t kernargSegmentByteSize_; //!< Size of kernel argument buffer
uint32_t spillSegmentByteSize_; //!< Spill reg size per workitem
uint32_t workgroupGroupSegmentByteSize_; //!< LDS size used in the kernel
uint32_t kernargSegmentByteSize_; //!< Size of kernel argument buffer
uint32_t spillSegmentByteSize_; //!< Spill reg size per workitem
};
class LightningKernel : public HSAILKernel {
@@ -140,4 +137,5 @@ class LightningKernel : public HSAILKernel {
#endif
};
/*@}*/} // namespace pal
/*@}*/ // namespace pal
} // namespace pal
+12 -18
Просмотреть файл
@@ -23,27 +23,21 @@
namespace pal {
Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t size)
: device::Memory(owner), Resource(gpuDev, size)
, pinnedMemory_(nullptr)
, parent_(nullptr) {
: device::Memory(owner), Resource(gpuDev, size), pinnedMemory_(nullptr), parent_(nullptr) {
if (owner.parent() != nullptr) {
flags_ |= SubMemoryObject;
}
}
Memory::Memory(const Device& gpuDev, size_t size)
: device::Memory(size), Resource(gpuDev, size)
, pinnedMemory_(nullptr)
, parent_(nullptr) {
}
: device::Memory(size), Resource(gpuDev, size), pinnedMemory_(nullptr), parent_(nullptr) {}
Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t width, size_t height, size_t depth,
cl_image_format format, cl_mem_object_type imageType, uint mipLevels)
: device::Memory(owner), Resource(gpuDev, width, height, depth, format, imageType, mipLevels)
, pinnedMemory_(nullptr)
, parent_(nullptr) {
: device::Memory(owner),
Resource(gpuDev, width, height, depth, format, imageType, mipLevels),
pinnedMemory_(nullptr),
parent_(nullptr) {
if (owner.parent() != nullptr) {
flags_ |= SubMemoryObject;
}
@@ -51,10 +45,10 @@ Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t width, size_t he
Memory::Memory(const Device& gpuDev, size_t size, size_t width, size_t height, size_t depth,
cl_image_format format, cl_mem_object_type imageType, uint mipLevels)
: device::Memory(size), Resource(gpuDev, width, height, depth, format, imageType, mipLevels)
, pinnedMemory_(nullptr)
, parent_(nullptr) {
}
: device::Memory(size),
Resource(gpuDev, width, height, depth, format, imageType, mipLevels),
pinnedMemory_(nullptr),
parent_(nullptr) {}
#ifdef _WIN32
static HANDLE getSharedHandle(IUnknown* pIface) {
@@ -130,7 +124,7 @@ bool Memory::create(Resource::MemoryType memType, Resource::CreateParams* params
break;
case Resource::Remote:
case Resource::RemoteUSWC:
if ((!desc().tiled_) && (desc().dimSize_ != 3)) {
if ((!desc().tiled_) && (desc().dimSize_ != 3)) {
// Marks memory object for direct GPU access to the host memory
flags_ |= HostMemoryDirectAccess;
}
@@ -402,7 +396,7 @@ Memory::~Memory() {
(memoryType() != Resource::ExternalPhysical)) {
// Unmap memory if direct access was requested
// Note: runtime will perform unmap on the actual resource destruction
//unmap(nullptr);
// unmap(nullptr);
}
}
+17 -15
Просмотреть файл
@@ -32,12 +32,12 @@ class Memory : public device::Memory, public Resource {
Memory(const Device& gpuDev, //!< GPU device object
amd::Memory& owner, //!< Abstraction layer memory object
size_t size //!< Memory size for allocation
);
);
//! Constructor (nonfat version for local scratch mem use without heap block)
Memory(const Device& gpuDev, //!< GPU device object
size_t size //!< Memory size for allocation
);
);
//! Constructor memory for images (without global heap allocation)
Memory(const Device& gpuDev, //!< GPU device object
@@ -48,7 +48,7 @@ class Memory : public device::Memory, public Resource {
cl_image_format format, //!< Memory format
cl_mem_object_type imageType, //!< CL image type
uint mipLevels //!< The number of mip levels
);
);
//! Constructor memory for images (without global heap allocation)
Memory(const Device& gpuDev, //!< GPU device object
@@ -59,7 +59,7 @@ class Memory : public device::Memory, public Resource {
cl_image_format format, //!< Memory format
cl_mem_object_type imageType, //!< CL image type
uint mipLevels //!< The number of mip levels
);
);
//! Default destructor
~Memory();
@@ -70,7 +70,7 @@ class Memory : public device::Memory, public Resource {
//! Overloads the resource create method
virtual bool create(Resource::MemoryType memType, //!< Memory type
Resource::CreateParams* params = NULL //!< Prameters for create
);
);
//! Allocate memory for API-level maps
virtual void* allocMapTarget(const amd::Coord3D& origin, //!< The map location in memory
@@ -78,12 +78,12 @@ class Memory : public device::Memory, public Resource {
uint mapFlags, //!< Map flags
size_t* rowPitch = NULL, //!< Row pitch for the mapped memory
size_t* slicePitch = NULL //!< Slice for the mapped memory
);
);
//! Pins system memory associated with this memory object
virtual bool pinSystemMemory(void* hostPtr, //!< System memory address
size_t size //!< Size of allocated system memory
);
);
//! Releases indirect map surface
virtual void releaseIndirectMap() { decIndMapCount(); }
@@ -96,15 +96,15 @@ class Memory : public device::Memory, public Resource {
uint numLayers = 0, //!< End layer for multilayer map
size_t* rowPitch = NULL, //!< Row pitch for the device memory
size_t* slicePitch = NULL //!< Slice pitch for the device memory
);
);
//! Unmap the device memory
virtual void cpuUnmap(device::VirtualDevice& vDev //!< Virtual device for unmap operaiton
);
);
//! Updates device memory from the owner's host allocation
void syncCacheFromHost(VirtualGPU& gpu, //!< Virtual GPU device object
//! Synchronization flags
//! Synchronization flags
device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags());
//! Updates the owner's host allocation from device memory
@@ -115,11 +115,13 @@ class Memory : public device::Memory, public Resource {
//! Creates a view from current resource
virtual Memory* createBufferView(
amd::Memory& subBufferOwner //!< The abstraction layer subbuf owner
);
);
virtual uint64_t virtualAddress() const override { return vmAddress(); }
virtual const address cpuSrd() const { return reinterpret_cast<const address>(const_cast<void*>(hwState())); }
virtual const address cpuSrd() const {
return reinterpret_cast<const address>(const_cast<void*>(hwState()));
}
//! Allocates host memory for synchronization with MGPU context
void mgpuCacheWriteBack();
@@ -161,8 +163,8 @@ class Memory : public device::Memory, public Resource {
//! Disable operator=
Memory& operator=(const Memory&);
Memory* pinnedMemory_; //!< Memory used as pinned system memory
const Memory* parent_; //!< Parent memory object
Memory* pinnedMemory_; //!< Memory used as pinned system memory
const Memory* parent_; //!< Parent memory object
};
class Buffer : public pal::Memory {
@@ -219,7 +221,7 @@ class Image : public pal::Memory {
uint mapFlags, //!< Map flags
size_t* rowPitch = NULL, //!< Row pitch for the mapped memory
size_t* slicePitch = NULL //!< Slice for the mapped memory
);
);
virtual uint64_t virtualAddress() const override { return hwSrd(); }
+18 -17
Просмотреть файл
@@ -11,7 +11,7 @@
#ifndef isinf
#ifdef _MSC_VER
#define isinf(X) (!_finite(X) && !_isnan(X))
#else //!_MSC_VER
#else //!_MSC_VER
#define isinf(X) (std::isinf(X))
#endif //!_MSC_VER
#endif // isinf
@@ -19,7 +19,7 @@
#ifndef isnan
#ifdef _MSC_VER
#define isnan(X) (_isnan(X))
#else //!_MSC_VER
#else //!_MSC_VER
#define isnan(X) (std::isnan(X))
#endif //!_MSC_VER
#endif // isnan
@@ -55,14 +55,14 @@ class PrintfDbg : public amd::HeapObject {
bool init(VirtualGPU& gpu, //!< Virtual GPU object
bool printfEnabled, //!< checks for printf
const amd::NDRange& size //!< Kernel's workload
);
);
//! Prints the kernel's debug information from the buffer
bool output(VirtualGPU& gpu, //!< Virtual GPU object
bool printfEnabled, //!< checks for printf
const amd::NDRange& size, //!< Kernel's workload
bool output(VirtualGPU& gpu, //!< Virtual GPU object
bool printfEnabled, //!< checks for printf
const amd::NDRange& size, //!< Kernel's workload
const std::vector<device::PrintfInfo>& printfInfo //!< printf info
);
);
//! Debug buffer size per workitem
size_t wiDbgSize() const { return wiDbgSize_; }
@@ -81,7 +81,7 @@ class PrintfDbg : public amd::HeapObject {
//! Allocates the debug buffer
bool allocate(bool realloc = false //!< If TRUE then reallocate the debug memory
);
);
//! Returns TRUE if a float value has to be printed
bool checkFloat(const std::string& fmt //!< Format string
@@ -105,9 +105,9 @@ class PrintfDbg : public amd::HeapObject {
) const;
//! Displays the PrintfDbg
void outputDbgBuffer(const device::PrintfInfo& info,//!< printf info
const uint32_t* workitemData, //!< The PrintfDbg dump buffer
size_t& i //!< index to the data in the buffer
void outputDbgBuffer(const device::PrintfInfo& info, //!< printf info
const uint32_t* workitemData, //!< The PrintfDbg dump buffer
size_t& i //!< index to the data in the buffer
) const;
private:
@@ -127,7 +127,7 @@ class PrintfDbg : public amd::HeapObject {
uint32_t* mapWorkitem(VirtualGPU& gpu, //!< Virtual GPU object
size_t idx, //!< Workitem global index
bool* realloc //!< Returns TRUE if workitem reached the buffer limit
);
);
//! Unmap the staged buffer
void unmapWorkitem(VirtualGPU& gpu, //!< Virtual GPU object
@@ -145,13 +145,13 @@ class PrintfDbgHSA : public PrintfDbg {
//! Initializes the debug buffer before kernel's execution
bool init(VirtualGPU& gpu, //!< Virtual GPU object
bool printfEnabled //!< checks for printf
);
);
//! Prints the kernel's debug information from the buffer
bool output(VirtualGPU& gpu, //!< Virtual GPU object
bool printfEnabled, //!< checks for printf
bool output(VirtualGPU& gpu, //!< Virtual GPU object
bool printfEnabled, //!< checks for printf
const std::vector<device::PrintfInfo>& printfInfo //!< printf info
);
);
private:
//! Disable copy constructor
@@ -161,4 +161,5 @@ class PrintfDbgHSA : public PrintfDbg {
PrintfDbgHSA& operator=(const PrintfDbgHSA&);
};
/*@}*/} // namespace pal
/*@}*/ // namespace pal
} // namespace pal
+32 -37
Просмотреть файл
@@ -65,10 +65,10 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t
align = amd::alignUp(align, sizeof(uint32_t));
amd::Memory* amd_mem_obj = new (prog.dev().context())
amd::Buffer(prog.dev().context(), 0, amd::alignUp(size, align),
// HIP requires SVM allocation for segment code due to possible global variable access and
// global variables are a part of code segment with the latest loader
amd::IS_HIP ? reinterpret_cast<void*>(1) : nullptr);
amd::Buffer(prog.dev().context(), 0, amd::alignUp(size, align),
// HIP requires SVM allocation for segment code due to possible global variable
// access and global variables are a part of code segment with the latest loader
amd::IS_HIP ? reinterpret_cast<void*>(1) : nullptr);
if (amd_mem_obj == nullptr) {
LogError("[OCL] failed to create a mem object!");
@@ -103,9 +103,9 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t
if (zero && !prog.isInternal()) {
uint64_t pattern = 0;
size_t patternSize = ((size % sizeof(pattern)) == 0) ? sizeof(pattern) : 1;
prog.dev().xferMgr().fillBuffer(*gpuAccess_, &pattern, patternSize,
amd::Coord3D(0), amd::Coord3D(size));
size_t patternSize = ((size % sizeof(pattern)) == 0) ? sizeof(pattern) : 1;
prog.dev().xferMgr().fillBuffer(*gpuAccess_, &pattern, patternSize, amd::Coord3D(0),
amd::Coord3D(size));
}
switch (segment) {
@@ -237,7 +237,7 @@ inline static std::vector<std::string> splitSpaceSeparatedString(char* str) {
}
bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_t binSize) {
#if defined(WITH_COMPILER_LIB)
#if defined(WITH_COMPILER_LIB)
// ACL_TYPE_CG stage is not performed for offline compilation
hsa_agent_t agent;
agent.handle = 1;
@@ -262,8 +262,8 @@ bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_
}
size_t kernelNamesSize = 0;
acl_error errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES,
nullptr, nullptr, &kernelNamesSize);
acl_error errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr,
nullptr, &kernelNamesSize);
if (errorCode != ACL_SUCCESS) {
buildLog_ += "Error: Querying of kernel names size from the binary failed.\n";
return false;
@@ -274,11 +274,11 @@ bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_
&kernelNamesSize);
if (errorCode != ACL_SUCCESS) {
buildLog_ += "Error: Querying of kernel names from the binary failed.\n";
delete [] kernelNames;
delete[] kernelNames;
return false;
}
std::vector<std::string> vKernels = splitSpaceSeparatedString(kernelNames);
delete [] kernelNames;
delete[] kernelNames;
bool dynamicParallelism = false;
for (const auto& it : vKernels) {
std::string kernelName(it);
@@ -338,12 +338,10 @@ bool HSAILProgram::allocKernelTable() {
return true;
}
void HSAILProgram::fillResListWithKernels(VirtualGPU& gpu) const {
gpu.addVmMemory(&codeSegGpu());
}
void HSAILProgram::fillResListWithKernels(VirtualGPU& gpu) const { gpu.addVmMemory(&codeSegGpu()); }
const aclTargetInfo& HSAILProgram::info(const char* str) {
#if defined(WITH_COMPILER_LIB)
#if defined(WITH_COMPILER_LIB)
acl_error err;
std::string arch = "hsail";
if (dev().settings().use64BitPtr_) {
@@ -359,7 +357,7 @@ const aclTargetInfo& HSAILProgram::info(const char* str) {
}
bool HSAILProgram::saveBinaryAndSetType(type_t type) {
#if defined(WITH_COMPILER_LIB)
#if defined(WITH_COMPILER_LIB)
// Write binary to memory
if (rawBinary_ != nullptr) {
// Free memory containing rawBinary
@@ -378,8 +376,8 @@ bool HSAILProgram::saveBinaryAndSetType(type_t type) {
return true;
}
bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_pptr,
size_t* bytes, const char* global_name) const {
bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_pptr, size_t* bytes,
const char* global_name) const {
uint32_t length = 0;
size_t offset = 0;
uint32_t flags = 0;
@@ -456,7 +454,7 @@ bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_p
}
/* Retrieve the Offset from global pal::Memory created @ segment::alloc */
if(!codeSegment_->gpuAddressOffset(reinterpret_cast<uint64_t>(*device_pptr), &offset)) {
if (!codeSegment_->gpuAddressOffset(reinterpret_cast<uint64_t>(*device_pptr), &offset)) {
buildLog_ += "Error: Cannot Retrieve the Address Offset";
buildLog_ += "\n";
return false;
@@ -484,13 +482,12 @@ bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_p
hsa_isa_t PALHSALoaderContext::IsaFromName(const char* name) {
hsa_isa_t isa = {0};
uint32_t gfxip = 0;
uint32_t gfxip = 0;
std::string gfx_target(name);
if (gfx_target.find("amdgcn-") == 0) {
std::string gfxip_version_str = gfx_target.substr(gfx_target.find("gfx") + 3);
gfxip = std::atoi(gfxip_version_str.c_str());
}
else {
} else {
// FIXME: Old way. To be removed.
uint32_t shift = 1;
size_t last = gfx_target.length();
@@ -508,9 +505,9 @@ hsa_isa_t PALHSALoaderContext::IsaFromName(const char* name) {
}
bool PALHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) {
uint32_t gfxipVersion = program_->dev().settings().useLightning_ ?
program_->dev().hwInfo()->gfxipVersionLC_ :
program_->dev().hwInfo()->gfxipVersion_;
uint32_t gfxipVersion = program_->dev().settings().useLightning_
? program_->dev().hwInfo()->gfxipVersionLC_
: program_->dev().hwInfo()->gfxipVersion_;
uint32_t majorSrc = gfxipVersion / 10;
uint32_t minorSrc = gfxipVersion % 10;
@@ -519,11 +516,9 @@ bool PALHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa)
if (majorSrc != majorTrg) {
return false;
}
else if (minorTrg == minorSrc) {
} else if (minorTrg == minorSrc) {
return true;
}
else if (minorTrg < minorSrc) {
} else if (minorTrg < minorSrc) {
LogWarning("ISA downgrade for execution!");
return true;
}
@@ -708,7 +703,7 @@ static hsa_status_t GetKernelNamesCallback(hsa_executable_t hExec, hsa_executabl
return HSA_STATUS_SUCCESS;
}
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
bool LightningProgram::createBinary(amd::option::Options* options) {
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
@@ -716,7 +711,7 @@ bool LightningProgram::createBinary(amd::option::Options* options) {
LogError("Failed to create ELF binary image!");
return false;
}
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
return true;
}
@@ -752,10 +747,10 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
}
#if defined(USE_COMGR_LIBRARY)
for (const auto &kernelMeta : kernelMetadataMap_) {
for (const auto& kernelMeta : kernelMetadataMap_) {
auto kernelName = kernelMeta.first;
auto kernel = new LightningKernel(kernelName, this,
options->origOptionStr + ProcessOptions(options));
auto kernel =
new LightningKernel(kernelName, this, options->origOptionStr + ProcessOptions(options));
kernels()[kernelName] = kernel;
if (!kernel->init()) {
@@ -804,9 +799,9 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
maxScratchRegs_ =
std::max(static_cast<uint>(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
}
#endif // defined(USE_COMGR_LIBRARY)
#endif // defined(USE_COMGR_LIBRARY)
DestroySegmentCpuAccess();
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
return true;
}
+31 -28
Просмотреть файл
@@ -9,15 +9,15 @@
namespace amd {
namespace option {
class Options;
} // option
} // namespace option
namespace hsa {
namespace loader {
class Loader;
class Executable;
class Context;
} // loader
} // hsa
} // amd
} // namespace loader
} // namespace hsa
} // namespace amd
//! \namespace pal PAL Device Implementation
namespace pal {
@@ -50,15 +50,16 @@ class Segment : public amd::HeapObject {
bool gpuAddressOffset(uint64_t offAddr, size_t* offset);
//! Returns address for CPU access in the segment
void* cpuAddress(size_t offset) const
{ return ((cpuAccess_ != nullptr) ? cpuAccess_->data() : cpuMem_) + offset; }
void* cpuAddress(size_t offset) const {
return ((cpuAccess_ != nullptr) ? cpuAccess_->data() : cpuMem_) + offset;
}
void DestroyCpuAccess();
private:
Memory* gpuAccess_; //!< GPU memory for segment access
Memory* cpuAccess_; //!< CPU memory for segment (backing store)
address cpuMem_; //!< CPU memory for segment without GPU direct access (backing store)
Memory* gpuAccess_; //!< GPU memory for segment access
Memory* cpuAccess_; //!< CPU memory for segment (backing store)
address cpuMem_; //!< CPU memory for segment without GPU direct access (backing store)
};
class PALHSALoaderContext final : public Context {
@@ -166,7 +167,7 @@ class HSAILProgram : public device::Program {
}
//! Get symbol by name
amd::hsa::loader::Symbol* GetSymbol(const char* symbol_name, const hsa_agent_t *agent) const {
amd::hsa::loader::Symbol* GetSymbol(const char* symbol_name, const hsa_agent_t* agent) const {
return executable_->GetSymbol(symbol_name, agent);
}
@@ -180,11 +181,14 @@ class HSAILProgram : public device::Program {
virtual bool setKernels(amd::option::Options* options, void* binary, size_t binSize) override;
//! Destroys CPU allocations in the code segment
void DestroySegmentCpuAccess() const
{ if (codeSegment_ != nullptr) { codeSegment_->DestroyCpuAccess(); } }
void DestroySegmentCpuAccess() const {
if (codeSegment_ != nullptr) {
codeSegment_->DestroyCpuAccess();
}
}
virtual bool createGlobalVarObj(amd::Memory** amd_mem_obj, void** dptr,
size_t* bytes, const char* globalName) const;
virtual bool createGlobalVarObj(amd::Memory** amd_mem_obj, void** dptr, size_t* bytes,
const char* globalName) const;
private:
//! Disable default copy constructor
@@ -201,7 +205,7 @@ class HSAILProgram : public device::Program {
std::vector<Memory*> globalStores_; //!< Global memory for the program
Memory* kernels_; //!< Table with kernel object pointers
Memory* codeSegGpu_; //!< GPU memory with code objects
Segment* codeSegment_; //!< Pointer to the code segment for this program
Segment* codeSegment_; //!< Pointer to the code segment for this program
uint
maxScratchRegs_; //!< Maximum number of scratch regs used in the program by individual kernel
std::list<Sampler*> staticSamplers_; //!< List od internal static samplers
@@ -214,19 +218,17 @@ class HSAILProgram : public device::Program {
//! \class Lightning Compiler Program
class LightningProgram : public HSAILProgram {
public:
LightningProgram(NullDevice& device)
: HSAILProgram(device) {
isLC_ = true;
xnackEnabled_ = dev().hwInfo()->xnackEnabled_;
machineTarget_ = dev().hwInfo()->machineTargetLC_;
}
LightningProgram(NullDevice& device) : HSAILProgram(device) {
isLC_ = true;
xnackEnabled_ = dev().hwInfo()->xnackEnabled_;
machineTarget_ = dev().hwInfo()->machineTargetLC_;
}
LightningProgram(Device& device)
: HSAILProgram(device) {
isLC_ = true;
xnackEnabled_ = dev().hwInfo()->xnackEnabled_;
machineTarget_ = dev().hwInfo()->machineTargetLC_;
}
LightningProgram(Device& device) : HSAILProgram(device) {
isLC_ = true;
xnackEnabled_ = dev().hwInfo()->xnackEnabled_;
machineTarget_ = dev().hwInfo()->machineTargetLC_;
}
virtual ~LightningProgram() {}
protected:
@@ -235,4 +237,5 @@ class LightningProgram : public HSAILProgram {
virtual bool createBinary(amd::option::Options* options) override;
};
/*@}*/} // namespace pal
/*@}*/ // namespace pal
} // namespace pal
+161 -187
Просмотреть файл
@@ -41,8 +41,8 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
if (memRef != nullptr) {
result = dev.iDev()->CreateGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_);
if ((result != Pal::Result::Success) &&
// Free cache if PAL failed allocation
dev.resourceCache().free()) {
// Free cache if PAL failed allocation
dev.resourceCache().free()) {
// If cache was freed, then try to allocate again
result = dev.iDev()->CreateGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_);
}
@@ -154,8 +154,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
// ================================================================================================
GpuMemoryReference::GpuMemoryReference(const Device& dev)
: gpuMem_(nullptr), cpuAddress_(nullptr), device_(dev), gpu_(nullptr)
{}
: gpuMem_(nullptr), cpuAddress_(nullptr), device_(dev), gpu_(nullptr) {}
// ================================================================================================
GpuMemoryReference::~GpuMemoryReference() {
@@ -181,8 +180,7 @@ GpuMemoryReference::~GpuMemoryReference() {
iMem()->Unmap();
}
if (0 != iMem()) {
if (!(iMem()->Desc().flags.isShared ||
iMem()->Desc().flags.isExternal ||
if (!(iMem()->Desc().flags.isShared || iMem()->Desc().flags.isExternal ||
iMem()->Desc().flags.isExternPhys)) {
// Update free memory size counters
device_.updateAllocedMemory(iMem()->Desc().preferredHeap, iMem()->Desc().size, true);
@@ -368,7 +366,7 @@ void Resource::memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo) {
case Persistent:
createInfo->heapCount = 2;
createInfo->heaps[0] = Pal::GpuHeapLocal;
createInfo->heaps[1] = Pal:: GpuHeapGartUswc;
createInfo->heaps[1] = Pal::GpuHeapGartUswc;
#ifdef ATI_OS_LINUX
// Note: SSG in Linux requires DGMA heap
if (dev().properties().gpuMemoryProperties.busAddressableMemSize > 0) {
@@ -401,11 +399,10 @@ void Resource::memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo) {
}
// ================================================================================================
bool Resource::CreateImage(CreateParams* params)
{
bool Resource::CreateImage(CreateParams* params) {
Pal::Result result;
Pal::SubresId ImgSubresId = { Pal::ImageAspect::Color, 0, 0 };
Pal::SubresRange ImgSubresRange = { ImgSubresId, 1, 1 };
Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, 0, 0};
Pal::SubresRange ImgSubresRange = {ImgSubresId, 1, 1};
Pal::ChannelMapping channels;
Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels);
@@ -417,8 +414,7 @@ bool Resource::CreateImage(CreateParams* params)
memRef_->retain();
desc_.cardMemory_ = viewOwner_->desc().cardMemory_;
offset_ += viewOwner_->offset_;
}
else {
} else {
Pal::GpuMemoryCreateInfo createInfo = {};
createInfo.size = desc().width_ * elementSize();
createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment);
@@ -427,8 +423,8 @@ bool Resource::CreateImage(CreateParams* params)
createInfo.priority = Pal::GpuMemPriority::Normal;
memTypeToHeap(&createInfo);
// createInfo.priority;
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
createInfo.alignment, nullptr, &subOffset_);
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
nullptr, &subOffset_);
if (nullptr == memRef_) {
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
if (nullptr == memRef_) {
@@ -477,16 +473,16 @@ bool Resource::CreateImage(CreateParams* params)
imgCreateInfo.arraySize = 1;
switch (desc_.topology_) {
case CL_MEM_OBJECT_IMAGE3D:
imgCreateInfo.imageType = Pal::ImageType::Tex3d;
viewInfo.viewType = Pal::ImageViewType::Tex3d;
break;
case CL_MEM_OBJECT_IMAGE1D:
case CL_MEM_OBJECT_IMAGE1D_ARRAY:
case CL_MEM_OBJECT_IMAGE1D_BUFFER:
imgCreateInfo.imageType = Pal::ImageType::Tex1d;
viewInfo.viewType = Pal::ImageViewType::Tex1d;
break;
case CL_MEM_OBJECT_IMAGE3D:
imgCreateInfo.imageType = Pal::ImageType::Tex3d;
viewInfo.viewType = Pal::ImageViewType::Tex3d;
break;
case CL_MEM_OBJECT_IMAGE1D:
case CL_MEM_OBJECT_IMAGE1D_ARRAY:
case CL_MEM_OBJECT_IMAGE1D_BUFFER:
imgCreateInfo.imageType = Pal::ImageType::Tex1d;
viewInfo.viewType = Pal::ImageViewType::Tex1d;
break;
}
if (desc_.topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
ImgSubresRange.numSlices = imgCreateInfo.arraySize = desc_.height_;
@@ -504,8 +500,7 @@ bool Resource::CreateImage(CreateParams* params)
ImgSubresRange.startSubres.arraySlice = imageView->layer_;
viewOwner_ = imageView->resource_;
image_ = viewOwner_->image_;
}
else if (memoryType() == ImageBuffer) {
} else if (memoryType() == ImageBuffer) {
ImageBufferParams* imageBuffer = reinterpret_cast<ImageBufferParams*>(params);
viewOwner_ = imageBuffer->resource_;
}
@@ -515,11 +510,11 @@ bool Resource::CreateImage(CreateParams* params)
ImgSubresRange.numMips = desc().mipLevels_;
if ((memoryType() != ImageView) ||
//! @todo PAL doesn't allow an SRD view creation with different pixel size
(elementSize() != viewOwner_->elementSize())) {
//! @todo PAL doesn't allow an SRD view creation with different pixel size
(elementSize() != viewOwner_->elementSize())) {
imgCreateInfo.usageFlags.shaderRead = true;
imgCreateInfo.usageFlags.shaderWrite =
(format == Pal::ChNumFormat::X8Y8Z8W8_Srgb) ? false : true;
(format == Pal::ChNumFormat::X8Y8Z8W8_Srgb) ? false : true;
imgCreateInfo.swizzledFormat.format = format;
imgCreateInfo.swizzledFormat.swizzle = channels;
imgCreateInfo.mipLevels = (desc_.mipLevels_) ? desc_.mipLevels_ : 1;
@@ -529,10 +524,9 @@ bool Resource::CreateImage(CreateParams* params)
uint32_t rowPitch = 0;
if (((memoryType() == Persistent) && dev().settings().linearPersistentImage_) ||
(memoryType() == ImageBuffer)) {
(memoryType() == ImageBuffer)) {
tiling = Pal::ImageTiling::Linear;
}
else if (memoryType() == ImageView) {
} else if (memoryType() == ImageView) {
tiling = viewOwner_->image_->GetImageCreateInfo().tiling;
// Find the new pitch in pixels for the new format
rowPitch = viewOwner_->desc().pitch_ * viewOwner_->elementSize() / elementSize();
@@ -540,10 +534,9 @@ bool Resource::CreateImage(CreateParams* params)
if (memoryType() == ImageBuffer) {
if ((params->owner_ != NULL) && params->owner_->asImage() &&
(params->owner_->asImage()->getRowPitch() != 0)) {
(params->owner_->asImage()->getRowPitch() != 0)) {
rowPitch = params->owner_->asImage()->getRowPitch() / elementSize();
}
else {
} else {
rowPitch = desc().width_;
}
}
@@ -579,8 +572,8 @@ bool Resource::CreateImage(CreateParams* params)
createInfo.priority = Pal::GpuMemPriority::Normal;
memTypeToHeap(&createInfo);
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
createInfo.alignment, nullptr, &subOffset_);
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
nullptr, &subOffset_);
if (nullptr == memRef_) {
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
if (nullptr == memRef_) {
@@ -589,8 +582,7 @@ bool Resource::CreateImage(CreateParams* params)
}
}
offset_ += static_cast<size_t>(subOffset_);
}
else {
} else {
memRef_ = viewOwner_->memRef_;
memRef_->retain();
desc_.cardMemory_ = viewOwner_->desc().cardMemory_;
@@ -627,11 +619,10 @@ bool Resource::CreateImage(CreateParams* params)
}
// ================================================================================================
bool Resource::CreateInterop(CreateParams* params)
{
bool Resource::CreateInterop(CreateParams* params) {
Pal::Result result;
Pal::SubresId ImgSubresId = { Pal::ImageAspect::Color, 0, 0 };
Pal::SubresRange ImgSubresRange = { ImgSubresId, 1, 1 };
Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, 0, 0};
Pal::SubresRange ImgSubresRange = {ImgSubresId, 1, 1};
Pal::ChannelMapping channels;
Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels);
Pal::ExternalGpuMemoryOpenInfo gpuMemOpenInfo = {};
@@ -645,21 +636,21 @@ bool Resource::CreateInterop(CreateParams* params)
OGLInteropParams* oglRes = reinterpret_cast<OGLInteropParams*>(params);
assert(oglRes->glPlatformContext_ && "We don't have OGL context!");
switch (oglRes->type_) {
case InteropVertexBuffer:
glType_ = GL_RESOURCE_ATTACH_VERTEXBUFFER_AMD;
break;
case InteropRenderBuffer:
glType_ = GL_RESOURCE_ATTACH_RENDERBUFFER_AMD;
break;
case InteropTexture:
case InteropTextureViewLevel:
case InteropTextureViewCube:
glType_ = GL_RESOURCE_ATTACH_TEXTURE_AMD;
break;
default:
LogError("Unknown OGL interop type!");
return false;
break;
case InteropVertexBuffer:
glType_ = GL_RESOURCE_ATTACH_VERTEXBUFFER_AMD;
break;
case InteropRenderBuffer:
glType_ = GL_RESOURCE_ATTACH_RENDERBUFFER_AMD;
break;
case InteropTexture:
case InteropTextureViewLevel:
case InteropTextureViewCube:
glType_ = GL_RESOURCE_ATTACH_TEXTURE_AMD;
break;
default:
LogError("Unknown OGL interop type!");
return false;
break;
}
glPlatformContext_ = oglRes->glPlatformContext_;
layer = oglRes->layer_;
@@ -667,17 +658,18 @@ bool Resource::CreateInterop(CreateParams* params)
mipLevel = oglRes->mipLevel_;
if (!dev().resGLAssociate(oglRes->glPlatformContext_, oglRes->handle_, glType_,
&openInfo.hExternalResource, &glInteropMbRes_, &offset_, desc_.format_
&openInfo.hExternalResource, &glInteropMbRes_, &offset_, desc_.format_
#ifdef ATI_OS_WIN
, openInfo.doppDesktopInfo
,
openInfo.doppDesktopInfo
#endif
)) {
)) {
return false;
}
desc_.isDoppTexture_ = (openInfo.doppDesktopInfo.gpuVirtAddr != 0);
format = dev().getPalFormat(desc().format_, &channels);
}
#ifdef ATI_OS_WIN
#ifdef ATI_OS_WIN
else {
D3DInteropParams* d3dRes = reinterpret_cast<D3DInteropParams*>(params);
openInfo.hExternalResource = d3dRes->handle_;
@@ -713,8 +705,8 @@ bool Resource::CreateInterop(CreateParams* params)
size_t gpuMemSize;
if (Pal::Result::Success !=
dev().iDev()->GetExternalSharedImageSizes(imgOpenInfo, &imageSize, &gpuMemSize,
&imgCreateInfo)) {
dev().iDev()->GetExternalSharedImageSizes(imgOpenInfo, &imageSize, &gpuMemSize,
&imgCreateInfo)) {
return false;
}
@@ -736,51 +728,51 @@ bool Resource::CreateInterop(CreateParams* params)
imgCreateInfo.depthPitch = desc().height_ * imgCreateInfo.rowPitch;
switch (misc) {
case 1: // NV12 or P010 formats
switch (layer) {
case -1:
case 0:
case 1: // NV12 or P010 formats
switch (layer) {
case -1:
case 0:
break;
case 1:
// Y - plane size to the offset
// NV12 format. UV is 2 times smaller plane Y
viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
break;
default:
LogError("Unknown Interop View Type");
return false;
}
break;
case 1:
// Y - plane size to the offset
// NV12 format. UV is 2 times smaller plane Y
viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
case 2: // YV12 format
switch (layer) {
case -1:
case 0:
break;
case 1:
// Y - plane size to the offset
// YV12 format. U is 4 times smaller plane than Y
viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
imgCreateInfo.rowPitch >>= 1;
break;
case 2:
// Y + U plane sizes to the offset.
// U plane is 4 times smaller than Y and U == V
viewOffset = 5 * imgCreateInfo.rowPitch * desc().height_ / 2;
imgCreateInfo.rowPitch >>= 1;
break;
default:
LogError("Unknown Interop View Type");
return false;
}
imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
break;
case 3: // YUY2 format
imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
break;
default:
LogError("Unknown Interop View Type");
return false;
}
break;
case 2: // YV12 format
switch (layer) {
case -1:
case 0:
break;
case 1:
// Y - plane size to the offset
// YV12 format. U is 4 times smaller plane than Y
viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
imgCreateInfo.rowPitch >>= 1;
break;
case 2:
// Y + U plane sizes to the offset.
// U plane is 4 times smaller than Y and U == V
viewOffset = 5 * imgCreateInfo.rowPitch * desc().height_ / 2;
imgCreateInfo.rowPitch >>= 1;
break;
default:
LogError("Unknown Interop View Type");
return false;
}
imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
break;
case 3: // YUY2 format
imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
break;
default:
LogError("Unknown Interop View Type");
return false;
}
imageSize = dev().iDev()->GetImageSize(imgCreateInfo, &result);
@@ -820,8 +812,7 @@ bool Resource::CreateInterop(CreateParams* params)
hwState_[10] = static_cast<uint32_t>(desc().width_);
hwState_[11] = 0; // one extra reserved field in the argument
}
}
else if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
} else if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
memRef_ = GpuMemoryReference::Create(dev(), gpuMemOpenInfo);
if (nullptr == memRef_) {
return false;
@@ -842,8 +833,7 @@ bool Resource::CreateInterop(CreateParams* params)
hwState_[9] = GetHSAILImageOrderType(desc().format_);
hwState_[10] = static_cast<uint32_t>(desc().width_);
hwState_[11] = 0; // one extra reserved field in the argument
}
else {
} else {
Pal::ExternalImageOpenInfo imgOpenInfo = {};
Pal::ImageCreateInfo imgCreateInfo = {};
imgOpenInfo.resourceInfo = openInfo;
@@ -865,14 +855,14 @@ bool Resource::CreateInterop(CreateParams* params)
viewInfo.possibleLayouts.usages = Pal::LayoutShaderWrite;
viewInfo.viewType = Pal::ImageViewType::Tex2d;
switch (imgCreateInfo.imageType) {
case Pal::ImageType::Tex3d:
viewInfo.viewType = Pal::ImageViewType::Tex3d;
break;
case Pal::ImageType::Tex1d:
viewInfo.viewType = Pal::ImageViewType::Tex1d;
break;
default:
break;
case Pal::ImageType::Tex3d:
viewInfo.viewType = Pal::ImageViewType::Tex3d;
break;
case Pal::ImageType::Tex1d:
viewInfo.viewType = Pal::ImageViewType::Tex1d;
break;
default:
break;
}
viewInfo.pImage = image_;
viewInfo.swizzledFormat.format = format;
@@ -897,14 +887,13 @@ bool Resource::CreateInterop(CreateParams* params)
//! It's a workaround for D24S8 format, since PAL doesn't support this format
//! and GSL decompresses 24bit DEPTH into D24S8 for OGL compatibility
if ((desc().format_.image_channel_order == CL_DEPTH_STENCIL) &&
(desc().format_.image_channel_data_type == CL_UNORM_INT24)) {
if (dev().settings().gfx10Plus_) {
hwState_[1] = (hwState_[1] & ~0x1ff00000) | 0x08d00000;
}
else {
hwState_[1] &= ~0x3c000000;
hwState_[1] = (hwState_[1] & ~0x3f00000) | 0x1400000;
}
(desc().format_.image_channel_data_type == CL_UNORM_INT24)) {
if (dev().settings().gfx10Plus_) {
hwState_[1] = (hwState_[1] & ~0x1ff00000) | 0x08d00000;
} else {
hwState_[1] &= ~0x3c000000;
hwState_[1] = (hwState_[1] & ~0x3f00000) | 0x1400000;
}
}
hwState_[8] = GetHSAILImageFormatType(desc().format_);
hwState_[9] = GetHSAILImageOrderType(desc().format_);
@@ -915,8 +904,7 @@ bool Resource::CreateInterop(CreateParams* params)
}
// ================================================================================================
bool Resource::CreatePinned(CreateParams* params)
{
bool Resource::CreatePinned(CreateParams* params) {
PinnedParams* pinned = reinterpret_cast<PinnedParams*>(params);
size_t allocSize = pinned->size_;
const amd::HostMemoryReference* hostMemRef = pinned->hostMemRef_;
@@ -926,7 +914,7 @@ bool Resource::CreatePinned(CreateParams* params)
if (desc().topology_ == CL_MEM_OBJECT_BUFFER) {
// Align offset to 4K boundary (Vista/Win7 limitation)
char* tmpHost = const_cast<char*>(
amd::alignDown(reinterpret_cast<const char*>(address_), PinnedMemoryAlignment));
amd::alignDown(reinterpret_cast<const char*>(address_), PinnedMemoryAlignment));
// Find the partial size for unaligned copy
hostMemOffset = static_cast<uint>(reinterpret_cast<const char*>(address_) - tmpHost);
@@ -940,18 +928,16 @@ bool Resource::CreatePinned(CreateParams* params)
}
allocSize = amd::alignUp(allocSize, PinnedMemoryAlignment);
// hostMemOffset &= ~(0xff);
}
else if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D) {
} else if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D) {
//! @todo: Width has to be aligned for 3D.
//! Need to be replaced with a compute copy
// Width aligned by 8 texels
if (((desc().width_ % 0x8) != 0) ||
// Pitch aligned by 64 bytes
(((desc().width_ * elementSize()) % 0x40) != 0)) {
// Pitch aligned by 64 bytes
(((desc().width_ * elementSize()) % 0x40) != 0)) {
return false;
}
}
else {
} else {
//! @todo GSL doesn't support pinning with resAlloc_
return false;
}
@@ -978,8 +964,7 @@ bool Resource::CreatePinned(CreateParams* params)
}
// ================================================================================================
bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
{
bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr) {
const bool isFineGrain = (memoryType() == RemoteUSWC) || (memoryType() == Remote);
size_t allocSize = amd::alignUp(desc().width_ * elementSize_,
dev().properties().gpuMemoryProperties.fragmentSize);
@@ -991,20 +976,18 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
if (svmPtr != 0) {
createInfo.flags.useReservedGpuVa = true;
createInfo.pReservedGpuVaOwner = params->svmBase_->iMem();
}
else {
} else {
createInfo.flags.useReservedGpuVa = false;
createInfo.pReservedGpuVaOwner = nullptr;
}
if (!dev().settings().svmFineGrainSystem_) {
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
createInfo.alignment, createInfo.pReservedGpuVaOwner, &subOffset_);
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
createInfo.pReservedGpuVaOwner, &subOffset_);
}
if (memRef_ == nullptr) {
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
}
}
else {
} else {
Pal::GpuMemoryCreateInfo createInfo = {};
createInfo.size = allocSize;
createInfo.alignment = MaxGpuAlignment;
@@ -1015,8 +998,8 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
createInfo.pReservedGpuVaOwner = params->svmBase_->iMem();
}
memTypeToHeap(&createInfo);
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
createInfo.alignment, createInfo.pReservedGpuVaOwner, &subOffset_);
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
createInfo.pReservedGpuVaOwner, &subOffset_);
if (memRef_ == nullptr) {
createInfo.alignment = dev().properties().gpuMemoryProperties.fragmentSize;
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
@@ -1028,9 +1011,9 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
}
desc_.cardMemory_ = false;
if ((nullptr != params) && (nullptr != params->owner_) &&
(nullptr != params->owner_->getSvmPtr())) {
(nullptr != params->owner_->getSvmPtr())) {
params->owner_->setSvmPtr(
reinterpret_cast<void*>(memRef_->iMem()->Desc().gpuVirtAddr + subOffset_));
reinterpret_cast<void*>(memRef_->iMem()->Desc().gpuVirtAddr + subOffset_));
offset_ += static_cast<size_t>(subOffset_);
}
return true;
@@ -1126,18 +1109,18 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
Pal::gpusize svmPtr = 0;
if ((nullptr != params) && (nullptr != params->owner_) &&
(nullptr != params->owner_->getSvmPtr())) {
svmPtr = reinterpret_cast<Pal::gpusize>(params->owner_->getSvmPtr());
desc_.SVMRes_ = true;
svmPtr = (svmPtr == 1) ? 0 : svmPtr;
svmPtr = reinterpret_cast<Pal::gpusize>(params->owner_->getSvmPtr());
desc_.SVMRes_ = true;
svmPtr = (svmPtr == 1) ? 0 : svmPtr;
}
if (desc_.SVMRes_) {
return CreateSvm(params, svmPtr);
return CreateSvm(params, svmPtr);
}
Pal::GpuMemoryCreateInfo createInfo = {};
createInfo.size = desc().width_ * elementSize_;
createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment);
createInfo.alignment = desc().scratch_ ? 64*Ki : MaxGpuAlignment;
createInfo.alignment = desc().scratch_ ? 64 * Ki : MaxGpuAlignment;
createInfo.vaRange = Pal::VaRange::Default;
createInfo.priority = Pal::GpuMemPriority::Normal;
@@ -1152,8 +1135,8 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
memTypeToHeap(&createInfo);
// createInfo.priority;
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
createInfo.alignment, nullptr, &subOffset_);
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
nullptr, &subOffset_);
if (nullptr == memRef_) {
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
if (nullptr == memRef_) {
@@ -1172,14 +1155,13 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
}
// ================================================================================================
void Resource::free()
{
void Resource::free() {
if (memRef_ == nullptr) {
return;
}
const bool wait =
(memoryType() != ImageView) && (memoryType() != ImageBuffer) && (memoryType() != View);
(memoryType() != ImageView) && (memoryType() != ImageBuffer) && (memoryType() != View);
// OCL has to wait, even if resource is placed in the cache, since reallocation can occur
// and resource can be reused on another async queue without a wait on a busy operation
@@ -1190,8 +1172,7 @@ void Resource::free()
for (uint idx = 1; idx < dev().vgpus().size(); ++idx) {
dev().vgpus()[idx]->waitForEvent(&events_[idx]);
}
}
else {
} else {
amd::ScopedLock l(memRef_->gpu_->execution());
memRef_->gpu_->waitForEvent(&events_[memRef_->gpu_->index()]);
}
@@ -1232,8 +1213,7 @@ void Resource::free()
// ================================================================================================
void Resource::writeRawData(VirtualGPU& gpu, size_t offset, size_t size, const void* data,
bool waitForEvent) const
{
bool waitForEvent) const {
GpuEvent event;
// Write data size bytes to surface
@@ -1242,7 +1222,7 @@ void Resource::writeRawData(VirtualGPU& gpu, size_t offset, size_t size, const v
gpu.eventBegin(MainEngine);
gpu.queue(MainEngine).addCmdMemRef(memRef());
gpu.iCmd()->CmdUpdateMemory(*iMem(), offset_ + offset, size,
reinterpret_cast<const uint32_t*>(data));
reinterpret_cast<const uint32_t*>(data));
gpu.eventEnd(MainEngine, event);
if (waitForEvent) {
@@ -1259,8 +1239,7 @@ void Resource::writeRawData(VirtualGPU& gpu, size_t offset, size_t size, const v
}
// ================================================================================================
static const Pal::ChNumFormat ChannelFmt(uint bytesPerElement)
{
static const Pal::ChNumFormat ChannelFmt(uint bytesPerElement) {
if (bytesPerElement == 16) {
return Pal::ChNumFormat::X32Y32Z32W32_Uint;
} else if (bytesPerElement == 8) {
@@ -1292,8 +1271,7 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
if (desc().buffer_ && !dstResource.desc().buffer_) {
imageOffsetx = dstOrigin[0] % dstResource.elementSize();
gpuMemoryOffset = srcOrigin[0] + offset();
gpuMemoryRowPitch =
(srcOrigin[1]) ? srcOrigin[1] : size[0] * dstResource.elementSize();
gpuMemoryRowPitch = (srcOrigin[1]) ? srcOrigin[1] : size[0] * dstResource.elementSize();
img1Darray = (dstResource.desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY);
img2Darray = (dstResource.desc().topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY);
} else if (!desc().buffer_ && dstResource.desc().buffer_) {
@@ -1374,7 +1352,8 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
}
copyRegion.gpuMemoryOffset = gpuMemoryOffset;
copyRegion.gpuMemoryRowPitch = gpuMemoryRowPitch;
copyRegion.gpuMemoryDepthPitch = (dstOrigin[2]) ? dstOrigin[2]
copyRegion.gpuMemoryDepthPitch = (dstOrigin[2])
? dstOrigin[2]
: copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height;
gpu.iCmd()->CmdCopyImageToMemory(*image_, imgLayout, *dstResource.iMem(), 1, &copyRegion);
} else {
@@ -1819,17 +1798,14 @@ void Resource::unmap(VirtualGPU* gpu) {
}
// ================================================================================================
//! Unmap counterpart for layered resources. There is no real implementation here: the body
//! only invokes Unimplemented() and never touches \p gpu.
//! NOTE(review): this span is a diff rendering, so the same definition appears twice below,
//! first in its original multi-line form and then in the reformatted one-line form.
void Resource::unmapLayers(VirtualGPU* gpu) {
Unimplemented();
}
void Resource::unmapLayers(VirtualGPU* gpu) { Unimplemented(); }
// ================================================================================================
bool MemorySubAllocator::InitAllocator(GpuMemoryReference* mem_ref) {
MemBuddyAllocator* allocator = new MemBuddyAllocator(
device_, device_->settings().subAllocationChunkSize_,
device_->settings().subAllocationMinSize_);
if (!((allocator != nullptr) &&
(allocator->Init() == Pal::Result::Success) &&
MemBuddyAllocator* allocator =
new MemBuddyAllocator(device_, device_->settings().subAllocationChunkSize_,
device_->settings().subAllocationMinSize_);
if (!((allocator != nullptr) && (allocator->Init() == Pal::Result::Success) &&
heaps_.insert({mem_ref, allocator}).second)) {
mem_ref->release();
delete allocator;
@@ -1890,8 +1866,7 @@ bool FineMemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) {
}
// ================================================================================================
MemorySubAllocator::~MemorySubAllocator()
{
MemorySubAllocator::~MemorySubAllocator() {
// Release memory heap for suballocations
for (const auto& it : heaps_) {
it.first->release();
@@ -1901,8 +1876,8 @@ MemorySubAllocator::~MemorySubAllocator()
// ================================================================================================
GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize alignment,
const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset)
{
const Pal::IGpuMemory* reserved_va,
Pal::gpusize* offset) {
GpuMemoryReference* mem_ref = nullptr;
MemBuddyAllocator* allocator = nullptr;
// Check if the resource size and alignment are allowed for suballocation
@@ -1927,7 +1902,7 @@ GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize
}
// We didn't find a valid chunk, so create a new one
if (!CreateChunk(reserved_va)) {
return nullptr;
return nullptr;
}
i++;
} while (i < 2);
@@ -1936,8 +1911,7 @@ GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize
}
// ================================================================================================
bool MemorySubAllocator::Free(amd::Monitor* monitor, GpuMemoryReference* ref, Pal::gpusize offset)
{
bool MemorySubAllocator::Free(amd::Monitor* monitor, GpuMemoryReference* ref, Pal::gpusize offset) {
bool release_mem = false;
{
amd::ScopedLock l(monitor);
@@ -1966,9 +1940,8 @@ ResourceCache::~ResourceCache() { free(); }
// ================================================================================================
//! \note the cache works in FILO mode
bool ResourceCache::addGpuMemory(Resource::Descriptor* desc,
GpuMemoryReference* ref, Pal::gpusize offset)
{
bool ResourceCache::addGpuMemory(Resource::Descriptor* desc, GpuMemoryReference* ref,
Pal::gpusize offset) {
bool result = false;
size_t size = ref->iMem()->Desc().size;
@@ -2017,7 +1990,9 @@ bool ResourceCache::addGpuMemory(Resource::Descriptor* desc,
// ================================================================================================
GpuMemoryReference* ResourceCache::findGpuMemory(Resource::Descriptor* desc, Pal::gpusize size,
Pal::gpusize alignment, const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset) {
Pal::gpusize alignment,
const Pal::IGpuMemory* reserved_va,
Pal::gpusize* offset) {
amd::ScopedLock l(&lockCacheOps_);
GpuMemoryReference* ref = nullptr;
@@ -2051,7 +2026,7 @@ GpuMemoryReference* ResourceCache::findGpuMemory(Resource::Descriptor* desc, Pal
ref = it.second;
cacheSize_ -= sizeRes;
if (entry->type_ == Resource::Local) {
lclCacheSize_ -= sizeRes;
lclCacheSize_ -= sizeRes;
}
delete it.first;
// Remove the found entry from the cache
@@ -2078,8 +2053,7 @@ bool ResourceCache::free(size_t minCacheEntries) {
}
// ================================================================================================
void ResourceCache::removeLast()
{
void ResourceCache::removeLast() {
std::pair<Resource::Descriptor*, GpuMemoryReference*> entry;
{
// Protect access to the global data
+66 -74
Просмотреть файл
@@ -41,11 +41,11 @@ class GpuMemoryReference : public amd::ReferenceCountedObject {
//! Get PAL memory object
Pal::IGpuMemory* iMem() const { return gpuMem_; }
Pal::IGpuMemory* gpuMem_; //!< PAL GPU memory object
void* cpuAddress_; //!< CPU address of this memory
const Device& device_; //!< GPU device
Pal::IGpuMemory* gpuMem_; //!< PAL GPU memory object
void* cpuAddress_; //!< CPU address of this memory
const Device& device_; //!< GPU device
//! @note: This field is necessary for the thread safe release only
VirtualGPU* gpu_; //!< Resource will be used only on this queue
VirtualGPU* gpu_; //!< Resource will be used only on this queue
protected:
//! Default destructor
@@ -186,7 +186,7 @@ class Resource : public amd::HeapObject {
//! Constructor of 1D Resource object
Resource(const Device& gpuDev, //!< GPU device object
size_t size //!< Resource size
);
);
//! Constructor of Image Resource object
Resource(const Device& gpuDev, //!< GPU device object
@@ -196,7 +196,7 @@ class Resource : public amd::HeapObject {
cl_image_format format, //!< resource format
cl_mem_object_type imageType, //!< CL image type
uint mipLevels = 1 //!< Number of mip levels
);
);
//! Destructor of the resource
virtual ~Resource();
@@ -207,7 +207,7 @@ class Resource : public amd::HeapObject {
*/
virtual bool create(MemoryType memType, //!< memory type
CreateParams* params = 0 //!< special parameters for resource allocation
);
);
/*! \brief Copies a subregion of memory from one resource to another
*
@@ -253,14 +253,13 @@ class Resource : public amd::HeapObject {
Pal::IGpuMemory* iMem() const { return memRef_->iMem(); }
//! Returns a pointer to the memory reference
GpuMemoryReference* memRef() const {return memRef_; }
GpuMemoryReference* memRef() const { return memRef_; }
//! Returns the GPU virtual address of the resource (memory base address plus resource offset)
uint64_t vmAddress() const { return iMem()->Desc().gpuVirtAddr + offset_; }
//! Returns the resource size in bytes (width * height * depth * element size)
uint64_t vmSize() const
{ return desc_.width_ * desc_.height_ * desc_.depth_ * elementSize(); }
uint64_t vmSize() const { return desc_.width_ * desc_.height_ * desc_.depth_ * elementSize(); }
//! Returns true if the resource has more than one mip level
bool mipMapped() const {
  // A descriptor with more than one mip level describes a mip chain
  return desc().mipLevels_ > 1;
}
@@ -279,11 +278,11 @@ class Resource : public amd::HeapObject {
// Optimization for multilayer map/unmap
uint startLayer = 0, //!< Start layer for multilayer map
uint numLayers = 0 //!< End layer for multilayer map
);
);
//! Unlocks the resource if it was locked
void unmap(VirtualGPU* gpu //!< Virtual GPU device object
);
);
//! Marks the resource as busy
void setBusy(VirtualGPU& gpu, //!< Virtual GPU device object
@@ -303,7 +302,7 @@ class Resource : public amd::HeapObject {
uint flags = 0, //!< Map flags
size_t rowPitch = 0, //!< Raw data row pitch
size_t slicePitch = 0 //!< Raw data slice pitch
);
);
//! Performs host read from the resource GPU memory
bool hostRead(VirtualGPU* gpu, //!< Virtual GPU device object
@@ -312,7 +311,7 @@ class Resource : public amd::HeapObject {
const amd::Coord3D& size, //!< The number of bytes to write
size_t rowPitch = 0, //!< Raw data row pitch
size_t slicePitch = 0 //!< Raw data slice pitch
);
);
//! Gets the resource element size
uint elementSize() const { return elementSize_; }
@@ -377,7 +376,7 @@ class Resource : public amd::HeapObject {
memRef_ = viewOwner_->memRef_;
memRef_->retain();
desc_.width_ = amd::alignUp(size, Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint)) /
Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
setBusy(*memRef()->gpu_, GpuEvent::InvalidID);
}
}
@@ -390,33 +389,32 @@ class Resource : public amd::HeapObject {
protected:
/*! \brief Creates a PAL image object, associated with the resource
*
* \return True if we successfully created a PAL resource
*/
bool CreateImage(CreateParams* params //!< special parameters for resource allocation
);
*
* \return True if we successfully created a PAL resource
*/
bool CreateImage(CreateParams* params //!< special parameters for resource allocation
);
/*! \brief Creates a PAL interop object, associated with the resource
*
* \return True if we successfully created a PAL interop resource
*/
bool CreateInterop(CreateParams* params //!< special parameters for resource allocation
);
*
* \return True if we successfully created a PAL interop resource
*/
bool CreateInterop(CreateParams* params //!< special parameters for resource allocation
);
/*! \brief Creates a PAL pinned object, associated with the resource
*
* \return True if we successfully created a PAL pinned resource
*/
bool CreatePinned(CreateParams* params //!< special parameters for resource allocation
);
*
* \return True if we successfully created a PAL pinned resource
*/
bool CreatePinned(CreateParams* params //!< special parameters for resource allocation
);
/*! \brief Creates a PAL SVM object, associated with the resource
*
* \return True if we successfully created a PAL SVM resource
*/
*
* \return True if we successfully created a PAL SVM resource
*/
bool CreateSvm(CreateParams* params, //!< special parameters for resource allocation
Pal::gpusize svmPtr
);
Pal::gpusize svmPtr);
uint elementSize_; //!< Size of a single element in bytes
@@ -433,11 +431,11 @@ class Resource : public amd::HeapObject {
*/
void* mapLayers(VirtualGPU* gpu, //!< Virtual GPU device object
uint flags = 0 //!< flags for the map operation
);
);
//! Unlocks the resource with layers if it was locked
void unmapLayers(VirtualGPU* gpu //!< Virtual GPU device object
);
);
//! Calls PAL to map a resource
void* gpuMemoryMap(size_t* pitch, //!< Pitch value for the image
@@ -454,7 +452,7 @@ class Resource : public amd::HeapObject {
//! Converts Resource memory type to the PAL heaps
void memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo //!< Memory create info
);
);
const Device& gpuDevice_; //!< GPU device
Descriptor desc_; //!< Descriptor for this resource
@@ -462,7 +460,7 @@ class Resource : public amd::HeapObject {
void* address_; //!< Physical address of this resource
size_t offset_; //!< Resource offset
GpuMemoryReference* memRef_; //!< PAL resource reference
Pal::gpusize subOffset_; //!< GPU memory offset in the original resource
Pal::gpusize subOffset_; //!< GPU memory offset in the original resource
const Resource* viewOwner_; //!< GPU resource, which owns this view
void* glInteropMbRes_; //!< Mb Res handle
uint32_t glType_; //!< GL interop type
@@ -485,41 +483,35 @@ class Resource : public amd::HeapObject {
typedef Util::BuddyAllocator<Device> MemBuddyAllocator;
//! Base sub-allocator: hands out suballocations from larger GPU memory chunks. Every chunk
//! (a GpuMemoryReference) is paired with its own buddy allocator inside heaps_.
//! NOTE(review): this span is a diff rendering, so the original and the reformatted spelling
//! of several declarations appear back to back; each pair declares the same member.
class MemorySubAllocator : public amd::HeapObject {
public:
public:
//! Remembers the device; the constructor body is empty, so no chunk is created here
MemorySubAllocator(Device* device) : device_(device) {}
//! Releases the memory heaps that were kept for suballocations
~MemorySubAllocator();
//! Create suballocation.
//! Returns nullptr when a new chunk is needed but CreateChunk(reserved_va) fails.
//! NOTE(review): \p offset presumably receives the position of the suballocation inside
//! the returned memory reference - confirm against the definition.
GpuMemoryReference* Allocate(Pal::gpusize size,
Pal::gpusize alignment,
const Pal::IGpuMemory* reserved_va,
Pal::gpusize* offset
);
GpuMemoryReference* Allocate(Pal::gpusize size, Pal::gpusize alignment,
const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset);
//! Free suballocation. The implementation takes \p monitor with a scoped lock while it
//! works on the allocator state.
bool Free(amd::Monitor* monitor,
GpuMemoryReference* mem_ref,
Pal::gpusize offset
);
bool Free(amd::Monitor* monitor, GpuMemoryReference* mem_ref, Pal::gpusize offset);
protected:
protected:
//! Allocate new chunk of memory; virtual so the derived allocators can override it
virtual bool CreateChunk(const Pal::IGpuMemory* reserved_va);
//! Creates a buddy allocator (sized from the device sub-allocation settings) for \p mem_ref
//! and registers the pair in heaps_. If creation, Init() or the insertion fails, the
//! reference is released and the allocator is deleted.
bool InitAllocator(GpuMemoryReference* mem_ref);
Device* device_;  //!< Device passed to the constructor
//! Maps each memory chunk to the buddy allocator that manages it
std::unordered_map<GpuMemoryReference*, MemBuddyAllocator*> heaps_;
std::unordered_map<GpuMemoryReference*, MemBuddyAllocator*> heaps_;
};
//! Sub-allocator with its own chunk creation; ResourceCache keeps an instance of it for
//! suballocations in Coarse SVM.
//! NOTE(review): the doubled access specifier is a diff-rendering artifact (old and
//! reformatted line shown together).
class CoarseMemorySubAllocator : public MemorySubAllocator {
public:
public:
//! Forwards the device to the base allocator
CoarseMemorySubAllocator(Device* device) : MemorySubAllocator(device) {}
//! Overrides the base chunk creation (the definition is outside this view)
bool CreateChunk(const Pal::IGpuMemory* reservedVa) override;
};
class FineMemorySubAllocator : public MemorySubAllocator {
public:
public:
FineMemorySubAllocator(Device* device) : MemorySubAllocator(device) {}
bool CreateChunk(const Pal::IGpuMemory* reserved_va) override;
@@ -529,29 +521,28 @@ class ResourceCache : public amd::HeapObject {
public:
//! Default constructor
ResourceCache(Device* device, size_t cacheSizeLimit)
: lockCacheOps_("PAL resource cache", true)
, cacheSize_(0)
, lclCacheSize_(0)
, cacheSizeLimit_(cacheSizeLimit)
, mem_sub_alloc_local_(device)
, mem_sub_alloc_coarse_ (device)
, mem_sub_alloc_fine_ (device) {}
: lockCacheOps_("PAL resource cache", true),
cacheSize_(0),
lclCacheSize_(0),
cacheSizeLimit_(cacheSizeLimit),
mem_sub_alloc_local_(device),
mem_sub_alloc_coarse_(device),
mem_sub_alloc_fine_(device) {}
//! Default destructor
~ResourceCache();
//! Adds a PAL resource to the cache
bool addGpuMemory(Resource::Descriptor* desc, //!< Resource descriptor - cache key
GpuMemoryReference* ref, //!< Resource reference
Pal::gpusize offset //!< Original resource offset
);
bool addGpuMemory(Resource::Descriptor* desc, //!< Resource descriptor - cache key
GpuMemoryReference* ref, //!< Resource reference
Pal::gpusize offset //!< Original resource offset
);
//! Finds a PAL resource from the cache
GpuMemoryReference* findGpuMemory(
Resource::Descriptor* desc, //!< Resource descriptor - cache key
Pal::gpusize size,
Pal::gpusize alignment,
const Pal::IGpuMemory* reserved_va, //!< Reserved VA for SVM suballocations
Pal::gpusize size, Pal::gpusize alignment,
const Pal::IGpuMemory* reserved_va, //!< Reserved VA for SVM suballocations
Pal::gpusize* offset);
//! Destroys cache
@@ -576,16 +567,17 @@ class ResourceCache : public amd::HeapObject {
amd::Monitor lockCacheOps_; //!< Lock to serialise cache access
size_t cacheSize_; //!< Current cache size in bytes
size_t lclCacheSize_; //!< Local memory stored in the cache
const size_t cacheSizeLimit_; //!< Cache size limit in bytes
size_t cacheSize_; //!< Current cache size in bytes
size_t lclCacheSize_; //!< Local memory stored in the cache
const size_t cacheSizeLimit_; //!< Cache size limit in bytes
//! PAL resource cache
std::list<std::pair<Resource::Descriptor*, GpuMemoryReference*> > resCache_;
MemorySubAllocator mem_sub_alloc_local_; //!< Allocator for suballocations in Local
CoarseMemorySubAllocator mem_sub_alloc_coarse_; //!< Allocator for suballocations in Coarse SVM
FineMemorySubAllocator mem_sub_alloc_fine_; //!< Allocator for suballocations in Fine SVM
MemorySubAllocator mem_sub_alloc_local_; //!< Allocator for suballocations in Local
CoarseMemorySubAllocator mem_sub_alloc_coarse_; //!< Allocator for suballocations in Coarse SVM
FineMemorySubAllocator mem_sub_alloc_fine_; //!< Allocator for suballocations in Fine SVM
};
/*@}*/} // namespace pal
/*@}*/ // namespace pal
} // namespace pal
+6 -6
View file
@@ -136,7 +136,7 @@ Settings::Settings() {
subAllocationMinSize_ = 4 * Ki;
subAllocationChunkSize_ = 64 * Mi;
subAllocationMaxSize_ =
std::min(static_cast<uint64_t>(GPU_MAX_SUBALLOC_SIZE) * Ki, subAllocationChunkSize_);
std::min(static_cast<uint64_t>(GPU_MAX_SUBALLOC_SIZE) * Ki, subAllocationChunkSize_);
maxCmdBuffers_ = 12;
useLightning_ = GPU_ENABLE_LC;
@@ -148,8 +148,7 @@ Settings::Settings() {
bool Settings::create(const Pal::DeviceProperties& palProp,
const Pal::GpuMemoryHeapProperties* heaps, const Pal::WorkStationCaps& wscaps,
bool reportAsOCL12Device)
{
bool reportAsOCL12Device) {
uint32_t osVer = 0x0;
// Disable thread trace by default for all devices
@@ -198,8 +197,9 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
case Pal::AsicRevision::Navi10Lite:
gfx10Plus_ = true;
useLightning_ = (!flagIsDefault(GPU_ENABLE_LC)) ? GPU_ENABLE_LC : true;
hsailExplicitXnack_ = static_cast<uint>(palProp.gpuMemoryProperties.flags.pageMigrationEnabled
|| palProp.gpuMemoryProperties.flags.iommuv2Support);
hsailExplicitXnack_ =
static_cast<uint>(palProp.gpuMemoryProperties.flags.pageMigrationEnabled ||
palProp.gpuMemoryProperties.flags.iommuv2Support);
enableWgpMode_ = GPU_ENABLE_WGP_MODE;
if (useLightning_) {
enableWave32Mode_ = true;
@@ -346,7 +346,7 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) {
splitSizeForWin7_ = true; // Update flag of DMA flush split size for Win 7
if (modifyMaxWorkload.time > 0) {
maxWorkloadTime_ = modifyMaxWorkload.time; // Update max workload time
maxWorkloadTime_ = modifyMaxWorkload.time; // Update max workload time
}
}
#endif // defined(_WIN32)
+54 -53
View file
@@ -39,63 +39,63 @@ class Settings : public device::Settings {
union {
struct {
uint remoteAlloc_ : 1; //!< Allocate remote memory for the heap
uint stagedXferRead_ : 1; //!< Uses a staged buffer read
uint stagedXferWrite_ : 1; //!< Uses a staged buffer write
uint disablePersistent_ : 1; //!< Disables using persistent memory for staging
uint imageSupport_ : 1; //!< Report images support
uint doublePrecision_ : 1; //!< Enables double precision support
uint use64BitPtr_ : 1; //!< Use 64bit pointers on GPU
uint force32BitOcl20_ : 1; //!< Force 32bit apps to take CLANG/HSAIL path on GPU
uint imageDMA_ : 1; //!< Enable direct image DMA transfers
uint viPlus_ : 1; //!< VI and post VI features
uint aiPlus_ : 1; //!< AI and post AI features
uint gfx10Plus_ : 1; //!< gfx10 and post gfx10 features
uint threadTraceEnable_ : 1; //!< Thread trace enable
uint linearPersistentImage_ : 1; //!< Allocates linear images in persistent
uint useSingleScratch_ : 1; //!< Allocates single scratch per device
uint svmAtomics_ : 1; //!< SVM device atomics
uint svmFineGrainSystem_ : 1; //!< SVM fine grain system support
uint useDeviceQueue_ : 1; //!< Submit to separate device queue
uint sdamPageFaultWar_ : 1; //!< SDMA page fault workaround
uint rgpSqttWaitIdle_: 1; //!< Wait for idle after SQTT trace
uint rgpSqttForceDisable_: 1; //!< Disables SQTT
uint splitSizeForWin7_: 1; //!< DMA flush split size for Win 7
uint remoteAlloc_ : 1; //!< Allocate remote memory for the heap
uint stagedXferRead_ : 1; //!< Uses a staged buffer read
uint stagedXferWrite_ : 1; //!< Uses a staged buffer write
uint disablePersistent_ : 1; //!< Disables using persistent memory for staging
uint imageSupport_ : 1; //!< Report images support
uint doublePrecision_ : 1; //!< Enables double precision support
uint use64BitPtr_ : 1; //!< Use 64bit pointers on GPU
uint force32BitOcl20_ : 1; //!< Force 32bit apps to take CLANG/HSAIL path on GPU
uint imageDMA_ : 1; //!< Enable direct image DMA transfers
uint viPlus_ : 1; //!< VI and post VI features
uint aiPlus_ : 1; //!< AI and post AI features
uint gfx10Plus_ : 1; //!< gfx10 and post gfx10 features
uint threadTraceEnable_ : 1; //!< Thread trace enable
uint linearPersistentImage_ : 1; //!< Allocates linear images in persistent
uint useSingleScratch_ : 1; //!< Allocates single scratch per device
uint svmAtomics_ : 1; //!< SVM device atomics
uint svmFineGrainSystem_ : 1; //!< SVM fine grain system support
uint useDeviceQueue_ : 1; //!< Submit to separate device queue
uint sdamPageFaultWar_ : 1; //!< SDMA page fault workaround
uint rgpSqttWaitIdle_ : 1; //!< Wait for idle after SQTT trace
uint rgpSqttForceDisable_ : 1; //!< Disables SQTT
uint splitSizeForWin7_ : 1; //!< DMA flush split size for Win 7
uint reserved_ : 11;
};
uint value_;
};
uint oclVersion_; //!< Reported OpenCL version support
uint debugFlags_; //!< Debug GPU flags
uint hwLDSSize_; //!< HW local data store size
uint maxWorkGroupSize_; //!< Requested workgroup size for this device
uint preferredWorkGroupSize_;//!< Requested preferred workgroup size for this device
uint workloadSplitSize_; //!< Workload split size
uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms
uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms
uint blitEngine_; //!< Blit engine type
uint cacheLineSize_; //!< Cache line size in bytes
uint cacheSize_; //!< L1 cache size in bytes
uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings
uint numDeviceEvents_; //!< The number of device events
uint numWaitEvents_; //!< The number of wait events for device enqueue
uint hostMemDirectAccess_; //!< Enables direct access to the host memory
uint numScratchWavesPerCu_; //!< Maximum number of waves when scratch is enabled
size_t xferBufSize_; //!< Transfer buffer size for image copy optimization
size_t stagedXferSize_; //!< Staged buffer size
size_t pinnedXferSize_; //!< Pinned buffer size for transfer
size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
size_t resourceCacheSize_; //!< Resource cache size in MB
size_t numMemDependencies_; //!< The array size for memory dependencies tracking
uint64_t maxAllocSize_; //!< Maximum single allocation size
uint rgpSqttDispCount_; //!< The number of dispatches captured in SQTT
uint maxCmdBuffers_; //!< Maximum number of command buffers allocated per queue
uint oclVersion_; //!< Reported OpenCL version support
uint debugFlags_; //!< Debug GPU flags
uint hwLDSSize_; //!< HW local data store size
uint maxWorkGroupSize_; //!< Requested workgroup size for this device
uint preferredWorkGroupSize_; //!< Requested preferred workgroup size for this device
uint workloadSplitSize_; //!< Workload split size
uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms
uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms
uint blitEngine_; //!< Blit engine type
uint cacheLineSize_; //!< Cache line size in bytes
uint cacheSize_; //!< L1 cache size in bytes
uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings
uint numDeviceEvents_; //!< The number of device events
uint numWaitEvents_; //!< The number of wait events for device enqueue
uint hostMemDirectAccess_; //!< Enables direct access to the host memory
uint numScratchWavesPerCu_; //!< Maximum number of waves when scratch is enabled
size_t xferBufSize_; //!< Transfer buffer size for image copy optimization
size_t stagedXferSize_; //!< Staged buffer size
size_t pinnedXferSize_; //!< Pinned buffer size for transfer
size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
size_t resourceCacheSize_; //!< Resource cache size in MB
size_t numMemDependencies_; //!< The array size for memory dependencies tracking
uint64_t maxAllocSize_; //!< Maximum single allocation size
uint rgpSqttDispCount_; //!< The number of dispatches captured in SQTT
uint maxCmdBuffers_; //!< Maximum number of command buffers allocated per queue
uint64_t subAllocationMinSize_; //!< Minimum size allowed for suballocations
uint64_t subAllocationMaxSize_; //!< Maximum size allowed with suballocations
uint64_t subAllocationChunkSize_; //!< Chunk size for suballocations
uint64_t subAllocationMinSize_; //!< Minimum size allowed for suballocations
uint64_t subAllocationMaxSize_; //!< Maximum size allowed with suballocations
uint64_t subAllocationChunkSize_; //!< Chunk size for suballocations
amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler
//! Default constructor
@@ -106,7 +106,7 @@ class Settings : public device::Settings {
const Pal::GpuMemoryHeapProperties* heaps, //!< PAL heap settings
const Pal::WorkStationCaps& wscaps, //!< PAL workstation settings
bool reportAsOCL12Device = false //!< Report As OpenCL1.2 Device
);
);
private:
//! Disable copy constructor
@@ -119,4 +119,5 @@ class Settings : public device::Settings {
void override();
};
/*@}*/} // namespace pal
/*@}*/ // namespace pal
} // namespace pal
+3 -2
View file
@@ -40,7 +40,7 @@ class TimeStamp : public amd::HeapObject {
Pal::IGpuMemory* iMem, //!< Buffer with the timer values
uint memOffset, //!< Offset in the buffer for the current TS
address cpuAddr //!< CPU pointer for the values in memory
);
);
//! Default destructor
~TimeStamp();
@@ -114,4 +114,5 @@ class TimeStampCache : public amd::HeapObject {
uint tsOffset_; //!< Active offset in the current mem object
};
/*@}*/} // namespace pal
/*@}*/ // namespace pal
} // namespace pal
+208 -235
View file
@@ -70,8 +70,7 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueTy
if (qCreateInfo.engineType == Pal::EngineTypeExclusiveCompute) {
if (it != gpu.dev().exclusiveComputeEnginesId().end()) {
qCreateInfo.engineIndex = it->second;
}
else {
} else {
return nullptr;
}
}
@@ -97,8 +96,8 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueTy
}
size_t allocSize = qSize + max_command_buffers * (cmdSize + fSize);
VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(gpu, palDev,
residency_limit, max_command_buffers);
VirtualGPU::Queue* queue =
new (allocSize) VirtualGPU::Queue(gpu, palDev, residency_limit, max_command_buffers);
if (queue != nullptr) {
address addrQ = reinterpret_cast<address>(&queue[1]);
// Create PAL queue object
@@ -163,16 +162,16 @@ VirtualGPU::Queue::~Queue() {
}
}
// Forwards the application name and its full path to iQueue_->UpdateAppPowerProfile() and returns
// that call's result. The name is the part of the path after the last '\\' separator, or the
// whole path string when no separator is present.
Pal::Result VirtualGPU::Queue::UpdateAppPowerProfile()
{
std::wstring wsAppPathAndFileName = Device::appProfile()->wsAppPathAndFileName();
Pal::Result VirtualGPU::Queue::UpdateAppPowerProfile() {
std::wstring wsAppPathAndFileName = Device::appProfile()->wsAppPathAndFileName();
const wchar_t* wAppPathAndName = wsAppPathAndFileName.c_str();
// Find the last occurrence of the '\\' character and extract the name of the application as wide char.
const wchar_t* wAppNamePtr = wcsrchr(wAppPathAndName, '\\');
const wchar_t* wAppName = wAppNamePtr ? wAppNamePtr + 1 : wAppPathAndName;
const wchar_t* wAppPathAndName = wsAppPathAndFileName.c_str();
// Find the last occurrence of the '\\' character and extract the name of the application as wide
// char.
const wchar_t* wAppNamePtr = wcsrchr(wAppPathAndName, '\\');
const wchar_t* wAppName = wAppNamePtr ? wAppNamePtr + 1 : wAppPathAndName;
return iQueue_->UpdateAppPowerProfile(wAppName, wAppPathAndName);
return iQueue_->UpdateAppPowerProfile(wAppName, wAppPathAndName);
}
void VirtualGPU::Queue::addCmdMemRef(GpuMemoryReference* mem) {
@@ -188,8 +187,7 @@ void VirtualGPU::Queue::addCmdMemRef(GpuMemoryReference* mem) {
memRef.pGpuMemory = iMem;
palMemRefs_.push_back(memRef);
// Check SDI memory object
if (iMem->Desc().flags.isExternPhys &&
(sdiReferences_.find(iMem) == sdiReferences_.end())) {
if (iMem->Desc().flags.isExternPhys && (sdiReferences_.find(iMem) == sdiReferences_.end())) {
sdiReferences_.insert(iMem);
palSdiRefs_.push_back(iMem);
}
@@ -268,8 +266,7 @@ bool VirtualGPU::Queue::flush() {
// Submit command buffer to OS
Pal::Result result;
if (gpu_.rgpCaptureEna()) {
result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit(
iQueue_, cmdBufIdCurrent_, submitInfo);
result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit(iQueue_, cmdBufIdCurrent_, submitInfo);
} else {
result = iQueue_->Submit(submitInfo);
}
@@ -383,28 +380,28 @@ void VirtualGPU::Queue::DumpMemoryReferences() const {
if (dump.is_open()) {
dump << start << " Queue: ";
switch (iQueue_->Type()) {
case Pal::QueueTypeCompute:
dump << "Compute";
break;
case Pal::QueueTypeDma:
dump << "SDMA";
break;
default:
dump << "unknown";
break;
case Pal::QueueTypeCompute:
dump << "Compute";
break;
case Pal::QueueTypeDma:
dump << "SDMA";
break;
default:
dump << "unknown";
break;
}
dump << "\n"
<< "Resident memory resources:\n";
<< "Resident memory resources:\n";
uint idx = 0;
for (auto it : memReferences_) {
dump << " " << idx << "\t[";
dump.setf(std::ios::hex, std::ios::basefield);
dump.setf(std::ios::showbase);
dump << (it.first)->iMem()->Desc().gpuVirtAddr << ", "
<< (it.first)->iMem()->Desc().gpuVirtAddr + (it.first)->iMem()->Desc().size;
<< (it.first)->iMem()->Desc().gpuVirtAddr + (it.first)->iMem()->Desc().size;
dump.setf(std::ios::dec);
dump << "] CbId:" << it.second <<
", Heap: " << (it.first)->iMem()->Desc().preferredHeap << "\n";
dump << "] CbId:" << it.second << ", Heap: " << (it.first)->iMem()->Desc().preferredHeap
<< "\n";
idx++;
}
@@ -414,8 +411,7 @@ void VirtualGPU::Queue::DumpMemoryReferences() const {
for (size_t i = 0; i < signature.numParameters(); ++i) {
const amd::KernelParameterDescriptor& desc = signature.at(i);
// Find if the current argument is a memory object
if ((desc.type_ == T_POINTER) &&
(desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) {
if ((desc.type_ == T_POINTER) && (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) {
dump << " " << desc.name_ << ": " << std::endl;
}
}
@@ -519,7 +515,7 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
// note: The array growth shouldn't occur under the normal conditions,
// but in a case when SVM path sends the amount of SVM ptrs over
// the max size of kernel arguments
MemoryState* ptr = new MemoryState[maxMemObjectsInQueue_ << 1];
MemoryState* ptr = new MemoryState[maxMemObjectsInQueue_ << 1];
if (nullptr == ptr) {
numMemObjectsInQueue_ = 0;
return;
@@ -527,7 +523,7 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
maxMemObjectsInQueue_ <<= 1;
memcpy(ptr, memObjectsInQueue_, sizeof(MemoryState) * numMemObjectsInQueue_);
delete[] memObjectsInQueue_;
memObjectsInQueue_= ptr;
memObjectsInQueue_ = ptr;
}
// Adjust the number of active objects
@@ -748,7 +744,6 @@ VirtualGPU::VirtualGPU(Device& device)
maskGroups_(1),
hsaQueueMem_(nullptr),
cmdAllocator_(nullptr) {
// Note: Virtual GPU device creation must be a thread safe operation
index_ = gpuDevice_.numOfVgpus_++;
gpuDevice_.vgpus_.resize(gpuDevice_.numOfVgpus());
@@ -780,8 +775,8 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
createInfo.flags.autoMemoryReuse = false;
createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
createInfo.allocInfo[Pal::CommandDataAlloc].allocSize =
createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize =
VirtualGPU::Queue::MaxCommands * (320 + ((profiling) ? 96 : 0));
createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize =
VirtualGPU::Queue::MaxCommands * (320 + ((profiling) ? 96 : 0));
createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocSize = 64 * Ki;
@@ -803,8 +798,9 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
const uint firstQueue = (dev().numComputeEngines() > 2) ? 1 : 0;
uint idx = index() % (dev().numComputeEngines() - firstQueue);
uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs ? 0 :
(dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs
? 0
: (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
uint max_cmd_buffers = dev().settings().maxCmdBuffers_;
if (dev().numComputeEngines()) {
@@ -815,9 +811,9 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
// hwRing_ should be set 0 if forced to have single scratch buffer
hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx;
queues_[MainEngine] = Queue::Create(*this, Pal::QueueTypeCompute, idx + firstQueue,
cmdAllocator_, rtCUs, priority,
residency_limit, max_cmd_buffers);
queues_[MainEngine] =
Queue::Create(*this, Pal::QueueTypeCompute, idx + firstQueue, cmdAllocator_, rtCUs,
priority, residency_limit, max_cmd_buffers);
if (nullptr == queues_[MainEngine]) {
return false;
}
@@ -832,20 +828,19 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
sdma = 1;
}
queues_[SdmaEngine] =
Queue::Create(*this, Pal::QueueTypeDma, sdma, cmdAllocator_,
amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal,
residency_limit, max_cmd_buffers);
queues_[SdmaEngine] = Queue::Create(
*this, Pal::QueueTypeDma, sdma, cmdAllocator_, amd::CommandQueue::RealTimeDisabled,
amd::CommandQueue::Priority::Normal, residency_limit, max_cmd_buffers);
if (nullptr == queues_[SdmaEngine]) {
return false;
}
} else {
queues_[SdmaEngine] = Queue::Create(*this, Pal::QueueTypeCompute,
idx, cmdAllocator_, rtCUs, amd::CommandQueue::Priority::Normal,
residency_limit, max_cmd_buffers);
if (nullptr == queues_[SdmaEngine]) {
return false;
}
queues_[SdmaEngine] =
Queue::Create(*this, Pal::QueueTypeCompute, idx, cmdAllocator_, rtCUs,
amd::CommandQueue::Priority::Normal, residency_limit, max_cmd_buffers);
if (nullptr == queues_[SdmaEngine]) {
return false;
}
}
} else {
Unimplemented();
@@ -921,7 +916,8 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
bool dbg_vmid = false;
state_.rgpCaptureEnabled_ = true;
dev().rgpCaptureMgr()->RegisterTimedQueue(2 * index(), queue(MainEngine).iQueue_, &dbg_vmid);
dev().rgpCaptureMgr()->RegisterTimedQueue(2 * index() + 1, queue(SdmaEngine).iQueue_, &dbg_vmid);
dev().rgpCaptureMgr()->RegisterTimedQueue(2 * index() + 1, queue(SdmaEngine).iQueue_,
&dbg_vmid);
}
return true;
@@ -1511,99 +1507,99 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd) {
void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) {
bool unmapMip = false;
amd::Image* amdImage;
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());
amd::Memory* owner = memory->owner();
const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
if (nullptr == writeMapInfo) {
LogError("Unmap without map call");
return;
}
profilingBegin(vcmd, true);
// Check if image is a mipmap and assign a saved view
amdImage = owner->asImage();
if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) &&
(writeMapInfo->baseMip_ != nullptr)) {
// Assign mip level view
amdImage = writeMapInfo->baseMip_;
// Clear unmap flags from the parent image
memory->clearUnmapInfo(vcmd.mapPtr());
memory = dev().getGpuMemory(amdImage);
unmapMip = true;
writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
}
// We used host memory
if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) {
if (writeMapInfo->isUnmapWrite()) {
// Target is the backing store, so sync
owner->signalWrite(nullptr);
memory->syncCacheFromHost(*this);
pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());
amd::Memory* owner = memory->owner();
const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
if (nullptr == writeMapInfo) {
LogError("Unmap without map call");
return;
}
// Remove memory from VA cache
dev().removeVACache(memory);
}
// data check was added for persistent memory that failed to get aperture
// and therefore are treated like a remote resource
else if (memory->isPersistentDirectMap() && (memory->data() != nullptr)) {
memory->unmap(this);
} else if (memory->mapMemory() != nullptr) {
if (writeMapInfo->isUnmapWrite()) {
amd::Coord3D srcOrigin(0, 0, 0);
// Target is a remote resource, so copy
assert(memory->mapMemory() != nullptr);
if (memory->desc().buffer_) {
if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_,
writeMapInfo->origin_, writeMapInfo->region_,
writeMapInfo->isEntire())) {
LogError("submitUnmapMemory() - copy failed");
vcmd.setStatus(CL_OUT_OF_RESOURCES);
}
} else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
Memory* memoryBuf = memory;
amd::Coord3D origin(writeMapInfo->origin_[0]);
amd::Coord3D size(writeMapInfo->region_[0]);
size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize();
origin.c[0] *= elemSize;
size.c[0] *= elemSize;
profilingBegin(vcmd, true);
amd::Memory* bufferFromImage = createBufferFromImage(vcmd.memory());
if (nullptr == bufferFromImage) {
LogError("We should not fail buffer creation from image_buffer!");
// Check if image is a mipmap and assign a saved view
amdImage = owner->asImage();
if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) &&
(writeMapInfo->baseMip_ != nullptr)) {
// Assign mip level view
amdImage = writeMapInfo->baseMip_;
// Clear unmap flags from the parent image
memory->clearUnmapInfo(vcmd.mapPtr());
memory = dev().getGpuMemory(amdImage);
unmapMip = true;
writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
}
// We used host memory
if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) {
if (writeMapInfo->isUnmapWrite()) {
// Target is the backing store, so sync
owner->signalWrite(nullptr);
memory->syncCacheFromHost(*this);
}
// Remove memory from VA cache
dev().removeVACache(memory);
}
// data check was added for persistent memory that failed to get aperture
// and therefore are treated like a remote resource
else if (memory->isPersistentDirectMap() && (memory->data() != nullptr)) {
memory->unmap(this);
} else if (memory->mapMemory() != nullptr) {
if (writeMapInfo->isUnmapWrite()) {
amd::Coord3D srcOrigin(0, 0, 0);
// Target is a remote resource, so copy
assert(memory->mapMemory() != nullptr);
if (memory->desc().buffer_) {
if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_,
writeMapInfo->origin_, writeMapInfo->region_,
writeMapInfo->isEntire())) {
LogError("submitUnmapMemory() - copy failed");
vcmd.setStatus(CL_OUT_OF_RESOURCES);
}
} else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
Memory* memoryBuf = memory;
amd::Coord3D origin(writeMapInfo->origin_[0]);
amd::Coord3D size(writeMapInfo->region_[0]);
size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize();
origin.c[0] *= elemSize;
size.c[0] *= elemSize;
amd::Memory* bufferFromImage = createBufferFromImage(vcmd.memory());
if (nullptr == bufferFromImage) {
LogError("We should not fail buffer creation from image_buffer!");
} else {
memoryBuf = dev().getGpuMemory(bufferFromImage);
}
if (!blitMgr().copyBuffer(*memory->mapMemory(), *memoryBuf, srcOrigin, origin, size,
writeMapInfo->isEntire())) {
LogError("submitUnmapMemory() - copy failed");
vcmd.setStatus(CL_OUT_OF_RESOURCES);
}
if (nullptr != bufferFromImage) {
bufferFromImage->release();
}
} else {
memoryBuf = dev().getGpuMemory(bufferFromImage);
}
if (!blitMgr().copyBuffer(*memory->mapMemory(), *memoryBuf, srcOrigin, origin, size,
writeMapInfo->isEntire())) {
LogError("submitUnmapMemory() - copy failed");
vcmd.setStatus(CL_OUT_OF_RESOURCES);
}
if (nullptr != bufferFromImage) {
bufferFromImage->release();
}
} else {
if (!blitMgr().copyBufferToImage(*memory->mapMemory(), *memory, srcOrigin,
writeMapInfo->origin_, writeMapInfo->region_,
writeMapInfo->isEntire())) {
LogError("submitUnmapMemory() - copy failed");
vcmd.setStatus(CL_OUT_OF_RESOURCES);
if (!blitMgr().copyBufferToImage(*memory->mapMemory(), *memory, srcOrigin,
writeMapInfo->origin_, writeMapInfo->region_,
writeMapInfo->isEntire())) {
LogError("submitUnmapMemory() - copy failed");
vcmd.setStatus(CL_OUT_OF_RESOURCES);
}
}
}
} else {
LogError("Unhandled unmap!");
vcmd.setStatus(CL_INVALID_VALUE);
}
} else {
LogError("Unhandled unmap!");
vcmd.setStatus(CL_INVALID_VALUE);
// Clear unmap flags
memory->clearUnmapInfo(vcmd.mapPtr());
profilingEnd(vcmd);
}
// Clear unmap flags
memory->clearUnmapInfo(vcmd.mapPtr());
profilingEnd(vcmd);
}
// Release a view for a mipmap map
if (unmapMip) {
// Memory release should be outside of the execution lock,
@@ -1700,9 +1696,9 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
profilingBegin(cmd);
Memory* srcDevMem = static_cast<pal::Memory*>(
cmd.source().getDeviceMemory(*cmd.source().getContext().devices()[0]));
cmd.source().getDeviceMemory(*cmd.source().getContext().devices()[0]));
Memory* dstDevMem = static_cast<pal::Memory*>(
cmd.destination().getDeviceMemory(*cmd.destination().getContext().devices()[0]));
cmd.destination().getDeviceMemory(*cmd.destination().getContext().devices()[0]));
bool p2pAllowed = false;
#if 0
@@ -1728,16 +1724,15 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
amd::Coord3D dstOrigin(cmd.dstOrigin()[0]);
if (p2pAllowed) {
result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin,
size, cmd.isEntireMemory());
}
else {
result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size,
cmd.isEntireMemory());
} else {
amd::ScopedLock lock(dev().P2PStageOps());
Memory* dstStgMem = static_cast<pal::Memory*>(
dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0]));
dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0]));
Memory* srcStgMem = static_cast<pal::Memory*>(
dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0]));
dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0]));
size_t copy_size = Device::kP2PStagingSize;
size_t left_size = size[0];
amd::Coord3D stageOffset(0);
@@ -1750,11 +1745,11 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
amd::Coord3D cpSize(copy_size);
// Perform 2 step transfer with staging buffer
result &= dev().xferMgr().copyBuffer(
*srcDevMem, *dstStgMem, srcOrigin, stageOffset, cpSize);
result &=
dev().xferMgr().copyBuffer(*srcDevMem, *dstStgMem, srcOrigin, stageOffset, cpSize);
srcOrigin.c[0] += copy_size;
result &= dstDevMem->dev().xferMgr().copyBuffer(
*srcStgMem, *dstDevMem, stageOffset, dstOrigin, cpSize);
result &= dstDevMem->dev().xferMgr().copyBuffer(*srcStgMem, *dstDevMem, stageOffset,
dstOrigin, cpSize);
dstOrigin.c[0] += copy_size;
} while (left_size > 0);
}
@@ -1940,10 +1935,8 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) {
}
// ================================================================================================
void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQueue)
{
AmdAqlWrap* wraps =
(AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]);
void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQueue) {
AmdAqlWrap* wraps = (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]);
uint p = 0;
for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) {
if (wraps[i].state != 0) {
@@ -1963,11 +1956,9 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
print << "\twait_list: " << wraps[i].wait_list << "\n";
print << "\twait_num: " << wraps[i].wait_num << "\n";
uint offsEvents = wraps[i].wait_list - gpuDefQueue->virtualQueue_->vmAddress();
size_t* events =
reinterpret_cast<size_t*>(gpuDefQueue->virtualQueue_->data() + offsEvents);
size_t* events = reinterpret_cast<size_t*>(gpuDefQueue->virtualQueue_->data() + offsEvents);
for (j = 0; j < wraps[i].wait_num; ++j) {
uint offs =
static_cast<uint64_t>(events[j]) - gpuDefQueue->virtualQueue_->vmAddress();
uint offs = static_cast<uint64_t>(events[j]) - gpuDefQueue->virtualQueue_->vmAddress();
AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs);
print << "Wait Event#: " << j << "\n";
print << "\tState: " << eventD->state << "; Counter: " << eventD->counter << "\n";
@@ -1980,8 +1971,8 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
print << wraps[i].aql.grid_size_z << "]\n";
HSAILKernel* child = nullptr;
for (auto it = hsaKernel.prog().kernels().begin();
it != hsaKernel.prog().kernels().end(); ++it) {
for (auto it = hsaKernel.prog().kernels().begin(); it != hsaKernel.prog().kernels().end();
++it) {
if (wraps[i].aql.kernel_object == static_cast<HSAILKernel*>(it->second)->gpuAqlCode()) {
child = static_cast<HSAILKernel*>(it->second);
}
@@ -1995,7 +1986,7 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
uint offsArg = kernarg_address - gpuDefQueue->virtualQueue_->vmAddress();
address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
print << "Kernel: " << child->name() << "\n";
const amd::KernelSignature& signature = child->signature();
const amd::KernelSignature& signature = child->signature();
// Check if runtime has to setup hidden arguments
for (const auto it : signature.parameters()) {
@@ -2033,7 +2024,7 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
continue;
}
print << "\t" << it.name_ << ": ";
for (int s = it.size_- 1; s >= 0; --s) {
for (int s = it.size_ - 1; s >= 0; --s) {
print.width(2);
print.fill('0');
print << static_cast<uint32_t>(argum[s]);
@@ -2047,26 +2038,20 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
}
// ================================================================================================
bool VirtualGPU::PreDeviceEnqueue(
const amd::Kernel& kernel,
const HSAILKernel& hsaKernel,
VirtualGPU** gpuDefQueue,
uint64_t* vmDefQueue)
{
bool VirtualGPU::PreDeviceEnqueue(const amd::Kernel& kernel, const HSAILKernel& hsaKernel,
VirtualGPU** gpuDefQueue, uint64_t* vmDefQueue) {
amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());
if (nullptr == defQueue) {
LogError("Default device queue wasn't allocated");
return false;
}
else {
} else {
if (dev().settings().useDeviceQueue_) {
*gpuDefQueue = static_cast<VirtualGPU*>(defQueue->vDev());
if ((*gpuDefQueue)->hwRing() == hwRing()) {
LogError("Can't submit the child kernels to the same HW ring as the host queue!");
return false;
}
}
else {
} else {
createVirtualQueue(defQueue->size());
*gpuDefQueue = this;
}
@@ -2086,15 +2071,10 @@ bool VirtualGPU::PreDeviceEnqueue(
}
// ================================================================================================
void VirtualGPU::PostDeviceEnqueue(
const amd::Kernel& kernel,
const HSAILKernel& hsaKernel,
VirtualGPU* gpuDefQueue,
uint64_t vmDefQueue,
uint64_t vmParentWrap,
GpuEvent* gpuEvent)
{
uint32_t id = gpuEvent->id_;
void VirtualGPU::PostDeviceEnqueue(const amd::Kernel& kernel, const HSAILKernel& hsaKernel,
VirtualGPU* gpuDefQueue, uint64_t vmDefQueue,
uint64_t vmParentWrap, GpuEvent* gpuEvent) {
uint32_t id = gpuEvent->id_;
amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());
// Ensure exclusive access to the device queue
@@ -2110,16 +2090,16 @@ void VirtualGPU::PostDeviceEnqueue(
// Add the termination handshake to the host queue
eventBegin(MainEngine);
iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
vmParentWrap + offsetof(AmdAqlWrap, child_counter), 0,
dev().settings().useDeviceQueue_);
vmParentWrap + offsetof(AmdAqlWrap, child_counter), 0,
dev().settings().useDeviceQueue_);
eventEnd(MainEngine, *gpuEvent);
}
// Get the global loop start before the scheduler
Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr())
.runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0,
gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
.runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0,
gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
const static bool FlushL2 = true;
gpuDefQueue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue, FlushL2);
@@ -2127,8 +2107,7 @@ void VirtualGPU::PostDeviceEnqueue(
//! @note DMA flush must not occur between patch and the scheduler
Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
// Program parameters for the scheduler
SchedulerParam* param = reinterpret_cast<SchedulerParam*>(
gpuDefQueue->schedParams_->data());
SchedulerParam* param = reinterpret_cast<SchedulerParam*>(gpuDefQueue->schedParams_->data());
param->signal = 1;
// Scale clock to 1024 to avoid 64 bit div in the scheduler
param->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_;
@@ -2147,8 +2126,7 @@ void VirtualGPU::PostDeviceEnqueue(
param->numMaxWaves = 32 * dev().info().maxComputeUnits_;
param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_;
addVmMemory(scratchBuf);
}
else {
} else {
param->numMaxWaves = 0;
param->scratchSize = 0;
param->scratch = 0;
@@ -2162,8 +2140,8 @@ void VirtualGPU::PostDeviceEnqueue(
Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress();
gpuDefQueue->eventBegin(MainEngine);
gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd(
signalAddr, loopStart,
gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
signalAddr, loopStart,
gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
// Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call.
// Thus TS command for profiling has to follow in the next CB.
constexpr bool ForceSubmitFirst = true;
@@ -2173,10 +2151,10 @@ void VirtualGPU::PostDeviceEnqueue(
// Add the termination handshake to the host queue
eventBegin(MainEngine);
iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
vmParentWrap + offsetof(AmdAqlWrap, child_counter),
signalAddr, dev().settings().useDeviceQueue_);
vmParentWrap + offsetof(AmdAqlWrap, child_counter), signalAddr,
dev().settings().useDeviceQueue_);
if (id != gpuEvent->id_) {
LogError("Something is wrong. ID mismatch!\n");
LogError("Something is wrong. ID mismatch!\n");
}
eventEnd(MainEngine, *gpuEvent);
}
@@ -2193,7 +2171,8 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
profilingBegin(vcmd);
// Submit kernel to HW
if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(), vcmd.sharedMemBytes())) {
if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(),
vcmd.sharedMemBytes())) {
vcmd.setStatus(CL_INVALID_OPERATION);
}
@@ -2203,10 +2182,9 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
// ================================================================================================
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
const_address parameters, bool nativeMem,
amd::Event* enqueueEvent, uint32_t sharedMemBytes)
{
size_t newOffset[3] = { 0, 0, 0 };
size_t newGlobalSize[3] = { 0, 0, 0 };
amd::Event* enqueueEvent, uint32_t sharedMemBytes) {
size_t newOffset[3] = {0, 0, 0};
size_t newGlobalSize[3] = {0, 0, 0};
int dim = -1;
int iteration = 1;
@@ -2221,17 +2199,17 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
// If RGP capturing is enabled, then start SQTT trace
if (rgpCaptureEna()) {
size_t newLocalSize[3] = { 1, 1, 1 };
size_t newLocalSize[3] = {1, 1, 1};
for (uint i = 0; i < sizes.dimensions(); i++) {
if (sizes.local()[i] != 0) {
newLocalSize[i] = sizes.local()[i];
}
}
dev().rgpCaptureMgr()->PreDispatch(this, hsaKernel,
// Report global size in workgroups, since that's the RGP trace semantics
newGlobalSize[0] / newLocalSize[0],
newGlobalSize[1] / newLocalSize[1],
newGlobalSize[2] / newLocalSize[2]);
dev().rgpCaptureMgr()->PreDispatch(
this, hsaKernel,
// Report global size in workgroups, since that's the RGP trace semantics
newGlobalSize[0] / newLocalSize[0], newGlobalSize[1] / newLocalSize[1],
newGlobalSize[2] / newLocalSize[2]);
}
bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? true : false;
@@ -2257,8 +2235,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
// Check memory dependency and SVM objects
if (!processMemObjectsHSA(kernel, parameters, nativeMem, ldsSize)) {
LogError("Wrong memory objects!");
return false;
LogError("Wrong memory objects!");
return false;
}
bool needFlush = false;
// Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd
@@ -2305,15 +2283,14 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
// an extra loop is required.
const amd::KernelParameters& kernelParams = kernel.parameters();
amd::Memory* const* memories =
reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
for (uint32_t i = 0; i < kernel.signature().numMemories(); ++i) {
if (nativeMem) {
Memory* gpuMem = reinterpret_cast<Memory* const*>(memories)[i];
if (gpuMem != nullptr) {
gpuMem->setBusy(*this, gpuEvent);
}
}
else {
} else {
amd::Memory* mem = memories[i];
if (mem != nullptr) {
dev().getGpuMemory(mem)->setBusy(*this, gpuEvent);
@@ -2325,7 +2302,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
uint64_t vmParentWrap = 0;
// Program the kernel arguments for the GPU execution
hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
*this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap);
*this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap);
if (nullptr == aqlPkt) {
LogError("Couldn't load kernel arguments");
return false;
@@ -2348,8 +2325,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
}
dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ?
enqueueEvent->profilingInfo().waves_ : 0;
dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0;
dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
dispatchParam.workitemPrivateSegmentSize = hsaKernel.spillSegSize();
dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
@@ -2660,7 +2636,6 @@ void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) {
eventEnd(MainEngine, gpuEvent);
} else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) {
EngineType activeEngineID = engineID_;
engineID_ = static_cast<EngineType>(pGpuMemory->getGpuEvent(*this)->engineId_);
@@ -2669,8 +2644,8 @@ void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) {
addBarrier(RgpSqqtBarrierReason::SignalSubmit, FlushL2);
// Workarounds: We had systems where an extra delay was necessary.
{
// Flush CB associated with the DGMA buffer
isDone(pGpuMemory->getGpuEvent(*this));
// Flush CB associated with the DGMA buffer
isDone(pGpuMemory->getGpuEvent(*this));
}
eventBegin(engineID_);
@@ -2711,10 +2686,11 @@ void VirtualGPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd
pGpuMems[i] = pGpuMemory->iMem();
}
dev().iDev()->AddGpuMemoryReferences(numObjects, pGpuMemRef, queues_[MainEngine]->iQueue_, Pal::GpuMemoryRefCantTrim);
dev().iDev()->AddGpuMemoryReferences(numObjects, pGpuMemRef, queues_[MainEngine]->iQueue_,
Pal::GpuMemoryRefCantTrim);
dev().iDev()->InitBusAddressableGpuMemory(queues_[MainEngine]->iQueue_, numObjects, pGpuMems);
if (numObjects != 0) {
dev().iDev()->RemoveGpuMemoryReferences(numObjects, &pGpuMems[0], queues_[MainEngine]->iQueue_);
dev().iDev()->RemoveGpuMemoryReferences(numObjects, &pGpuMems[0], queues_[MainEngine]->iQueue_);
}
for (uint i = 0; i < numObjects; i++) {
@@ -3104,8 +3080,8 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
break;
}
// Get SVM non-argument information
void* const* svmPtrArray = reinterpret_cast<void* const*>(
params + kernelParams.getExecInfoOffset());
void* const* svmPtrArray =
reinterpret_cast<void* const*>(params + kernelParams.getExecInfoOffset());
for (size_t i = 0; i < count; i++) {
amd::Memory* memory = amd::MemObjMap::FindMemObj(svmPtrArray[i]);
if (nullptr == memory) {
@@ -3149,8 +3125,7 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
bool srdResource = false;
amd::Memory* const* memories =
reinterpret_cast<amd::Memory* const*>(params + kernelParams.memoryObjOffset());
const HSAILKernel& hsaKernel =
static_cast<const HSAILKernel&>(*(kernel.getDeviceKernel(dev())));
const HSAILKernel& hsaKernel = static_cast<const HSAILKernel&>(*(kernel.getDeviceKernel(dev())));
const amd::KernelSignature& signature = kernel.signature();
ldsAddress = hsaKernel.ldsSize();
@@ -3225,10 +3200,10 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
addVmMemory(gpuMem);
const void* globalAddress = *reinterpret_cast<const void* const*>(params + desc.offset_);
LogPrintfInfo("!\targ%d: %s %s = ptr:%p obj:[%p-%p] threadId : %zx\n", index,
desc.typeName_.c_str(), desc.name_.c_str(),
globalAddress, reinterpret_cast<void*>(gpuMem->vmAddress()),
reinterpret_cast<void*>(gpuMem->vmAddress() + gpuMem->size()),
std::this_thread::get_id());
desc.typeName_.c_str(), desc.name_.c_str(), globalAddress,
reinterpret_cast<void*>(gpuMem->vmAddress()),
reinterpret_cast<void*>(gpuMem->vmAddress() + gpuMem->size()),
std::this_thread::get_id());
//! Check if compiler expects read/write.
//! Note: SVM with subbuffers has an issue with tracking.
@@ -3255,30 +3230,28 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
}
if (gpuMem->desc().isDoppTexture_) {
addDoppRef(gpuMem, kernel.parameters().getExecNewVcop(),
kernel.parameters().getExecPfpaVcop());
kernel.parameters().getExecPfpaVcop());
}
}
}
}
}
else if (desc.type_ == T_VOID) {
} else if (desc.type_ == T_VOID) {
if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
// Copy the current structure into CB1
size_t gpuPtr = static_cast<size_t>(cb(1)->UploadDataToHw(
params + desc.offset_, desc.size_));
size_t gpuPtr =
static_cast<size_t>(cb(1)->UploadDataToHw(params + desc.offset_, desc.size_));
// Then use a pointer in aqlArgBuffer to CB1
const auto it = hsaKernel.patch().find(desc.offset_);
// Patch the GPU VA address in the original arguments
WriteAqlArgAt(const_cast<address>(params), &gpuPtr, sizeof(size_t), it->second);
addVmMemory(cb(1)->ActiveMemory());
}
}
else if (desc.type_ == T_SAMPLER) {
} else if (desc.type_ == T_SAMPLER) {
srdResource = true;
} else if (desc.type_ == T_QUEUE) {
uint32_t index = desc.info_.arrayIndex_;
const amd::DeviceQueue* queue = reinterpret_cast<amd::DeviceQueue* const*>(
params + kernelParams.queueObjOffset())[index];
const amd::DeviceQueue* queue =
reinterpret_cast<amd::DeviceQueue* const*>(params + kernelParams.queueObjOffset())[index];
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
uint64_t vmQueue;
if (dev().settings().useDeviceQueue_) {
+100 -103
Просмотреть файл
@@ -51,17 +51,18 @@ class VirtualGPU : public device::VirtualDevice {
Queue(const Queue&) = delete;
Queue& operator=(const Queue&) = delete;
static Queue* Create(const VirtualGPU& gpu, //!< OCL virtual GPU object
Pal::QueueType queueType, //!< PAL queue type
uint engineIdx, //!< Select particular engine index
Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator
uint rtCU, //!< The number of reserved CUs
amd::CommandQueue::Priority priority, //!< Queue priority
uint64_t residency_limit, //!< Enables residency limit
uint max_command_buffers //!< Number of allocated command buffers
);
static Queue* Create(const VirtualGPU& gpu, //!< OCL virtual GPU object
Pal::QueueType queueType, //!< PAL queue type
uint engineIdx, //!< Select particular engine index
Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator
uint rtCU, //!< The number of reserved CUs
amd::CommandQueue::Priority priority, //!< Queue priority
uint64_t residency_limit, //!< Enables residency limit
uint max_command_buffers //!< Number of allocated command buffers
);
Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, uint max_command_buffers)
Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit,
uint max_command_buffers)
: iQueue_(nullptr),
iCmdBuffs_(max_command_buffers, nullptr),
iCmdFences_(max_command_buffers, nullptr),
@@ -75,8 +76,7 @@ class VirtualGPU : public device::VirtualDevice {
vlAlloc_(64 * Ki),
residency_size_(0),
residency_limit_(residency_limit),
max_command_buffers_(max_command_buffers)
{
max_command_buffers_(max_command_buffers) {
vlAlloc_.Init();
}
@@ -100,8 +100,7 @@ class VirtualGPU : public device::VirtualDevice {
Pal::Result UpdateAppPowerProfile();
// ibReuse forces event wait without polling, to make sure the event occurred
template <bool ibReuse>
bool waifForFence(uint cbId) const {
template <bool ibReuse> bool waifForFence(uint cbId) const {
Pal::Result result = Pal::Result::Success;
uint64_t start;
uint64_t end;
@@ -138,8 +137,7 @@ class VirtualGPU : public device::VirtualDevice {
//! Flushes the current command buffer to HW
//! Returns ID associated with the submission
template <bool avoidBarrierSubmit = false>
uint submit(bool forceFlush);
template <bool avoidBarrierSubmit = false> uint submit(bool forceFlush);
bool flush();
@@ -151,28 +149,28 @@ class VirtualGPU : public device::VirtualDevice {
uint cmdBufId() const { return cmdBufIdCurrent_; }
Pal::IQueue* iQueue_; //!< PAL queue object
std::vector<Pal::ICmdBuffer*> iCmdBuffs_; //!< PAL command buffers
std::vector<Pal::IFence*> iCmdFences_; //!< PAL fences, associated with CMD
const amd::Kernel* last_kernel_; //!< Last submitted kernel
Pal::IQueue* iQueue_; //!< PAL queue object
std::vector<Pal::ICmdBuffer*> iCmdBuffs_; //!< PAL command buffers
std::vector<Pal::IFence*> iCmdFences_; //!< PAL fences, associated with CMD
const amd::Kernel* last_kernel_; //!< Last submitted kernel
private:
private:
void DumpMemoryReferences() const;
const VirtualGPU& gpu_; //!< OCL virtual GPU object
Pal::IDevice* iDev_; //!< PAL device
uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions
uint cmdBufIdCurrent_; //!< Current global command buffer ID
uint cmbBufIdRetired_; //!< The last retired command buffer ID
uint cmdCnt_; //!< Counter of commands
const VirtualGPU& gpu_; //!< OCL virtual GPU object
Pal::IDevice* iDev_; //!< PAL device
uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions
uint cmdBufIdCurrent_; //!< Current global command buffer ID
uint cmbBufIdRetired_; //!< The last retired command buffer ID
uint cmdCnt_; //!< Counter of commands
std::unordered_map<GpuMemoryReference*, uint> memReferences_;
Util::VirtualLinearAllocator vlAlloc_;
std::vector<Pal::GpuMemoryRef> palMemRefs_;
std::vector<Pal::IGpuMemory*> palMems_;
std::vector<Pal::DoppRef> palDoppRefs_;
std::set<Pal::IGpuMemory*> sdiReferences_;
std::vector<const Pal::IGpuMemory*> palSdiRefs_;
uint64_t residency_size_; //!< Resource residency size
uint64_t residency_limit_; //!< Enables residency limit
Util::VirtualLinearAllocator vlAlloc_;
std::vector<Pal::GpuMemoryRef> palMemRefs_;
std::vector<Pal::IGpuMemory*> palMems_;
std::vector<Pal::DoppRef> palDoppRefs_;
std::set<Pal::IGpuMemory*> sdiReferences_;
std::vector<const Pal::IGpuMemory*> palSdiRefs_;
uint64_t residency_size_; //!< Resource residency size
uint64_t residency_limit_; //!< Enables residency limit
uint max_command_buffers_;
};
@@ -185,14 +183,14 @@ class VirtualGPU : public device::VirtualDevice {
CommandBatch(amd::Command* head, //!< Command batch head
const GpuEvent* events, //!< HW events on all engines
TimeStamp* lastTS //!< Last TS in command batch
) {
) {
init(head, events, lastTS);
}
void init(amd::Command* head, //!< Command batch head
const GpuEvent* events, //!< HW events on all engines
TimeStamp* lastTS //!< Last TS in command batch
) {
) {
head_ = head;
lastTS_ = lastTS;
memcpy(&events_, events, AllEngines * sizeof(GpuEvent));
@@ -202,11 +200,11 @@ class VirtualGPU : public device::VirtualDevice {
//! The virtual GPU states
union State {
struct {
uint profiling_ : 1; //!< Profiling is enabled
uint forceWait_ : 1; //!< Forces wait in flush()
uint profileEnabled_ : 1; //!< Profiling is enabled for WaveLimiter
uint perfCounterEnabled_ : 1; //!< PerfCounter is enabled
uint rgpCaptureEnabled_ : 1; //!< RGP capture is enabled in the runtime
uint profiling_ : 1; //!< Profiling is enabled
uint forceWait_ : 1; //!< Forces wait in flush()
uint profileEnabled_ : 1; //!< Profiling is enabled for WaveLimiter
uint perfCounterEnabled_ : 1; //!< PerfCounter is enabled
uint rgpCaptureEnabled_ : 1; //!< RGP capture is enabled in the runtime
};
uint value_;
State() : value_(0) {}
@@ -259,13 +257,13 @@ class VirtualGPU : public device::VirtualDevice {
void findSplitSize(const Device& dev, //!< GPU device object
uint64_t threads, //!< Total number of execution threads
uint instructions //!< Number of ALU instructions
);
);
// Returns TRUE if DMA command buffer is ready for a flush
bool isCbReady(VirtualGPU& gpu, //!< Virtual GPU object
uint64_t threads, //!< Total number of execution threads
uint instructions //!< Number of ALU instructions
);
);
// Returns dispatch split size
uint dispatchSplitSize() const { return dispatchSplitSize_; }
@@ -301,7 +299,7 @@ class VirtualGPU : public device::VirtualDevice {
bool nativeMem = true, //!< Native memory objects
amd::Event* enqueueEvent = nullptr, //!< Event provided in the enqueue kernel command
uint32_t sharedMemBytes = 0 //!< Shared memory size
);
);
void submitNativeFn(amd::NativeFnCommand& vcmd);
void submitFillMemory(amd::FillMemoryCommand& vcmd);
void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
@@ -331,20 +329,20 @@ class VirtualGPU : public device::VirtualDevice {
//! Set the last known GPU event
void setGpuEvent(GpuEvent gpuEvent, //!< GPU event for tracking
bool flush = false //!< TRUE if flush is required
);
);
//! Flush DMA buffer on the specified engine
void flushDMA(uint engineID //!< Engine ID for DMA flush
);
);
//! Wait for all engines on this Virtual GPU
//! Returns TRUE if CPU didn't wait for GPU
bool waitAllEngines(CommandBatch* cb = nullptr //!< Command batch
);
);
//! Waits for the latest GPU event with a lock to prevent multiple entries
void waitEventLock(CommandBatch* cb //!< Command batch
);
);
//! Returns a resource associated with the constant buffer
const ConstantBuffer* cb(uint idx) const { return constBufs_[idx]; }
@@ -355,7 +353,7 @@ class VirtualGPU : public device::VirtualDevice {
//! Start the command profiling
void profilingBegin(amd::Command& command, //!< Command queue object
bool drmProfiling = false //!< Measure DRM time
);
);
//! End the command profiling
void profilingEnd(amd::Command& command);
@@ -363,11 +361,11 @@ class VirtualGPU : public device::VirtualDevice {
//! Collect the profiling results
bool profilingCollectResults(CommandBatch* cb, //!< Command batch
const amd::Event* waitingEvent //!< Waiting event
);
);
//! Adds a memory handle into the GSL memory array for Virtual Heap
inline void addVmMemory(const Memory* memory //!< GPU memory object
);
);
//! Adds the last submitted kernel to the queue for tracking a possible hang
inline void AddKernel(const amd::Kernel& kernel //!< AMD kernel object
@@ -377,7 +375,7 @@ class VirtualGPU : public device::VirtualDevice {
void addDoppRef(const Memory* memory, //!< GPU memory object
bool lastDoopCmd, //!< is the last submission for the pre-present primary
bool pfpaDoppCmd //!< is a submission for the pre-present primary
);
);
//! Return xfer buffer for staging operations
XferBuffer& xferWrite() { return writeBuffer_; }
@@ -429,7 +427,7 @@ class VirtualGPU : public device::VirtualDevice {
//! Returns TRUE if virtual queue was successfully allocated
bool createVirtualQueue(uint deviceQueueSize //!< Device queue size
);
);
EngineType engineID_; //!< Engine ID for this VirtualGPU
@@ -447,7 +445,8 @@ class VirtualGPU : public device::VirtualDevice {
//! Returns queue, associated with VirtualGPU
Queue& queue(EngineType id) const { return *queues_[id]; }
void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::Unknown, bool flushL2 = false) const {
void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::Unknown,
bool flushL2 = false) const {
Pal::BarrierInfo barrier = {};
barrier.pipePointWaitCount = 1;
Pal::HwPipePoint point = Pal::HwPipePostCs;
@@ -508,7 +507,7 @@ class VirtualGPU : public device::VirtualDevice {
//! Returns TRUE if SDMA requires overlap synchronization
bool validateSdmaOverlap(const Resource& src, //!< Source resource for SDMA transfer
const Resource& dst //!< Destination resource for SDMA transfer
);
);
//! Checks if RGP capture is enabled
bool rgpCaptureEna() const { return state_.rgpCaptureEnabled_; }
@@ -519,7 +518,7 @@ class VirtualGPU : public device::VirtualDevice {
//! Creates buffer object from image
amd::Memory* createBufferFromImage(
amd::Memory& amdImage //! The parent image object(untiled images only)
);
);
private:
struct MemoryRange {
@@ -537,14 +536,14 @@ class VirtualGPU : public device::VirtualDevice {
//! Awaits a command batch with a waiting event
bool awaitCompletion(CommandBatch* cb, //!< Command batch for to wait
const amd::Event* waitingEvent = nullptr //!< A waiting event
);
);
//! Detects memory dependency for HSAIL kernels and flushes caches
bool processMemObjectsHSA(const amd::Kernel& kernel, //!< AMD kernel object for execution
const_address params, //!< Pointer to the param's store
bool nativeMem, //!< Native memory objects
size_t& ldsAddess //!< Returns LDS size, used in the kernel
);
size_t& ldsAddess //!< Returns LDS size, used in the kernel
);
//! Common function for fill memory used by both svm Fill and non-svm fill
bool fillMemory(cl_command_type type, //!< the command type
@@ -553,7 +552,7 @@ class VirtualGPU : public device::VirtualDevice {
size_t patternSize, //!< pattern size
const amd::Coord3D& origin, //!< memory origin
const amd::Coord3D& size //!< memory size for filling
);
);
bool copyMemory(cl_command_type type, //!< the command type
amd::Memory& srcMem, //!< source memory object
@@ -564,35 +563,36 @@ class VirtualGPU : public device::VirtualDevice {
const amd::Coord3D& size, //!< copy size
const amd::BufferRect& srcRect, //!< region of source for copy
const amd::BufferRect& dstRect //!< region of destination for copy
);
);
void buildKernelInfo(const HSAILKernel& hsaKernel, //!< hsa kernel
hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch
HwDbgKernelInfo& kernelInfo, //!< kernel info for the dispatch
amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command
);
);
void assignDebugTrapHandler(const DebugToolInfo& dbgSetting, //!< debug settings
HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch
);
);
void PrintChildren(const HSAILKernel& hsaKernel, //!< The parent HSAIL kernel
VirtualGPU* gpuDefQueue //!< Device queue for children execution
);
);
bool PreDeviceEnqueue(const amd::Kernel& kernel, //!< Parent amd kernel object
const HSAILKernel& hsaKernel, //!< Parent HSAIL object
VirtualGPU** gpuDefQueue, //!< [Return] GPU default queue
uint64_t* vmDefQueue //!< [Return] VM handle to the virtual queue
);
bool PreDeviceEnqueue(const amd::Kernel& kernel, //!< Parent amd kernel object
const HSAILKernel& hsaKernel, //!< Parent HSAIL object
VirtualGPU** gpuDefQueue, //!< [Return] GPU default queue
uint64_t* vmDefQueue //!< [Return] VM handle to the virtual queue
);
void PostDeviceEnqueue(const amd::Kernel& kernel, //!< Parent amd kernel object
const HSAILKernel& hsaKernel, //!< Parent HSAIL object
VirtualGPU* gpuDefQueue, //!< GPU default queue
uint64_t vmDefQueue, //!< VM handle to the virtual queue
uint64_t vmParentWrap, //!< VM handle to the wrapped AQL packet location
GpuEvent* gpuEvent //!< [Return] GPU event associated with the device enqueue
);
void PostDeviceEnqueue(
const amd::Kernel& kernel, //!< Parent amd kernel object
const HSAILKernel& hsaKernel, //!< Parent HSAIL object
VirtualGPU* gpuDefQueue, //!< GPU default queue
uint64_t vmDefQueue, //!< VM handle to the virtual queue
uint64_t vmParentWrap, //!< VM handle to the wrapped AQL packet location
GpuEvent* gpuEvent //!< [Return] GPU event associated with the device enqueue
);
Device& gpuDevice_; //!< physical GPU device
amd::Monitor execution_; //!< Lock to serialise access to all device objects
@@ -605,11 +605,11 @@ class VirtualGPU : public device::VirtualDevice {
DmaFlushMgmt dmaFlushMgmt_; //!< DMA flush management
std::vector<amd::Memory*> pinnedMems_; //!< Pinned memory list
std::vector<amd::Memory*> pinnedMems_; //!< Pinned memory list
ManagedBuffer managedBuffer_; //!< Managed write buffer
constbufs_t constBufs_; //!< constant buffers
XferBuffer writeBuffer_; //!< Transfer/staging buffer for uploads
ManagedBuffer managedBuffer_; //!< Managed write buffer
constbufs_t constBufs_; //!< constant buffers
XferBuffer writeBuffer_; //!< Transfer/staging buffer for uploads
typedef std::queue<CommandBatch*> CommandBatchQueue;
CommandBatchQueue cbQueue_; //!< Queue of command batches
@@ -617,12 +617,12 @@ class VirtualGPU : public device::VirtualDevice {
uint hwRing_; //!< HW ring used on this virtual device
State state_; //!< virtual GPU current state
State state_; //!< virtual GPU current state
GpuEvent events_[AllEngines]; //!< Last known GPU events
uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps
TimeStamp* lastTS_; //!< Last timestamp executed on Virtual GPU
TimeStamp* profileTs_; //!< current profiling timestamp for command
uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps
TimeStamp* lastTS_; //!< Last timestamp executed on Virtual GPU
TimeStamp* profileTs_; //!< current profiling timestamp for command
AmdVQueueHeader* vqHeader_; //!< Sysmem copy for virtual queue header
Memory* virtualQueue_; //!< Virtual device queue
@@ -645,8 +645,7 @@ inline void VirtualGPU::AddKernel(const amd::Kernel& kernel) const {
queues_[MainEngine]->last_kernel_ = &kernel;
}
template <bool avoidBarrierSubmit>
uint VirtualGPU::Queue::submit(bool forceFlush) {
template <bool avoidBarrierSubmit> uint VirtualGPU::Queue::submit(bool forceFlush) {
cmdCnt_++;
uint id = cmdBufIdCurrent_;
bool flushCmd = ((cmdCnt_ > MaxCommands) || forceFlush) && !avoidBarrierSubmit;
@@ -659,32 +658,30 @@ uint VirtualGPU::Queue::submit(bool forceFlush) {
}
//! Writes an argument value into a byte buffer at the given byte offset.
//! Generic version: copies exactly \p size bytes from \p src to \p dst + \p offset.
template <typename T>
inline void WriteAqlArgAt(unsigned char* dst,  //!< Base pointer of the destination buffer
                          const T* src,        //!< The source pointer
                          uint size,           //!< The size in bytes to copy
                          size_t offset        //!< Byte offset from dst where the write starts
) {
  memcpy(dst + offset, src, size);
}

//! 32-bit specialization: always writes exactly one uint32_t; \p size is ignored.
//! A fixed-size memcpy is used instead of a store through a reinterpret_cast'ed pointer,
//! so the write stays well defined when dst + offset is not 4-byte aligned.
template <>
inline void WriteAqlArgAt(unsigned char* dst,   //!< Base pointer of the destination buffer
                          const uint32_t* src,  //!< The source pointer
                          uint size,            //!< Unused, the write is always 4 bytes
                          size_t offset         //!< Byte offset from dst where the write starts
) {
  memcpy(dst + offset, src, sizeof(uint32_t));
}

//! 64-bit specialization: always writes exactly one uint64_t; \p size is ignored.
//! A fixed-size memcpy is used instead of a store through a reinterpret_cast'ed pointer,
//! so the write stays well defined when dst + offset is not 8-byte aligned.
template <>
inline void WriteAqlArgAt(unsigned char* dst,   //!< Base pointer of the destination buffer
                          const uint64_t* src,  //!< The source pointer
                          uint size,            //!< Unused, the write is always 8 bytes
                          size_t offset         //!< Byte offset from dst where the write starts
) {
  memcpy(dst + offset, src, sizeof(uint64_t));
}
/*@}*/} // namespace pal
/*@}*/ // namespace pal
} // namespace pal