SWDEV-404889 - Initial change for debugger support
- Program a unique AQL index for the debugger. The logic manages an array of AQL packets per HW queue. - Provide the debug state to PAL. Change-Id: I38fa1f5435fa711fd1d44dc391f2e61eb2a25efa
Этот коммит содержится в:
коммит произвёл
German Andryeyev
родитель
6f5277c701
Коммит
d97cc0abbd
@@ -64,6 +64,8 @@
|
||||
#include "protocols/driverControlServer.h"
|
||||
#endif // PAL_GPUOPEN_OCL
|
||||
|
||||
extern struct r_debug* _amdgpu_r_debug_ptr;
|
||||
|
||||
namespace {
|
||||
|
||||
//! Define the mapping from PAL asic revision enumeration values to the
|
||||
@@ -1142,6 +1144,15 @@ bool Device::initializeHeapResources() {
|
||||
if (iDev()->Finalize(finalizeInfo) != Pal::Result::Success) {
|
||||
return false;
|
||||
}
|
||||
#ifdef PAL_DEBUGGER
|
||||
Pal::RuntimeSetup setup;
|
||||
setup.r_debug = reinterpret_cast<uint64_t>(_amdgpu_r_debug_ptr);
|
||||
if (iDev()->RegisterRuntimeState(&setup) != Pal::Result::Success) {
|
||||
LogError("Couldn't register debug state from the loader!");
|
||||
// Note: ignore debug state error, since it's not a critical
|
||||
// error for the execution
|
||||
}
|
||||
#endif
|
||||
|
||||
heapInitComplete_ = true;
|
||||
|
||||
@@ -1391,7 +1402,6 @@ void Device::tearDown() {
|
||||
delete platformObj_;
|
||||
platform_ = nullptr;
|
||||
}
|
||||
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
if (compiler_ != nullptr) {
|
||||
amd::Hsail::CompilerFini(compiler_);
|
||||
@@ -2595,6 +2605,7 @@ bool Device::importExtSemaphore(void** extSemaphore, const amd::Os::FileDesc& ha
|
||||
return true;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void Device::DestroyExtSemaphore(void* extSemaphore) {
|
||||
Pal::IQueueSemaphore* sem = reinterpret_cast<Pal::IQueueSemaphore*>(extSemaphore);
|
||||
sem->Destroy();
|
||||
|
||||
@@ -231,11 +231,13 @@ class Sampler : public device::Sampler {
|
||||
class Device : public NullDevice {
|
||||
public:
|
||||
struct QueueRecycleInfo : public amd::HeapObject {
|
||||
int counter_; //!< Lock usage counter
|
||||
Pal::EngineType engineType_; //!< Engine type
|
||||
uint32_t index_; //!< HW queue index for scratch buffer access
|
||||
amd::Monitor queue_lock_; //!< Queue lock for access
|
||||
QueueRecycleInfo() : counter_(1), engineType_(Pal::EngineTypeCompute), index_(0) {}
|
||||
int counter_; //!< Lock usage counter
|
||||
Pal::EngineType engineType_; //!< Engine type
|
||||
uint32_t index_; //!< HW queue index for scratch buffer access
|
||||
amd::Monitor queue_lock_; //!< Queue lock for access
|
||||
AqlPacketMgmt aql_packet_mgmt_; //!< AQL packets management class for debugger support
|
||||
QueueRecycleInfo() : counter_(1), engineType_(Pal::EngineTypeCompute), index_(0),
|
||||
queue_lock_("Queue lock for sharing", true) {}
|
||||
};
|
||||
|
||||
//! Locks any access to the virtual GPUs
|
||||
|
||||
@@ -265,11 +265,10 @@ const HSAILProgram& HSAILKernel::prog() const {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel,
|
||||
const amd::NDRangeContainer& sizes,
|
||||
const_address params,
|
||||
size_t ldsAddress, uint64_t vmDefQueue,
|
||||
uint64_t* vmParentWrap) const {
|
||||
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
|
||||
VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
|
||||
const_address params, size_t ldsAddress, uint64_t vmDefQueue,
|
||||
uint64_t* vmParentWrap, uint32_t* aql_index) const {
|
||||
// Provide private and local heap addresses
|
||||
static constexpr uint AddressShift = LP64_SWITCH(0, 32);
|
||||
const_address parameters = params;
|
||||
@@ -451,9 +450,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
|
||||
signature.paramsSize()));
|
||||
}
|
||||
|
||||
// hsa_kernel_dispatch_packet_t disp;
|
||||
hsa_kernel_dispatch_packet_t* hsaDisp =
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(gpu.cb(0)->SysMemCopy());
|
||||
hsa_kernel_dispatch_packet_t* hsaDisp = gpu.GetAqlPacketSlot(aql_index);
|
||||
|
||||
constexpr uint16_t kDispatchPacketHeader =
|
||||
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
||||
|
||||
@@ -109,7 +109,8 @@ class HSAILKernel : public device::Kernel {
|
||||
const_address params, //!< Application arguments for the kernel
|
||||
size_t ldsAddress, //!< LDS address that includes all arguments.
|
||||
uint64_t vmDefQueue, //!< GPU VM default queue pointer
|
||||
uint64_t* vmParentWrap //!< GPU VM parent aql wrap object
|
||||
uint64_t* vmParentWrap, //!< GPU VM parent aql wrap object
|
||||
uint32_t* aql_index //!< AQL packet index in the packets array for debugger
|
||||
) const;
|
||||
|
||||
//! Returns the kernel index in the program
|
||||
|
||||
@@ -242,6 +242,14 @@ inline static std::vector<std::string> splitSpaceSeparatedString(char* str) {
|
||||
return vec;
|
||||
}
|
||||
|
||||
//! Builds a URI string of the form
//!   memory://<pid>#offset=0x<address in hex>&size=<size in decimal>
//! that identifies a region of this process' memory.
//! \param memory  Start address of the region; emitted as the hexadecimal "offset" field
//! \param size    Size of the region; emitted as the decimal "size" field
//! \return The formatted URI string
inline static std::string GetUriFromMemoryAddress(const void* memory, size_t size) {
|
||||
int pid = amd::Os::getProcessId();
|
||||
std::ostringstream uri_stream;
|
||||
// std::hex is sticky on the stream, so switch back to std::dec before printing the size
uri_stream << "memory://" << pid << "#offset=0x" << std::hex <<
|
||||
reinterpret_cast<uintptr_t>(memory) << std::dec << "&size=" << size;
|
||||
return uri_stream.str();
|
||||
}
|
||||
|
||||
bool HSAILProgram::createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize,
|
||||
bool internalKernel) {
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
@@ -256,7 +264,8 @@ bool HSAILProgram::createKernels(void* binary, size_t binSize, bool useUniformWo
|
||||
code_object.handle = reinterpret_cast<uint64_t>(binary);
|
||||
|
||||
hsa_agent_t agent = {amd::Device::toHandle(&(device()))};
|
||||
hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr);
|
||||
auto uri = GetUriFromMemoryAddress(binary, binSize);
|
||||
hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr, uri);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
buildLog_ += "Error: AMD HSA Code Object loading failed.\n";
|
||||
return false;
|
||||
@@ -762,8 +771,8 @@ bool LightningProgram::createKernels(void* binary, size_t binSize, bool useUnifo
|
||||
code_object.handle = reinterpret_cast<uint64_t>(binary);
|
||||
|
||||
hsa_agent_t agent = {amd::Device::toHandle(&(device()))};
|
||||
|
||||
hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr);
|
||||
auto uri = GetUriFromMemoryAddress(binary, binSize);
|
||||
hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr, uri);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
LogError("Error: AMD HSA Code Object loading failed.");
|
||||
return false;
|
||||
|
||||
@@ -151,7 +151,14 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
|
||||
// Create PAL queue object
|
||||
if (index < GPU_MAX_HW_QUEUES) {
|
||||
Device::QueueRecycleInfo* info = new (qSize) Device::QueueRecycleInfo();
|
||||
if (info == nullptr) {
|
||||
LogError("Could not create QueueRecycleInfo!");
|
||||
return nullptr;
|
||||
}
|
||||
addrQ = reinterpret_cast<address>(&info[1]);
|
||||
#ifdef PAL_DEBUGGER
|
||||
qCreateInfo.aqlPacketList = info->AqlPacketList();
|
||||
#endif
|
||||
result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
|
||||
if (result == Pal::Result::Success) {
|
||||
const_cast<Device&>(gpu.dev()).QueuePool().insert({queue->iQueue_, info});
|
||||
@@ -183,11 +190,22 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
|
||||
gpu.dev().QueuePool().find(queue->iQueue_)->second->counter_++;
|
||||
}
|
||||
Device::QueueRecycleInfo* info = gpu.dev().QueuePool().find(queue->iQueue_)->second;
|
||||
queue->aql_mgmt_ = &info->aql_packet_mgmt_;
|
||||
queue->lock_ = &info->queue_lock_;
|
||||
addrQ = reinterpret_cast<address>(&queue[1]);
|
||||
} else {
|
||||
Device::QueueRecycleInfo* info = new Device::QueueRecycleInfo();
|
||||
if (info == nullptr) {
|
||||
LogError("Could not create QueueRecycleInfo!");
|
||||
return nullptr;
|
||||
}
|
||||
queue->info_ = info;
|
||||
queue->aql_mgmt_ = &info->aql_packet_mgmt_;
|
||||
// Exclusive compute path
|
||||
addrQ = reinterpret_cast<address>(&queue[1]);
|
||||
#ifdef PAL_DEBUGGER
|
||||
qCreateInfo.aqlPacketList = info->AqlPacketList();
|
||||
#endif
|
||||
result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
|
||||
}
|
||||
if (result != Pal::Result::Success) {
|
||||
@@ -226,6 +244,8 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
|
||||
}
|
||||
|
||||
VirtualGPU::Queue::~Queue() {
|
||||
delete reinterpret_cast<Device::QueueRecycleInfo*>(info_);
|
||||
|
||||
if (nullptr != iQueue_) {
|
||||
// Make sure the queues are idle
|
||||
// It's unclear why PAL could still have a busy queue
|
||||
@@ -349,6 +369,8 @@ void VirtualGPU::Queue::addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, b
|
||||
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::Queue::flush() {
|
||||
amd::ScopedLock l(lock_);
|
||||
|
||||
if (!gpu_.dev().settings().alwaysResident_ && palMemRefs_.size() != 0) {
|
||||
if (Pal::Result::Success !=
|
||||
iDev_->AddGpuMemoryReferences(palMemRefs_.size(), &palMemRefs_[0], iQueue_,
|
||||
@@ -398,10 +420,8 @@ bool VirtualGPU::Queue::flush() {
|
||||
// Submit command buffer to OS
|
||||
Pal::Result result;
|
||||
if (gpu_.rgpCaptureEna()) {
|
||||
amd::ScopedLock l(lock_);
|
||||
result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit(iQueue_, cmdBufIdCurrent_, submitInfo);
|
||||
} else {
|
||||
amd::ScopedLock l(lock_);
|
||||
result = iQueue_->Submit(submitInfo);
|
||||
}
|
||||
if (Pal::Result::Success != result) {
|
||||
@@ -475,7 +495,9 @@ bool VirtualGPU::Queue::flush() {
|
||||
return true;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::Queue::waitForEvent(uint id) {
|
||||
amd::ScopedLock l(lock_);
|
||||
if (isDone(id)) {
|
||||
return true;
|
||||
}
|
||||
@@ -492,7 +514,9 @@ bool VirtualGPU::Queue::waitForEvent(uint id) {
|
||||
return result;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::Queue::isDone(uint id) {
|
||||
amd::ScopedLock l(lock_);
|
||||
if ((id <= cmbBufIdRetired_) || (id > cmdBufIdCurrent_)) {
|
||||
return true;
|
||||
}
|
||||
@@ -512,6 +536,7 @@ bool VirtualGPU::Queue::isDone(uint id) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::Queue::DumpMemoryReferences() const {
|
||||
std::fstream dump;
|
||||
std::stringstream file_name("ocl_hang_dump.txt");
|
||||
@@ -1079,6 +1104,14 @@ VirtualGPU::~VirtualGPU() {
|
||||
amd::ScopedLock k(dev().lockAsyncOps());
|
||||
amd::ScopedLock lock(dev().vgpusAccess());
|
||||
|
||||
// Clear all timestamps, associated with this virtual GPU
|
||||
auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
|
||||
for (uint32_t i = 0; i < AqlPacketMgmt::kAqlPacketsListSize; ++i) {
|
||||
if (mgmt.aql_vgpus_[i] == this) {
|
||||
mgmt.aql_vgpus_[i] = nullptr;
|
||||
mgmt.aql_events_[i].invalidate();
|
||||
}
|
||||
}
|
||||
// Destroy RGP trace
|
||||
if (rgpCaptureEna()) {
|
||||
dev().rgpCaptureMgr()->FinishRGPTrace(this, true);
|
||||
@@ -2661,9 +2694,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
}
|
||||
|
||||
uint64_t vmParentWrap = 0;
|
||||
uint32_t aql_index = 0;
|
||||
// Program the kernel arguments for the GPU execution
|
||||
hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
|
||||
*this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap);
|
||||
*this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap, &aql_index);
|
||||
if (nullptr == aqlPkt) {
|
||||
LogError("Couldn't load kernel arguments");
|
||||
return false;
|
||||
@@ -2684,6 +2718,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0;
|
||||
dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
|
||||
dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
|
||||
#ifdef PAL_DEBUGGER
|
||||
dispatchParam.aqlPacketIndex = aql_index;
|
||||
#endif
|
||||
// Run AQL dispatch in HW
|
||||
eventBegin(MainEngine);
|
||||
iCmd()->CmdDispatchAql(dispatchParam);
|
||||
@@ -2692,6 +2729,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
LogError("Something is wrong. ID mismatch!\n");
|
||||
}
|
||||
eventEnd(MainEngine, gpuEvent);
|
||||
AqlPacketUpdateTs(aql_index, gpuEvent);
|
||||
|
||||
// Execute scheduler for device enqueue
|
||||
if (hsaKernel.dynamicParallelism()) {
|
||||
@@ -2730,6 +2768,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
return true;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
|
||||
@@ -53,6 +53,22 @@ class BlitManager;
|
||||
class ThreadTrace;
|
||||
class HSAILKernel;
|
||||
|
||||
//! Bookkeeping for a fixed-size array of AQL dispatch packets (debugger support).
//! Each slot of the packet array has a matching GPU event and a matching
//! VirtualGPU pointer stored at the same index in the parallel arrays below.
struct AqlPacketMgmt : public amd::EmbeddedObject {
|
||||
//! Number of slots in every array of this object (presumably 4096, assuming Ki == 1024)
static constexpr uint32_t kAqlPacketsListSize = 4 * Ki;
|
||||
//! Starts the slot counter at 0 and clears all vgpu pointers to null.
//! The packet array itself is not explicitly initialized here.
AqlPacketMgmt()
|
||||
: packet_index_(0) {
|
||||
memset(aql_vgpus_, 0, sizeof(aql_vgpus_));
|
||||
}
|
||||
|
||||
//! Returns the address of the AQL packet array as an integer value
|
||||
uintptr_t AqlPacketList() const { return reinterpret_cast<uintptr_t>(&aql_packets_); }
|
||||
|
||||
hsa_kernel_dispatch_packet_t aql_packets_[kAqlPacketsListSize]; //!< The list of AQL packets
|
||||
GpuEvent aql_events_[kAqlPacketsListSize]; //!< The list of GPU events, one for each AQL packet slot
|
||||
VirtualGPU* aql_vgpus_[kAqlPacketsListSize]; //!< The list of vgpus which had submissions, one per slot
|
||||
std::atomic<uint64_t> packet_index_; //!< Monotonic slot counter; users wrap it by the list size
|
||||
};
|
||||
|
||||
//! Virtual GPU
|
||||
class VirtualGPU : public device::VirtualDevice {
|
||||
public:
|
||||
@@ -77,8 +93,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
uint max_command_buffers //!< Number of allocated command buffers
|
||||
);
|
||||
|
||||
Queue(VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit,
|
||||
uint max_command_buffers)
|
||||
Queue(VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, uint max_command_buffers)
|
||||
: lock_(nullptr),
|
||||
iQueue_(nullptr),
|
||||
iCmdBuffs_(max_command_buffers, nullptr),
|
||||
@@ -173,6 +188,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
std::vector<Pal::ICmdBuffer*> iCmdBuffs_; //!< PAL command buffers
|
||||
std::vector<Pal::IFence*> iCmdFences_; //!< PAL fences, associated with CMD
|
||||
const amd::Kernel* last_kernel_; //!< Last submitted kernel
|
||||
AqlPacketMgmt* aql_mgmt_; //!< AQL packet emulation managment
|
||||
void* info_ = nullptr; //!< Queue info for RT queues
|
||||
|
||||
private:
|
||||
void DumpMemoryReferences() const;
|
||||
@@ -273,7 +290,6 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue
|
||||
};
|
||||
|
||||
|
||||
class DmaFlushMgmt : public amd::EmbeddedObject {
|
||||
public:
|
||||
DmaFlushMgmt(const Device& dev);
|
||||
@@ -402,8 +418,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
);
|
||||
|
||||
//! Embeds memory handle info into the CB associated with this VGPU
|
||||
inline void logVmMemory(const std::string name, //!< Brief description of the memory object
|
||||
const Memory* memory //!< GPU memory object
|
||||
inline void logVmMemory(const std::string name, //!< Brief description of the memory object
|
||||
const Memory* memory //!< GPU memory object
|
||||
);
|
||||
|
||||
//! Adds a memory handle into the PAL memory array for Virtual Heap
|
||||
@@ -412,11 +428,11 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
//! Adds the last submitted kernel to the queue for tracking a possible hang
|
||||
inline void AddKernel(const amd::Kernel& kernel //!< AMD kernel object
|
||||
) const;
|
||||
) const;
|
||||
|
||||
//! Checks if runtime dispatches the same kernel as previously
|
||||
inline bool IsSameKernel(const amd::Kernel& kernel //!< AMD kernel object
|
||||
) const;
|
||||
) const;
|
||||
|
||||
//! Adds a dopp desktop texture reference
|
||||
void addDoppRef(const Memory* memory, //!< GPU memory object
|
||||
@@ -494,12 +510,10 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
barrier.pPipePoints = &point;
|
||||
barrier.transitionCount = 1;
|
||||
uint32_t cacheMask = (flushL2) ? Pal::CoherCopy : Pal::CoherShader;
|
||||
Pal::BarrierTransition trans = {cacheMask,
|
||||
cacheMask,
|
||||
{nullptr,
|
||||
{{0, 0, 0}, 0, 0, 0},
|
||||
Pal::LayoutShaderRead,
|
||||
Pal::LayoutShaderRead}};
|
||||
Pal::BarrierTransition trans = {
|
||||
cacheMask,
|
||||
cacheMask,
|
||||
{nullptr, {{0, 0, 0}, 0, 0, 0}, Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
|
||||
barrier.pTransitions = &trans;
|
||||
barrier.waitPoint = Pal::HwPipePreCs;
|
||||
barrier.reason = static_cast<uint32_t>(reason);
|
||||
@@ -578,6 +592,25 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
}
|
||||
}
|
||||
|
||||
//! Records the GPU event and the submitting virtual GPU for an AQL packet slot
//! of the main engine queue.
//! \param index      Slot index in the AQL packet list; not range-checked here
//! \param gpu_event  GPU event to associate with the slot
|
||||
void AqlPacketUpdateTs(uint32_t index, GpuEvent gpu_event) {
|
||||
// Save the new GPU event for this slot
|
||||
queues_[MainEngine]->aql_mgmt_->aql_events_[index] = gpu_event;
|
||||
// Remember which virtual GPU owns the event stored in this slot
queues_[MainEngine]->aql_mgmt_->aql_vgpus_[index] = this;
|
||||
}
|
||||
|
||||
//! Reserves the next AQL packet slot of the main engine queue and returns it.
//! \param index  [out] Receives the reserved slot index, in [0, kAqlPacketsListSize)
//! \return Pointer to the packet storage of the reserved slot
|
||||
hsa_kernel_dispatch_packet_t* GetAqlPacketSlot(uint32_t* index) {
|
||||
auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
|
||||
// Atomic increment global AQL index and wrap around max AQL list size.
// The counter is pre-incremented, so the incremented value selects the slot.
|
||||
*index = ++mgmt.packet_index_ % AqlPacketMgmt::kAqlPacketsListSize;
|
||||
if (mgmt.aql_events_[*index].isValid()) {
|
||||
// Make sure GPU doesn't process this slot: wait for the event stored in the slot
// on the virtual GPU that is recorded for it.
// NOTE(review): aql_vgpus_[*index] is dereferenced without a null check; this
// presumably relies on a valid event always having a recorded vgpu - confirm.
|
||||
mgmt.aql_vgpus_[*index]->waitForEvent(&mgmt.aql_events_[*index]);
|
||||
}
|
||||
return &mgmt.aql_packets_[*index];
|
||||
}
|
||||
|
||||
protected:
|
||||
void profileEvent(EngineType engine, bool type) const;
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user