SWDEV-404889 - Initial change for debugger support

- Program a unique AQL index for the debugger. The logic manages an array of AQL packets per HW queue.
- Provide debug state to PAL

Change-Id: I38fa1f5435fa711fd1d44dc391f2e61eb2a25efa
Этот коммит содержится в:
German
2023-08-17 16:17:23 -04:00
коммит произвёл German Andryeyev
родитель 6f5277c701
Коммит d97cc0abbd
7 изменённых файлов: 126 добавлений и 34 удалений
+12 -1
Просмотреть файл
@@ -64,6 +64,8 @@
#include "protocols/driverControlServer.h"
#endif // PAL_GPUOPEN_OCL
extern struct r_debug* _amdgpu_r_debug_ptr;
namespace {
//! Define the mapping from PAL asic revision enumeration values to the
@@ -1142,6 +1144,15 @@ bool Device::initializeHeapResources() {
if (iDev()->Finalize(finalizeInfo) != Pal::Result::Success) {
return false;
}
#ifdef PAL_DEBUGGER
Pal::RuntimeSetup setup;
setup.r_debug = reinterpret_cast<uint64_t>(_amdgpu_r_debug_ptr);
if (iDev()->RegisterRuntimeState(&setup) != Pal::Result::Success) {
LogError("Couldn't register debug state from the loader!");
// Note: ignore debug state error, since it's not a critical
// error for the execution
}
#endif
heapInitComplete_ = true;
@@ -1391,7 +1402,6 @@ void Device::tearDown() {
delete platformObj_;
platform_ = nullptr;
}
#if defined(WITH_COMPILER_LIB)
if (compiler_ != nullptr) {
amd::Hsail::CompilerFini(compiler_);
@@ -2595,6 +2605,7 @@ bool Device::importExtSemaphore(void** extSemaphore, const amd::Os::FileDesc& ha
return true;
}
// ================================================================================================
void Device::DestroyExtSemaphore(void* extSemaphore) {
Pal::IQueueSemaphore* sem = reinterpret_cast<Pal::IQueueSemaphore*>(extSemaphore);
sem->Destroy();
+7 -5
Просмотреть файл
@@ -231,11 +231,13 @@ class Sampler : public device::Sampler {
class Device : public NullDevice {
public:
struct QueueRecycleInfo : public amd::HeapObject {
int counter_; //!< Lock usage counter
Pal::EngineType engineType_; //!< Engine type
uint32_t index_; //!< HW queue index for scratch buffer access
amd::Monitor queue_lock_; //!< Queue lock for access
QueueRecycleInfo() : counter_(1), engineType_(Pal::EngineTypeCompute), index_(0) {}
int counter_; //!< Lock usage counter
Pal::EngineType engineType_; //!< Engine type
uint32_t index_; //!< HW queue index for scratch buffer access
amd::Monitor queue_lock_; //!< Queue lock for access
AqlPacketMgmt aql_packet_mgmt_; //!< AQL packets management class for debugger support
QueueRecycleInfo() : counter_(1), engineType_(Pal::EngineTypeCompute), index_(0),
queue_lock_("Queue lock for sharing", true) {}
};
//! Locks any access to the virtual GPUs
+5 -8
Просмотреть файл
@@ -265,11 +265,10 @@ const HSAILProgram& HSAILKernel::prog() const {
}
// ================================================================================================
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel,
const amd::NDRangeContainer& sizes,
const_address params,
size_t ldsAddress, uint64_t vmDefQueue,
uint64_t* vmParentWrap) const {
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
const_address params, size_t ldsAddress, uint64_t vmDefQueue,
uint64_t* vmParentWrap, uint32_t* aql_index) const {
// Provide private and local heap addresses
static constexpr uint AddressShift = LP64_SWITCH(0, 32);
const_address parameters = params;
@@ -451,9 +450,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
signature.paramsSize()));
}
// hsa_kernel_dispatch_packet_t disp;
hsa_kernel_dispatch_packet_t* hsaDisp =
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(gpu.cb(0)->SysMemCopy());
hsa_kernel_dispatch_packet_t* hsaDisp = gpu.GetAqlPacketSlot(aql_index);
constexpr uint16_t kDispatchPacketHeader =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
+2 -1
Просмотреть файл
@@ -109,7 +109,8 @@ class HSAILKernel : public device::Kernel {
const_address params, //!< Application arguments for the kernel
size_t ldsAddress, //!< LDS address that includes all arguments.
uint64_t vmDefQueue, //!< GPU VM default queue pointer
uint64_t* vmParentWrap //!< GPU VM parent aql wrap object
uint64_t* vmParentWrap, //!< GPU VM parent aql wrap object
uint32_t* aql_index //!< AQL packet index in the packets array for debugger
) const;
//! Returns the kernel index in the program
+12 -3
Просмотреть файл
@@ -242,6 +242,14 @@ inline static std::vector<std::string> splitSpaceSeparatedString(char* str) {
return vec;
}
//! Builds a "memory://" URI for a code object image that lives in this process.
//! The URI carries the process id, the image address (hex) and the image size (decimal).
inline static std::string GetUriFromMemoryAddress(const void* memory, size_t size) {
  const int process_id = amd::Os::getProcessId();
  const auto image_address = reinterpret_cast<uintptr_t>(memory);

  std::ostringstream uri;
  uri << "memory://" << process_id;
  // The offset is printed in hex; switch back to decimal before the size
  uri << "#offset=0x" << std::hex << image_address;
  uri << std::dec << "&size=" << size;
  return uri.str();
}
bool HSAILProgram::createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize,
bool internalKernel) {
#if defined(WITH_COMPILER_LIB)
@@ -256,7 +264,8 @@ bool HSAILProgram::createKernels(void* binary, size_t binSize, bool useUniformWo
code_object.handle = reinterpret_cast<uint64_t>(binary);
hsa_agent_t agent = {amd::Device::toHandle(&(device()))};
hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr);
auto uri = GetUriFromMemoryAddress(binary, binSize);
hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr, uri);
if (status != HSA_STATUS_SUCCESS) {
buildLog_ += "Error: AMD HSA Code Object loading failed.\n";
return false;
@@ -762,8 +771,8 @@ bool LightningProgram::createKernels(void* binary, size_t binSize, bool useUnifo
code_object.handle = reinterpret_cast<uint64_t>(binary);
hsa_agent_t agent = {amd::Device::toHandle(&(device()))};
hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr);
auto uri = GetUriFromMemoryAddress(binary, binSize);
hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr, uri);
if (status != HSA_STATUS_SUCCESS) {
LogError("Error: AMD HSA Code Object loading failed.");
return false;
+42 -3
Просмотреть файл
@@ -151,7 +151,14 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
// Create PAL queue object
if (index < GPU_MAX_HW_QUEUES) {
Device::QueueRecycleInfo* info = new (qSize) Device::QueueRecycleInfo();
if (info == nullptr) {
LogError("Could not create QueueRecycleInfo!");
return nullptr;
}
addrQ = reinterpret_cast<address>(&info[1]);
#ifdef PAL_DEBUGGER
qCreateInfo.aqlPacketList = info->AqlPacketList();
#endif
result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
if (result == Pal::Result::Success) {
const_cast<Device&>(gpu.dev()).QueuePool().insert({queue->iQueue_, info});
@@ -183,11 +190,22 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
gpu.dev().QueuePool().find(queue->iQueue_)->second->counter_++;
}
Device::QueueRecycleInfo* info = gpu.dev().QueuePool().find(queue->iQueue_)->second;
queue->aql_mgmt_ = &info->aql_packet_mgmt_;
queue->lock_ = &info->queue_lock_;
addrQ = reinterpret_cast<address>(&queue[1]);
} else {
Device::QueueRecycleInfo* info = new Device::QueueRecycleInfo();
if (info == nullptr) {
LogError("Could not create QueueRecycleInfo!");
return nullptr;
}
queue->info_ = info;
queue->aql_mgmt_ = &info->aql_packet_mgmt_;
// Exclusive compute path
addrQ = reinterpret_cast<address>(&queue[1]);
#ifdef PAL_DEBUGGER
qCreateInfo.aqlPacketList = info->AqlPacketList();
#endif
result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
}
if (result != Pal::Result::Success) {
@@ -226,6 +244,8 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
}
VirtualGPU::Queue::~Queue() {
delete reinterpret_cast<Device::QueueRecycleInfo*>(info_);
if (nullptr != iQueue_) {
// Make sure the queues are idle
// It's unclear why PAL could still have a busy queue
@@ -349,6 +369,8 @@ void VirtualGPU::Queue::addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, b
// ================================================================================================
bool VirtualGPU::Queue::flush() {
amd::ScopedLock l(lock_);
if (!gpu_.dev().settings().alwaysResident_ && palMemRefs_.size() != 0) {
if (Pal::Result::Success !=
iDev_->AddGpuMemoryReferences(palMemRefs_.size(), &palMemRefs_[0], iQueue_,
@@ -398,10 +420,8 @@ bool VirtualGPU::Queue::flush() {
// Submit command buffer to OS
Pal::Result result;
if (gpu_.rgpCaptureEna()) {
amd::ScopedLock l(lock_);
result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit(iQueue_, cmdBufIdCurrent_, submitInfo);
} else {
amd::ScopedLock l(lock_);
result = iQueue_->Submit(submitInfo);
}
if (Pal::Result::Success != result) {
@@ -475,7 +495,9 @@ bool VirtualGPU::Queue::flush() {
return true;
}
// ================================================================================================
bool VirtualGPU::Queue::waitForEvent(uint id) {
amd::ScopedLock l(lock_);
if (isDone(id)) {
return true;
}
@@ -492,7 +514,9 @@ bool VirtualGPU::Queue::waitForEvent(uint id) {
return result;
}
// ================================================================================================
bool VirtualGPU::Queue::isDone(uint id) {
amd::ScopedLock l(lock_);
if ((id <= cmbBufIdRetired_) || (id > cmdBufIdCurrent_)) {
return true;
}
@@ -512,6 +536,7 @@ bool VirtualGPU::Queue::isDone(uint id) {
return true;
}
// ================================================================================================
void VirtualGPU::Queue::DumpMemoryReferences() const {
std::fstream dump;
std::stringstream file_name("ocl_hang_dump.txt");
@@ -1079,6 +1104,14 @@ VirtualGPU::~VirtualGPU() {
amd::ScopedLock k(dev().lockAsyncOps());
amd::ScopedLock lock(dev().vgpusAccess());
// Clear all timestamps, associated with this virtual GPU
auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
for (uint32_t i = 0; i < AqlPacketMgmt::kAqlPacketsListSize; ++i) {
if (mgmt.aql_vgpus_[i] == this) {
mgmt.aql_vgpus_[i] = nullptr;
mgmt.aql_events_[i].invalidate();
}
}
// Destroy RGP trace
if (rgpCaptureEna()) {
dev().rgpCaptureMgr()->FinishRGPTrace(this, true);
@@ -2661,9 +2694,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
}
uint64_t vmParentWrap = 0;
uint32_t aql_index = 0;
// Program the kernel arguments for the GPU execution
hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
*this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap);
*this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap, &aql_index);
if (nullptr == aqlPkt) {
LogError("Couldn't load kernel arguments");
return false;
@@ -2684,6 +2718,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0;
dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
#ifdef PAL_DEBUGGER
dispatchParam.aqlPacketIndex = aql_index;
#endif
// Run AQL dispatch in HW
eventBegin(MainEngine);
iCmd()->CmdDispatchAql(dispatchParam);
@@ -2692,6 +2729,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
LogError("Something is wrong. ID mismatch!\n");
}
eventEnd(MainEngine, gpuEvent);
AqlPacketUpdateTs(aql_index, gpuEvent);
// Execute scheduler for device enqueue
if (hsaKernel.dynamicParallelism()) {
@@ -2730,6 +2768,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
return true;
}
// ================================================================================================
void VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
+46 -13
Просмотреть файл
@@ -53,6 +53,22 @@ class BlitManager;
class ThreadTrace;
class HSAILKernel;
//! AQL packets management for debugger support.
//! Every slot of the packet list has a matching GPU event and virtual GPU entry.
struct AqlPacketMgmt : public amd::EmbeddedObject {
  //! Number of slots in the AQL packet list
  static constexpr uint32_t kAqlPacketsListSize = 4 * Ki;

  //! Starts with packet index zero and no virtual GPU recorded for any slot
  AqlPacketMgmt() : aql_vgpus_{}, packet_index_(0) {}

  //! Returns the address of the AQL packet list as an integer value
  uintptr_t AqlPacketList() const { return reinterpret_cast<uintptr_t>(&aql_packets_); }

  hsa_kernel_dispatch_packet_t aql_packets_[kAqlPacketsListSize];  //!< The list of AQL packets
  GpuEvent aql_events_[kAqlPacketsListSize];    //!< The list of GPU events for each AQL packet
  VirtualGPU* aql_vgpus_[kAqlPacketsListSize];  //!< The list of vgpus which had submissions
  std::atomic<uint64_t> packet_index_;          //!< The active packet slot index
};
//! Virtual GPU
class VirtualGPU : public device::VirtualDevice {
public:
@@ -77,8 +93,7 @@ class VirtualGPU : public device::VirtualDevice {
uint max_command_buffers //!< Number of allocated command buffers
);
Queue(VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit,
uint max_command_buffers)
Queue(VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, uint max_command_buffers)
: lock_(nullptr),
iQueue_(nullptr),
iCmdBuffs_(max_command_buffers, nullptr),
@@ -173,6 +188,8 @@ class VirtualGPU : public device::VirtualDevice {
std::vector<Pal::ICmdBuffer*> iCmdBuffs_; //!< PAL command buffers
std::vector<Pal::IFence*> iCmdFences_; //!< PAL fences, associated with CMD
const amd::Kernel* last_kernel_; //!< Last submitted kernel
AqlPacketMgmt* aql_mgmt_; //!< AQL packet emulation managment
void* info_ = nullptr; //!< Queue info for RT queues
private:
void DumpMemoryReferences() const;
@@ -273,7 +290,6 @@ class VirtualGPU : public device::VirtualDevice {
size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue
};
class DmaFlushMgmt : public amd::EmbeddedObject {
public:
DmaFlushMgmt(const Device& dev);
@@ -402,8 +418,8 @@ class VirtualGPU : public device::VirtualDevice {
);
//! Embeds memory handle info into the CB associated with this VGPU
inline void logVmMemory(const std::string name, //!< Brief description of the memory object
const Memory* memory //!< GPU memory object
inline void logVmMemory(const std::string name, //!< Brief description of the memory object
const Memory* memory //!< GPU memory object
);
//! Adds a memory handle into the PAL memory array for Virtual Heap
@@ -412,11 +428,11 @@ class VirtualGPU : public device::VirtualDevice {
//! Adds the last submitted kernel to the queue for tracking a possible hang
inline void AddKernel(const amd::Kernel& kernel //!< AMD kernel object
) const;
) const;
//! Checks if runtime dispatches the same kernel as previously
inline bool IsSameKernel(const amd::Kernel& kernel //!< AMD kernel object
) const;
) const;
//! Adds a dopp desktop texture reference
void addDoppRef(const Memory* memory, //!< GPU memory object
@@ -494,12 +510,10 @@ class VirtualGPU : public device::VirtualDevice {
barrier.pPipePoints = &point;
barrier.transitionCount = 1;
uint32_t cacheMask = (flushL2) ? Pal::CoherCopy : Pal::CoherShader;
Pal::BarrierTransition trans = {cacheMask,
cacheMask,
{nullptr,
{{0, 0, 0}, 0, 0, 0},
Pal::LayoutShaderRead,
Pal::LayoutShaderRead}};
Pal::BarrierTransition trans = {
cacheMask,
cacheMask,
{nullptr, {{0, 0, 0}, 0, 0, 0}, Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
barrier.pTransitions = &trans;
barrier.waitPoint = Pal::HwPipePreCs;
barrier.reason = static_cast<uint32_t>(reason);
@@ -578,6 +592,25 @@ class VirtualGPU : public device::VirtualDevice {
}
}
//! Records the GPU event and this virtual GPU for the given AQL packet slot
//! of the main engine queue
void AqlPacketUpdateTs(uint32_t index, GpuEvent gpu_event) {
  auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
  // Save the new CB ID for this slot, then remember which vgpu submitted it
  mgmt.aql_events_[index] = gpu_event;
  mgmt.aql_vgpus_[index] = this;
}
//! Returns the next AQL packet slot of the main engine queue.
//! \param index  [out] receives the slot index inside the AQL packet list
//! \return pointer to the packet storage for that slot
//! If the slot still carries a valid GPU event, the call first waits for that event
//! on the virtual GPU recorded for the slot.
hsa_kernel_dispatch_packet_t* GetAqlPacketSlot(uint32_t* index) {
  auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
  // Atomic increment global AQL index and wrap around max AQL list size.
  // The modulo result is always below the list size, so the narrowing is lossless
  *index = static_cast<uint32_t>(++mgmt.packet_index_ % AqlPacketMgmt::kAqlPacketsListSize);

  GpuEvent& slot_event = mgmt.aql_events_[*index];
  if (slot_event.isValid()) {
    // Read the owner once and validate it: ~VirtualGPU() resets the owner to nullptr
    // before it invalidates the event, so a valid event may be paired with no owner
    VirtualGPU* owner = mgmt.aql_vgpus_[*index];
    if (owner != nullptr) {
      // Make sure GPU doesn't process this slot
      owner->waitForEvent(&slot_event);
    }
  }
  return &mgmt.aql_packets_[*index];
}
protected:
void profileEvent(EngineType engine, bool type) const;