SWDEV-283726 - Workaround for rocprof hang
This addresses the rocprof hang seen with direct dispatch. The
workaround requeues the handler if any of the signal values in the batch
has not yet been decremented. To remember the last position in the list, we
save the parsed command in the current timestamp struct.
Change-Id: I02959e463cfe3cee83c54808ffd6e6f48f43b4e8
[ROCm/clr commit: e5e635f9bf]
This commit is contained in:
@@ -1192,6 +1192,8 @@ class VirtualDevice : public amd::HeapObject {
|
||||
ShouldNotReachHere();
|
||||
}
|
||||
virtual void submitStreamOperation(amd::StreamOperationCommand& cmd) { ShouldNotReachHere(); }
|
||||
|
||||
//! Tells the virtual device whether a profiler is attached; every backend must implement it
virtual void profilerAttach(bool enable) = 0;
|
||||
//! Get the blit manager object
|
||||
device::BlitManager& blitMgr() const { return *blitMgr_; }
|
||||
|
||||
|
||||
@@ -309,6 +309,9 @@ class VirtualGPU : public device::VirtualDevice, public CALGSLContext {
|
||||
//! End the command profiling
|
||||
void profilingEnd(amd::Command& command);
|
||||
|
||||
//! Attach profiler (no-op here: the enable flag is ignored by this backend)
|
||||
void profilerAttach(bool enable = false) {}
|
||||
|
||||
//! Collect the profiling results
|
||||
bool profilingCollectResults(CommandBatch* cb, //!< Command batch
|
||||
const amd::Event* waitingEvent //!< Waiting event
|
||||
|
||||
@@ -355,6 +355,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
void flush(amd::Command* list = nullptr, bool wait = false);
|
||||
|
||||
//! Attach profiler (no-op here: the enable flag is ignored by this backend)
void profilerAttach(bool enable = false) {}
|
||||
|
||||
//! Returns GPU device object associated with this kernel
|
||||
const Device& dev() const { return gpuDevice_; }
|
||||
|
||||
|
||||
@@ -147,12 +147,41 @@ void Timestamp::checkGpuTime() {
|
||||
// ================================================================================================
|
||||
bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) {
|
||||
Timestamp* ts = reinterpret_cast<Timestamp*>(arg);
|
||||
|
||||
amd::Thread* thread = amd::Thread::current();
|
||||
if (!(thread != nullptr ||
|
||||
((thread = new amd::HostThread()) != nullptr && thread == amd::Thread::current()))) {
|
||||
return false;
|
||||
}
|
||||
amd::ScopedLock sl(ts->gpu()->execution());
|
||||
if (ts->gpu()->isProfilerAttached()) {
|
||||
amd::Command* head = ts->getParsedCommand();
|
||||
if (head == nullptr) {
|
||||
head = ts->command().GetBatchHead();
|
||||
}
|
||||
while (head != nullptr) {
|
||||
if (head->data() != nullptr) {
|
||||
Timestamp* headTs = reinterpret_cast<Timestamp*>(head->data());
|
||||
ts->setParsedCommand(head);
|
||||
// Every signal attached to this command must have been decremented before the
// batch can be processed; if one is still pending, re-arm the handler and bail out.
for (auto it : headTs->Signals()) {
  // Load first, then compare. Declaring the variable inside the condition as
  // `val = load(...) > 0` stored the comparison result (0 or 1) instead of the
  // signal value, so the log below always reported a bogus value.
  const hsa_signal_value_t val = hsa_signal_load_relaxed(it->signal_);
  if (val > 0) {
    // NOTE(review): the handler is re-armed on the first signal of headTs rather
    // than on the signal found pending here — confirm this is intended.
    hsa_status_t result = hsa_amd_signal_async_handler(headTs->Signals()[0]->signal_,
                            HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
                            &HsaAmdSignalHandler, ts);
    if (HSA_STATUS_SUCCESS != result) {
      LogError("hsa_amd_signal_async_handler() failed to requeue the handler!");
    } else {
      ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Requeue handler : value(%d), timestamp(%p),"
              "handle(0x%lx)", static_cast<uint32_t>(val), headTs,
              headTs->HwProfiling() ? headTs->Signals()[0]->signal_.handle : 0);
    }
    // Stop here; the next callback resumes from the parsed command saved on ts.
    return false;
  }
}
|
||||
}
|
||||
head = head->getNext();
|
||||
}
|
||||
}
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Handler: value(%d), timestamp(%p), handle(0x%lx)",
|
||||
static_cast<uint32_t>(value), arg, ts->HwProfiling() ? ts->Signals()[0]->signal_.handle : 0);
|
||||
|
||||
|
||||
@@ -89,6 +89,7 @@ class Timestamp : public amd::HeapObject {
|
||||
uint64_t end_;
|
||||
VirtualGPU* gpu_; //!< Virtual GPU, associated with this timestamp
|
||||
const amd::Command& command_; //!< Command, associated with this timestamp
|
||||
amd::Command* parsedCommand_; //!< Command down the list, considering command_ as head
|
||||
std::vector<ProfilingSignal*> signals_;
|
||||
|
||||
public:
|
||||
@@ -96,7 +97,8 @@ class Timestamp : public amd::HeapObject {
|
||||
: start_(std::numeric_limits<uint64_t>::max())
|
||||
, end_(0)
|
||||
, gpu_(gpu)
|
||||
, command_(command) {}
|
||||
, command_(command)
|
||||
, parsedCommand_(nullptr) {}
|
||||
|
||||
~Timestamp() {}
|
||||
|
||||
@@ -131,6 +133,12 @@ class Timestamp : public amd::HeapObject {
|
||||
//! Returns amd::command assigned to this timestamp
|
||||
const amd::Command& command() const { return command_; }
|
||||
|
||||
//! Sets the parsed command
|
||||
void setParsedCommand(amd::Command* command) { parsedCommand_ = command; }
|
||||
|
||||
//! Gets the parsed command
|
||||
amd::Command* getParsedCommand() const { return parsedCommand_; }
|
||||
|
||||
//! Returns virtual GPU device, used with this timestamp
|
||||
VirtualGPU* gpu() const { return gpu_; }
|
||||
};
|
||||
@@ -344,6 +352,9 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
Timestamp* timestamp() const { return timestamp_; }
|
||||
|
||||
//! Records whether a profiler is attached to this virtual GPU
void profilerAttach(bool enable = false) {
  profilerAttached_ = enable;
}
|
||||
|
||||
bool isProfilerAttached() { return profilerAttached_; }
|
||||
// } roc OpenCL integration
|
||||
private:
|
||||
bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header,
|
||||
@@ -413,6 +424,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
uint32_t cooperative_ : 1; //!< Cooperative launch is enabled
|
||||
uint32_t addSystemScope_ : 1; //!< Insert a system scope to the next aql
|
||||
uint32_t tracking_created_ : 1; //!< Enabled if tracking object was properly initialized
|
||||
uint32_t profilerAttached_ : 1; //!< Indicates if profiler is attached
|
||||
};
|
||||
uint32_t state_;
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user