SWDEV-301667 - Create TS for each node recorded in graph

- Create a vector to allow multiple TS to be stored in Command.
- This would mean we dont wait for entire batch in Accumulate command
to finish when we exhaust signals.
- Reduce the number of signals created at init to 64. This min value
may still need to be tuned but the KFD allows max of 4094 interrupt
signals per device.
- Store kernel names whenever they are available and not just when
profiling. If we dynamically enable profiling like for Torch, a crash
can happen if hipGraphInstantiate wasnt included in Torch profile scope
beacuse we previously entered kernel names only when profiler is
attached.

Change-Id: I34e7881a25bbc763f82fdeb3408a8ea58e1ec006


[ROCm/clr commit: c157bfb202]
Этот коммит содержится в:
Saleel Kudchadker
2024-03-21 18:26:01 +00:00
родитель 150144a05f
Коммит f3aedfbec0
8 изменённых файлов: 63 добавлений и 71 удалений
+1 -1
Просмотреть файл
@@ -769,7 +769,7 @@ RUNTIME_ENTRY_RET(cl_event, clCreateEventFromGLsyncKHR,
// initially set the status of fence as queued
clglEvent->setStatus(CL_SUBMITTED);
// store GLsync id of the fence in event in order to associate them together
clglEvent->setData(clGLsync);
clglEvent->data().emplace_back(clGLsync);
amd::Event* evt = clglEvent;
evt->retain();
*not_null(errcode_ret) = CL_SUCCESS;
+9 -6
Просмотреть файл
@@ -3317,7 +3317,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
return;
}
// Save the TimeStamp object in the current OCL event
command.setData(ts);
command.data().emplace_back(ts);
profileTs_ = ts;
state_.profileEnabled_ = true;
}
@@ -3325,7 +3325,8 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
void VirtualGPU::profilingEnd(amd::Command& command) {
// Get the TimeStamp object associated witht the current command
TimeStamp* ts = reinterpret_cast<TimeStamp*>(command.data());
TimeStamp* ts = !command.data().empty() ? reinterpret_cast<TimeStamp*>(command.data().back())
: nullptr;
if (ts != nullptr) {
// Check if the command actually did any GPU submission
if (ts->isValid()) {
@@ -3333,7 +3334,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
} else {
// Destroy the TimeStamp object
tsCache_->freeTimeStamp(ts);
command.setData(nullptr);
command.data().clear();
}
}
}
@@ -3362,7 +3363,8 @@ bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* wai
first = cb->head_;
while (nullptr != first) {
// Get the TimeStamp object associated witht the current command
TimeStamp* ts = reinterpret_cast<TimeStamp*>(first->data());
TimeStamp* ts = !first->data().empty() ? reinterpret_cast<TimeStamp*>(first->data().back())
: nullptr;
if (ts != nullptr) {
ts->value(&startTimeStamp, &endTimeStamp);
@@ -3379,7 +3381,8 @@ bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* wai
first = cb->head_;
while (nullptr != first) {
// Get the TimeStamp object associated witht the current command
TimeStamp* ts = reinterpret_cast<TimeStamp*>(first->data());
TimeStamp* ts = !first->data().empty() ? reinterpret_cast<TimeStamp*>(first->data().back())
: nullptr;
current = first->getNext();
@@ -3389,7 +3392,7 @@ bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* wai
startTimeStamp -= readjustTimeGPU_;
// Destroy the TimeStamp object
tsCache_->freeTimeStamp(ts);
first->setData(nullptr);
first->data().clear();
} else {
// For empty commands start/end is equal to
// the end of the last valid command
+42 -46
Просмотреть файл
@@ -194,23 +194,25 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) {
head = ts->command().GetBatchHead();
}
while (head != nullptr) {
if (head->data() != nullptr) {
Timestamp* headTs = reinterpret_cast<Timestamp*>(head->data());
ts->setParsedCommand(head);
for (auto it : headTs->Signals()) {
hsa_signal_value_t complete_val = (headTs->GetCallbackSignal().handle != 0) ? 1 : 0;
if (int64_t val = hsa_signal_load_relaxed(it->signal_) > complete_val) {
hsa_status_t result = hsa_amd_signal_async_handler(headTs->Signals()[0]->signal_,
HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
&HsaAmdSignalHandler, ts);
if (HSA_STATUS_SUCCESS != result) {
LogError("hsa_amd_signal_async_handler() failed to requeue the handler!");
} else {
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Requeue handler : value(%d), timestamp(%p),"
"handle(0x%lx)", static_cast<uint32_t>(val), headTs,
headTs->HwProfiling() ? headTs->Signals()[0]->signal_.handle : 0);
if (!head->data().empty()) {
for (auto i = 0; i < head->data().size(); i++) {
Timestamp* headTs = reinterpret_cast<Timestamp*>(head->data()[i]);
ts->setParsedCommand(head);
for (auto it : headTs->Signals()) {
hsa_signal_value_t complete_val = (headTs->GetCallbackSignal().handle != 0) ? 1 : 0;
if (int64_t val = hsa_signal_load_relaxed(it->signal_) > complete_val) {
hsa_status_t result = hsa_amd_signal_async_handler(headTs->Signals()[0]->signal_,
HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
&HsaAmdSignalHandler, ts);
if (HSA_STATUS_SUCCESS != result) {
LogError("hsa_amd_signal_async_handler() failed to requeue the handler!");
} else {
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Requeue handler : value(%d), timestamp(%p),"
"handle(0x%lx)", static_cast<uint32_t>(val), headTs,
headTs->HwProfiling() ? headTs->Signals()[0]->signal_.handle : 0);
}
return false;
}
return false;
}
}
}
@@ -356,9 +358,7 @@ VirtualGPU::HwQueueTracker::~HwQueueTracker() {
// ================================================================================================
bool VirtualGPU::HwQueueTracker::Create() {
uint kSignalListSize = ROC_SIGNAL_POOL_SIZE;
if (activity_prof::IsEnabled(OP_ID_DISPATCH) || gpu_.profiling_) {
kSignalListSize = !flagIsDefault(ROC_SIGNAL_POOL_SIZE) ? ROC_SIGNAL_POOL_SIZE : 4 * Ki;
}
signal_list_.resize(kSignalListSize);
hsa_agent_t agent = gpu_.gpu_device();
@@ -981,14 +981,14 @@ bool VirtualGPU::dispatchAqlPacket(
inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd) {
amd::ScopedLock lock(execution());
if (vcmd != nullptr) {
profilingBegin(*vcmd, true, true);
profilingBegin(*vcmd, true);
}
dispatchBlockingWait();
constexpr size_t kPacketSize = 1;
auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);
dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize);
if (vcmd != nullptr) {
profilingEnd(*vcmd, true);
profilingEnd(*vcmd);
}
return true;
}
@@ -1455,23 +1455,18 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
* virtualgpu's timestamp_, saves the pointer timestamp_ to the command's data
* and then calls start() to get the current host timestamp.
*/
void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling, bool useCommandTs) {
void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
if (command.profilingInfo().enabled_) {
if (timestamp_ != nullptr) {
LogWarning("Trying to create a second timestamp in VirtualGPU. \
This could have unintended consequences.");
return;
}
Timestamp* ts = useCommandTs ? reinterpret_cast<Timestamp*>(command.data()) : timestamp_;
if (ts == nullptr) {
// Without barrier profiling will wait for each individual signal
timestamp_ = new Timestamp(this, command);
command.setData(timestamp_);
timestamp_->start();
} else {
timestamp_ = ts;
}
// Without barrier profiling will wait for each individual signal
timestamp_ = new Timestamp(this, command);
command.data().emplace_back(timestamp_);
timestamp_->start();
// Enable SDMA profiling on the first access if profiling is set
// Its not per command basis
@@ -1504,11 +1499,10 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling, bool
* created for whatever command we are running and calls end() to get the
* current host timestamp if no signal is available.
*/
void VirtualGPU::profilingEnd(amd::Command& command, bool useCommandTs) {
void VirtualGPU::profilingEnd(amd::Command& command) {
if (command.profilingInfo().enabled_) {
Timestamp* ts = useCommandTs ? reinterpret_cast<Timestamp*>(command.data()) : timestamp_;
if (ts->HwProfiling() == false) {
ts->end();
if (timestamp_->HwProfiling() == false) {
timestamp_->end();
}
timestamp_ = nullptr;
}
@@ -1541,8 +1535,8 @@ void VirtualGPU::updateCommandsState(amd::Command* list) const {
// came before it to start and end with this first valid start time.
current = list;
while (current != nullptr) {
if (current->data() != nullptr) {
ts = reinterpret_cast<Timestamp*>(current->data());
if (!current->data().empty()) {
ts = reinterpret_cast<Timestamp*>(current->data().back());
ts->getTime(&startTimeStamp, &endTimeStamp);
break;
}
@@ -1564,13 +1558,15 @@ void VirtualGPU::updateCommandsState(amd::Command* list) const {
current = list;
while (current != nullptr) {
if (current->profilingInfo().enabled_) {
if (current->data() != nullptr) {
// Since this is a valid command to get a timestamp, we use the
// timestamp provided by the runtime (saved in the data())
ts = reinterpret_cast<Timestamp*>(current->data());
ts->getTime(&startTimeStamp, &endTimeStamp);
ts->release();
current->setData(nullptr);
if (!current->data().empty()) {
for (auto i = 0; i < current->data().size(); i++) {
// Since this is a valid command to get a timestamp, we use the
// timestamp provided by the runtime (saved in the data())
ts = reinterpret_cast<Timestamp*>(current->data()[i]);
ts->getTime(&startTimeStamp, &endTimeStamp);
ts->release();
}
current->data().clear();
} else {
// If we don't have a command that contains a valid timestamp,
// we simply use the end timestamp of the previous command.
@@ -3467,7 +3463,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd, true, true);
profilingBegin(vcmd, true);
uint8_t* aqlPacket = vcmd.getLastPacket();
if (aqlPacket != nullptr) {
@@ -3489,7 +3485,7 @@ void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
}
}
profilingEnd(vcmd, true);
profilingEnd(vcmd);
}
// ================================================================================================
+2 -2
Просмотреть файл
@@ -306,8 +306,8 @@ class VirtualGPU : public device::VirtualDevice {
bool create();
const Device& dev() const { return roc_device_; }
void profilingBegin(amd::Command& command, bool sdmaProfiling = false, bool useCommandTs = false);
void profilingEnd(amd::Command& command, bool useCommandTs = false);
void profilingBegin(amd::Command& command, bool sdmaProfiling = false);
void profilingEnd(amd::Command& command);
void updateCommandsState(amd::Command* list) const;
-1
Просмотреть файл
@@ -317,7 +317,6 @@ Command::Command(HostQueue& queue, cl_command_type type, const EventWaitList& ev
queue_(&queue),
next_(nullptr),
type_(type),
data_(nullptr),
waitingEvent_(waitingEvent),
eventWaitList_(eventWaitList),
commandWaitBits_(commandWaitBits) {
+6 -13
Просмотреть файл
@@ -258,7 +258,7 @@ class Command : public Event {
Command* next_; //!< Next GPU command in the queue list
Command* batch_head_ = nullptr; //!< The head of the batch commands
cl_command_type type_; //!< This command's OpenCL type.
void* data_;
std::vector<void*> data_;
const Event* waitingEvent_; //!< Waiting event associated with the marker
protected:
@@ -282,7 +282,6 @@ class Command : public Event {
queue_(nullptr),
next_(nullptr),
type_(type),
data_(nullptr),
waitingEvent_(nullptr),
eventWaitList_(nullWaitList),
commandWaitBits_(0) {}
@@ -322,11 +321,9 @@ class Command : public Event {
//! Return this command's OpenCL type.
cl_command_type type() const { return type_; }
//! Return the opaque, device specific data for this command.
void* data() const { return data_; }
//! Return the opaque, device specific data vector for this command.
std::vector<void*>& data() { return data_; }
//! Set the opaque, device specific data for this command.
void setData(void* data) { data_ = data; }
/*! \brief The execution engine for this command.
*
@@ -1273,17 +1270,13 @@ class AccumulateCommand : public Command {
//! Add kernel name to the list if available
void addKernelName(const std::string& kernelName) {
if (activity_prof::IsEnabled(OP_ID_DISPATCH)) {
// "^" is to indicate kernel is captured at instantiate
kernelNames_.push_back("^ " + kernelName);
}
// "^" is to indicate kernel is captured at instantiate
kernelNames_.push_back("^ " + kernelName);
}
//! Add kernel timestamp to the list if available
void addTimestamps(uint64_t startTs, uint64_t endTs) {
if (activity_prof::IsEnabled(OP_ID_DISPATCH)) {
tsList_.push_back(std::make_pair(startTs, endTs));
}
tsList_.push_back(std::make_pair(startTs, endTs));
}
//! Return the kernel names
+2 -1
Просмотреть файл
@@ -51,7 +51,8 @@
bool amd::ClGlEvent::waitForFence() {
GLenum ret;
// get fence id associated with fence event
GLsync gs = reinterpret_cast<GLsync>(command().data());
GLsync gs = !command().data().empty() ? reinterpret_cast<GLsync>(command().data().back())
: nullptr;
if (!gs) return false;
// Try to use DC and GLRC of current thread, if it doesn't exist
+1 -1
Просмотреть файл
@@ -219,7 +219,7 @@ release(uint, ROC_P2P_SDMA_SIZE, 1024, \
"The minimum size in KB for P2P transfer with SDMA") \
release(uint, ROC_AQL_QUEUE_SIZE, 16384, \
"AQL queue size in AQL packets") \
release(uint, ROC_SIGNAL_POOL_SIZE, 4096, \
release(uint, ROC_SIGNAL_POOL_SIZE, 64, \
"Initial size of HSA signal pool") \
release(uint, DEBUG_CLR_LIMIT_BLIT_WG, 16, \
"Limit the number of workgroups in blit operations") \