SWDEV-301667 - Create TS for each node recorded in graph
- Create a vector to allow multiple TS to be stored in Command.
- This would mean we don't wait for the entire batch in the Accumulate command
to finish when we exhaust signals.
- Reduce the number of signals created at init to 64. This min value
may still need to be tuned but the KFD allows max of 4094 interrupt
signals per device.
- Store kernel names whenever they are available and not just when
profiling. If we dynamically enable profiling like for Torch, a crash
can happen if hipGraphInstantiate wasn't included in the Torch profile scope
because we previously entered kernel names only when the profiler was
attached.
Change-Id: I34e7881a25bbc763f82fdeb3408a8ea58e1ec006
[ROCm/clr commit: c157bfb202]
Этот коммит содержится в:
@@ -769,7 +769,7 @@ RUNTIME_ENTRY_RET(cl_event, clCreateEventFromGLsyncKHR,
|
||||
// initially set the status of fence as queued
|
||||
clglEvent->setStatus(CL_SUBMITTED);
|
||||
// store GLsync id of the fence in event in order to associate them together
|
||||
clglEvent->setData(clGLsync);
|
||||
clglEvent->data().emplace_back(clGLsync);
|
||||
amd::Event* evt = clglEvent;
|
||||
evt->retain();
|
||||
*not_null(errcode_ret) = CL_SUCCESS;
|
||||
|
||||
@@ -3317,7 +3317,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
|
||||
return;
|
||||
}
|
||||
// Save the TimeStamp object in the current OCL event
|
||||
command.setData(ts);
|
||||
command.data().emplace_back(ts);
|
||||
profileTs_ = ts;
|
||||
state_.profileEnabled_ = true;
|
||||
}
|
||||
@@ -3325,7 +3325,8 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
|
||||
|
||||
void VirtualGPU::profilingEnd(amd::Command& command) {
|
||||
// Get the TimeStamp object associated with the current command
|
||||
TimeStamp* ts = reinterpret_cast<TimeStamp*>(command.data());
|
||||
TimeStamp* ts = !command.data().empty() ? reinterpret_cast<TimeStamp*>(command.data().back())
|
||||
: nullptr;
|
||||
if (ts != nullptr) {
|
||||
// Check if the command actually did any GPU submission
|
||||
if (ts->isValid()) {
|
||||
@@ -3333,7 +3334,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
|
||||
} else {
|
||||
// Destroy the TimeStamp object
|
||||
tsCache_->freeTimeStamp(ts);
|
||||
command.setData(nullptr);
|
||||
command.data().clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3362,7 +3363,8 @@ bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* wai
|
||||
first = cb->head_;
|
||||
while (nullptr != first) {
|
||||
// Get the TimeStamp object associated with the current command
|
||||
TimeStamp* ts = reinterpret_cast<TimeStamp*>(first->data());
|
||||
TimeStamp* ts = !first->data().empty() ? reinterpret_cast<TimeStamp*>(first->data().back())
|
||||
: nullptr;
|
||||
|
||||
if (ts != nullptr) {
|
||||
ts->value(&startTimeStamp, &endTimeStamp);
|
||||
@@ -3379,7 +3381,8 @@ bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* wai
|
||||
first = cb->head_;
|
||||
while (nullptr != first) {
|
||||
// Get the TimeStamp object associated with the current command
|
||||
TimeStamp* ts = reinterpret_cast<TimeStamp*>(first->data());
|
||||
TimeStamp* ts = !first->data().empty() ? reinterpret_cast<TimeStamp*>(first->data().back())
|
||||
: nullptr;
|
||||
|
||||
current = first->getNext();
|
||||
|
||||
@@ -3389,7 +3392,7 @@ bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* wai
|
||||
startTimeStamp -= readjustTimeGPU_;
|
||||
// Destroy the TimeStamp object
|
||||
tsCache_->freeTimeStamp(ts);
|
||||
first->setData(nullptr);
|
||||
first->data().clear();
|
||||
} else {
|
||||
// For empty commands start/end is equal to
|
||||
// the end of the last valid command
|
||||
|
||||
@@ -194,23 +194,25 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) {
|
||||
head = ts->command().GetBatchHead();
|
||||
}
|
||||
while (head != nullptr) {
|
||||
if (head->data() != nullptr) {
|
||||
Timestamp* headTs = reinterpret_cast<Timestamp*>(head->data());
|
||||
ts->setParsedCommand(head);
|
||||
for (auto it : headTs->Signals()) {
|
||||
hsa_signal_value_t complete_val = (headTs->GetCallbackSignal().handle != 0) ? 1 : 0;
|
||||
if (int64_t val = hsa_signal_load_relaxed(it->signal_) > complete_val) {
|
||||
hsa_status_t result = hsa_amd_signal_async_handler(headTs->Signals()[0]->signal_,
|
||||
HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
|
||||
&HsaAmdSignalHandler, ts);
|
||||
if (HSA_STATUS_SUCCESS != result) {
|
||||
LogError("hsa_amd_signal_async_handler() failed to requeue the handler!");
|
||||
} else {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Requeue handler : value(%d), timestamp(%p),"
|
||||
"handle(0x%lx)", static_cast<uint32_t>(val), headTs,
|
||||
headTs->HwProfiling() ? headTs->Signals()[0]->signal_.handle : 0);
|
||||
if (!head->data().empty()) {
|
||||
for (auto i = 0; i < head->data().size(); i++) {
|
||||
Timestamp* headTs = reinterpret_cast<Timestamp*>(head->data()[i]);
|
||||
ts->setParsedCommand(head);
|
||||
for (auto it : headTs->Signals()) {
|
||||
hsa_signal_value_t complete_val = (headTs->GetCallbackSignal().handle != 0) ? 1 : 0;
|
||||
if (int64_t val = hsa_signal_load_relaxed(it->signal_) > complete_val) {
|
||||
hsa_status_t result = hsa_amd_signal_async_handler(headTs->Signals()[0]->signal_,
|
||||
HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
|
||||
&HsaAmdSignalHandler, ts);
|
||||
if (HSA_STATUS_SUCCESS != result) {
|
||||
LogError("hsa_amd_signal_async_handler() failed to requeue the handler!");
|
||||
} else {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Requeue handler : value(%d), timestamp(%p),"
|
||||
"handle(0x%lx)", static_cast<uint32_t>(val), headTs,
|
||||
headTs->HwProfiling() ? headTs->Signals()[0]->signal_.handle : 0);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -356,9 +358,7 @@ VirtualGPU::HwQueueTracker::~HwQueueTracker() {
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::HwQueueTracker::Create() {
|
||||
uint kSignalListSize = ROC_SIGNAL_POOL_SIZE;
|
||||
if (activity_prof::IsEnabled(OP_ID_DISPATCH) || gpu_.profiling_) {
|
||||
kSignalListSize = !flagIsDefault(ROC_SIGNAL_POOL_SIZE) ? ROC_SIGNAL_POOL_SIZE : 4 * Ki;
|
||||
}
|
||||
|
||||
signal_list_.resize(kSignalListSize);
|
||||
|
||||
hsa_agent_t agent = gpu_.gpu_device();
|
||||
@@ -981,14 +981,14 @@ bool VirtualGPU::dispatchAqlPacket(
|
||||
inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd) {
|
||||
amd::ScopedLock lock(execution());
|
||||
if (vcmd != nullptr) {
|
||||
profilingBegin(*vcmd, true, true);
|
||||
profilingBegin(*vcmd, true);
|
||||
}
|
||||
dispatchBlockingWait();
|
||||
constexpr size_t kPacketSize = 1;
|
||||
auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);
|
||||
dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize);
|
||||
if (vcmd != nullptr) {
|
||||
profilingEnd(*vcmd, true);
|
||||
profilingEnd(*vcmd);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@@ -1455,23 +1455,18 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
|
||||
* virtualgpu's timestamp_, saves the pointer timestamp_ to the command's data
|
||||
* and then calls start() to get the current host timestamp.
|
||||
*/
|
||||
void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling, bool useCommandTs) {
|
||||
void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
|
||||
if (command.profilingInfo().enabled_) {
|
||||
if (timestamp_ != nullptr) {
|
||||
LogWarning("Trying to create a second timestamp in VirtualGPU. \
|
||||
This could have unintended consequences.");
|
||||
return;
|
||||
}
|
||||
Timestamp* ts = useCommandTs ? reinterpret_cast<Timestamp*>(command.data()) : timestamp_;
|
||||
|
||||
if (ts == nullptr) {
|
||||
// Without barrier profiling will wait for each individual signal
|
||||
timestamp_ = new Timestamp(this, command);
|
||||
command.setData(timestamp_);
|
||||
timestamp_->start();
|
||||
} else {
|
||||
timestamp_ = ts;
|
||||
}
|
||||
// Without barrier profiling will wait for each individual signal
|
||||
timestamp_ = new Timestamp(this, command);
|
||||
command.data().emplace_back(timestamp_);
|
||||
timestamp_->start();
|
||||
|
||||
// Enable SDMA profiling on the first access if profiling is set
|
||||
// Its not per command basis
|
||||
@@ -1504,11 +1499,10 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling, bool
|
||||
* created for whatever command we are running and calls end() to get the
|
||||
* current host timestamp if no signal is available.
|
||||
*/
|
||||
void VirtualGPU::profilingEnd(amd::Command& command, bool useCommandTs) {
|
||||
void VirtualGPU::profilingEnd(amd::Command& command) {
|
||||
if (command.profilingInfo().enabled_) {
|
||||
Timestamp* ts = useCommandTs ? reinterpret_cast<Timestamp*>(command.data()) : timestamp_;
|
||||
if (ts->HwProfiling() == false) {
|
||||
ts->end();
|
||||
if (timestamp_->HwProfiling() == false) {
|
||||
timestamp_->end();
|
||||
}
|
||||
timestamp_ = nullptr;
|
||||
}
|
||||
@@ -1541,8 +1535,8 @@ void VirtualGPU::updateCommandsState(amd::Command* list) const {
|
||||
// came before it to start and end with this first valid start time.
|
||||
current = list;
|
||||
while (current != nullptr) {
|
||||
if (current->data() != nullptr) {
|
||||
ts = reinterpret_cast<Timestamp*>(current->data());
|
||||
if (!current->data().empty()) {
|
||||
ts = reinterpret_cast<Timestamp*>(current->data().back());
|
||||
ts->getTime(&startTimeStamp, &endTimeStamp);
|
||||
break;
|
||||
}
|
||||
@@ -1564,13 +1558,15 @@ void VirtualGPU::updateCommandsState(amd::Command* list) const {
|
||||
current = list;
|
||||
while (current != nullptr) {
|
||||
if (current->profilingInfo().enabled_) {
|
||||
if (current->data() != nullptr) {
|
||||
// Since this is a valid command to get a timestamp, we use the
|
||||
// timestamp provided by the runtime (saved in the data())
|
||||
ts = reinterpret_cast<Timestamp*>(current->data());
|
||||
ts->getTime(&startTimeStamp, &endTimeStamp);
|
||||
ts->release();
|
||||
current->setData(nullptr);
|
||||
if (!current->data().empty()) {
|
||||
for (auto i = 0; i < current->data().size(); i++) {
|
||||
// Since this is a valid command to get a timestamp, we use the
|
||||
// timestamp provided by the runtime (saved in the data())
|
||||
ts = reinterpret_cast<Timestamp*>(current->data()[i]);
|
||||
ts->getTime(&startTimeStamp, &endTimeStamp);
|
||||
ts->release();
|
||||
}
|
||||
current->data().clear();
|
||||
} else {
|
||||
// If we don't have a command that contains a valid timestamp,
|
||||
// we simply use the end timestamp of the previous command.
|
||||
@@ -3467,7 +3463,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
|
||||
void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
profilingBegin(vcmd, true, true);
|
||||
profilingBegin(vcmd, true);
|
||||
|
||||
uint8_t* aqlPacket = vcmd.getLastPacket();
|
||||
if (aqlPacket != nullptr) {
|
||||
@@ -3489,7 +3485,7 @@ void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
|
||||
}
|
||||
}
|
||||
|
||||
profilingEnd(vcmd, true);
|
||||
profilingEnd(vcmd);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
|
||||
@@ -306,8 +306,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
bool create();
|
||||
const Device& dev() const { return roc_device_; }
|
||||
|
||||
void profilingBegin(amd::Command& command, bool sdmaProfiling = false, bool useCommandTs = false);
|
||||
void profilingEnd(amd::Command& command, bool useCommandTs = false);
|
||||
void profilingBegin(amd::Command& command, bool sdmaProfiling = false);
|
||||
void profilingEnd(amd::Command& command);
|
||||
|
||||
void updateCommandsState(amd::Command* list) const;
|
||||
|
||||
|
||||
@@ -317,7 +317,6 @@ Command::Command(HostQueue& queue, cl_command_type type, const EventWaitList& ev
|
||||
queue_(&queue),
|
||||
next_(nullptr),
|
||||
type_(type),
|
||||
data_(nullptr),
|
||||
waitingEvent_(waitingEvent),
|
||||
eventWaitList_(eventWaitList),
|
||||
commandWaitBits_(commandWaitBits) {
|
||||
|
||||
@@ -258,7 +258,7 @@ class Command : public Event {
|
||||
Command* next_; //!< Next GPU command in the queue list
|
||||
Command* batch_head_ = nullptr; //!< The head of the batch commands
|
||||
cl_command_type type_; //!< This command's OpenCL type.
|
||||
void* data_;
|
||||
std::vector<void*> data_;
|
||||
const Event* waitingEvent_; //!< Waiting event associated with the marker
|
||||
|
||||
protected:
|
||||
@@ -282,7 +282,6 @@ class Command : public Event {
|
||||
queue_(nullptr),
|
||||
next_(nullptr),
|
||||
type_(type),
|
||||
data_(nullptr),
|
||||
waitingEvent_(nullptr),
|
||||
eventWaitList_(nullWaitList),
|
||||
commandWaitBits_(0) {}
|
||||
@@ -322,11 +321,9 @@ class Command : public Event {
|
||||
//! Return this command's OpenCL type.
|
||||
cl_command_type type() const { return type_; }
|
||||
|
||||
//! Return the opaque, device specific data for this command.
|
||||
void* data() const { return data_; }
|
||||
//! Return the opaque, device specific data vector for this command.
|
||||
std::vector<void*>& data() { return data_; }
|
||||
|
||||
//! Set the opaque, device specific data for this command.
|
||||
void setData(void* data) { data_ = data; }
|
||||
|
||||
/*! \brief The execution engine for this command.
|
||||
*
|
||||
@@ -1273,17 +1270,13 @@ class AccumulateCommand : public Command {
|
||||
|
||||
//! Add kernel name to the list if available
|
||||
void addKernelName(const std::string& kernelName) {
|
||||
if (activity_prof::IsEnabled(OP_ID_DISPATCH)) {
|
||||
// "^" is to indicate kernel is captured at instantiate
|
||||
kernelNames_.push_back("^ " + kernelName);
|
||||
}
|
||||
// "^" is to indicate kernel is captured at instantiate
|
||||
kernelNames_.push_back("^ " + kernelName);
|
||||
}
|
||||
|
||||
//! Add kernel timestamp to the list if available
|
||||
void addTimestamps(uint64_t startTs, uint64_t endTs) {
|
||||
if (activity_prof::IsEnabled(OP_ID_DISPATCH)) {
|
||||
tsList_.push_back(std::make_pair(startTs, endTs));
|
||||
}
|
||||
tsList_.push_back(std::make_pair(startTs, endTs));
|
||||
}
|
||||
|
||||
//! Return the kernel names
|
||||
|
||||
@@ -51,7 +51,8 @@
|
||||
bool amd::ClGlEvent::waitForFence() {
|
||||
GLenum ret;
|
||||
// get fence id associated with fence event
|
||||
GLsync gs = reinterpret_cast<GLsync>(command().data());
|
||||
GLsync gs = !command().data().empty() ? reinterpret_cast<GLsync>(command().data().back())
|
||||
: nullptr;
|
||||
if (!gs) return false;
|
||||
|
||||
// Try to use DC and GLRC of current thread, if it doesn't exist
|
||||
|
||||
@@ -219,7 +219,7 @@ release(uint, ROC_P2P_SDMA_SIZE, 1024, \
|
||||
"The minimum size in KB for P2P transfer with SDMA") \
|
||||
release(uint, ROC_AQL_QUEUE_SIZE, 16384, \
|
||||
"AQL queue size in AQL packets") \
|
||||
release(uint, ROC_SIGNAL_POOL_SIZE, 4096, \
|
||||
release(uint, ROC_SIGNAL_POOL_SIZE, 64, \
|
||||
"Initial size of HSA signal pool") \
|
||||
release(uint, DEBUG_CLR_LIMIT_BLIT_WG, 16, \
|
||||
"Limit the number of workgroups in blit operations") \
|
||||
|
||||
Ссылка в новой задаче
Block a user