clr: Optimize doorbell ring (#1030)
*Lay foundation to batch packets efficiently for graphs *Dynamically copy packets with max threshold set with DEBUG_HIP_GRAPH_BATCH_SIZE, if not stagger packet copy with pow2 *Default threshold for DEBUG_HIP_GRAPH_BATCH_SIZE is 256 *If TS are not collected for a signal for reuse, create a new signal. This can potentially increase signal footprint if the handler doesn't run fast enough.
Этот коммит содержится в:
@@ -1,4 +1,4 @@
|
||||
/* Copyright (c) 2021 - 2021 Advanced Micro Devices, Inc.
|
||||
/* Copyright (c) 2021 - 2025 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -394,9 +394,20 @@ void GraphExec::GetKernelArgSizeForGraph(size_t& kernArgSizeForGraph) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t GraphExec::AllocKernelArgForGraphNode() {
|
||||
hipError_t GraphExec::CaptureAndFormPacketsForGraph() {
|
||||
hipError_t status = hipSuccess;
|
||||
for (auto& node : topoOrder_) {
|
||||
|
||||
// Clear previous capture status and batches
|
||||
nodeCaptureStatus_.clear();
|
||||
nodeCaptureStatus_.resize(topoOrder_.size(), false);
|
||||
|
||||
// Clear previous batches
|
||||
packetBatches_.clear();
|
||||
|
||||
// Process nodes and create batches of consecutive captured nodes
|
||||
for (size_t i = 0; i < topoOrder_.size(); ++i) {
|
||||
auto& node = topoOrder_[i];
|
||||
|
||||
if (node->GetType() == hipGraphNodeTypeKernel) {
|
||||
// Check if graph requires hidden heap and set as part of graphExec param.
|
||||
static bool initialized = false;
|
||||
@@ -405,19 +416,51 @@ hipError_t GraphExec::AllocKernelArgForGraphNode() {
|
||||
initialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (node->GraphCaptureEnabled()) {
|
||||
status = node->CaptureAndFormPacket(GetKernelArgManager());
|
||||
// Start of a potential batch - try to capture packets for this node
|
||||
std::vector<uint8_t*> currentBatch;
|
||||
std::vector<std::string> currentKernelNames;
|
||||
|
||||
// Collect packets from consecutive captured nodes
|
||||
size_t j = i;
|
||||
size_t capturedNodeCount = 0;
|
||||
while (j < topoOrder_.size() && topoOrder_[j]->GraphCaptureEnabled()) {
|
||||
auto& currentNode = topoOrder_[j];
|
||||
status = currentNode->CaptureAndFormPacket(GetKernelArgManager(), ¤tBatch,
|
||||
¤tKernelNames);
|
||||
|
||||
if (status != hipSuccess || currentBatch.empty()) {
|
||||
LogError("Packet capture failed");
|
||||
return status;
|
||||
}
|
||||
// Mark this node as successfully captured
|
||||
nodeCaptureStatus_[j] = true;
|
||||
++j;
|
||||
++capturedNodeCount;
|
||||
}
|
||||
|
||||
// Add the batch if it has packets
|
||||
if (!currentBatch.empty()) {
|
||||
packetBatches_.emplace_back(std::move(currentBatch), std::move(currentKernelNames),
|
||||
capturedNodeCount);
|
||||
}
|
||||
|
||||
// Skip the nodes we just processed, the index will be incremented by the loop
|
||||
i = j - 1;
|
||||
} else if (node->GetType() == hipGraphNodeTypeGraph) {
|
||||
auto childNode = reinterpret_cast<hip::ChildGraphNode*>(node);
|
||||
if (childNode->GetChildGraph()->max_streams_ == 1) {
|
||||
childNode->SetGraphCaptureStatus(true);
|
||||
status = childNode->AllocKernelArgForGraphNode();
|
||||
status = childNode->CaptureAndFormPacketsForGraph();
|
||||
nodeCaptureStatus_[i] = (status == hipSuccess);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
status = hipSuccess; // Continue with other nodes
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
@@ -427,8 +470,8 @@ hipError_t GraphExec::CaptureAQLPackets() {
|
||||
size_t kernArgSizeForGraph = 0;
|
||||
GetKernelArgSizeForGraph(kernArgSizeForGraph);
|
||||
// When we support multi device graph lauch we need to allocate the kenel args on respective
|
||||
// device for each kernel Assume graph has nodes of same device allocate kernel args on the device
|
||||
// from the first node
|
||||
// device for each kernel Assume graph has nodes of same device allocate kernel args on the
|
||||
// device from the first node
|
||||
auto device = g_devices[topoOrder_[0]->GetDeviceId()]->devices()[0];
|
||||
// Add a larger initial pool to accomodate for any updates to kernel args
|
||||
bool bStatus =
|
||||
@@ -437,10 +480,11 @@ hipError_t GraphExec::CaptureAQLPackets() {
|
||||
return hipErrorMemoryAllocation;
|
||||
}
|
||||
|
||||
status = AllocKernelArgForGraphNode();
|
||||
status = CaptureAndFormPacketsForGraph();
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
|
||||
kernArgManager_->ReadBackOrFlush();
|
||||
return status;
|
||||
}
|
||||
@@ -471,18 +515,37 @@ hipError_t GraphExec::EnqueueGraphWithSingleList(hip::Stream* hip_stream) {
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr);
|
||||
}
|
||||
for (int i = 0; i < topoOrder_.size(); i++) {
|
||||
if (topoOrder_[i]->GraphCaptureEnabled()) {
|
||||
if (topoOrder_[i]->GetEnabled()) {
|
||||
std::vector<uint8_t*>& gpuPackets = topoOrder_[i]->GetAqlPackets();
|
||||
for (auto& packet : gpuPackets) {
|
||||
hip_stream->vdev()->dispatchAqlPacket(packet, topoOrder_[i]->GetKernelName(), accumulate);
|
||||
|
||||
size_t batchIndex = 0;
|
||||
|
||||
// Process nodes in topological order with mixed execution strategy
|
||||
for (size_t i = 0; i < topoOrder_.size(); ++i) {
|
||||
auto& node = topoOrder_[i];
|
||||
|
||||
if (!node->GraphCaptureEnabled()) {
|
||||
// Node doesn't support capture - execute individually
|
||||
node->SetStream(hip_stream);
|
||||
status = node->CreateCommand(node->GetQueue());
|
||||
node->EnqueueCommands(hip_stream);
|
||||
} else if (i < nodeCaptureStatus_.size() && nodeCaptureStatus_[i]) {
|
||||
// Node was successfully captured - find which batch it belongs to
|
||||
// and dispatch the entire batch
|
||||
if (batchIndex < packetBatches_.size()) {
|
||||
// Dispatch this batch
|
||||
bool batchStatus = hip_stream->vdev()->dispatchAqlPacketBatch(
|
||||
packetBatches_[batchIndex].packets, packetBatches_[batchIndex].kernelNames, accumulate);
|
||||
if (!batchStatus) {
|
||||
status = hipErrorUnknown;
|
||||
accumulate->release();
|
||||
return status;
|
||||
}
|
||||
|
||||
// Skip all consecutive captured nodes that belong to this batch
|
||||
// Use the tracked node count to skip directly instead of parsing one by one
|
||||
i += packetBatches_[batchIndex].capturedNodeCount - 1; // -1 because loop will increment
|
||||
|
||||
++batchIndex;
|
||||
}
|
||||
} else {
|
||||
topoOrder_[i]->SetStream(hip_stream);
|
||||
status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
|
||||
topoOrder_[i]->EnqueueCommands(hip_stream);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
/* Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc.
|
||||
/* Copyright (c) 2021 - 2025 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -245,15 +245,22 @@ class GraphNode : public hipGraphNodeDOTAttribute {
|
||||
size_t GetKerArgSize() const { return alignedKernArgSize_; }
|
||||
size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; }
|
||||
size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; }
|
||||
hipError_t CaptureAndFormPacket(GraphKernelArgManager* kernArgMgr) {
|
||||
|
||||
//! Capture packets and accumulate them into a batch if provided
|
||||
hipError_t CaptureAndFormPacket(GraphKernelArgManager* kernArgMgr,
|
||||
std::vector<uint8_t*>* batchPackets = nullptr,
|
||||
std::vector<std::string>* batchKernelNames = nullptr) {
|
||||
auto capture_stream = hip::getNullStream(g_devices[dev_id_]->devices()[0]->context(), false);
|
||||
hipError_t status = CreateCommand(capture_stream);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
|
||||
// Release last created packet memory before they are overwritten with new packets
|
||||
std::for_each(gpuPackets_.begin(), gpuPackets_.end(), [](auto p) { delete[] p; });
|
||||
// Clear the pointer array
|
||||
gpuPackets_.clear();
|
||||
|
||||
for (auto& command : commands_) {
|
||||
command->setPktCapturingState(true, &gpuPackets_, kernArgMgr, &capturedKernelName_);
|
||||
// Enqueue command to capture GPU Packet. The packet is not submitted to the device.
|
||||
@@ -261,6 +268,15 @@ class GraphNode : public hipGraphNodeDOTAttribute {
|
||||
command->submit(*(command->queue())->vdev());
|
||||
command->release();
|
||||
}
|
||||
|
||||
// Accumulate packets directly into the batch (only if batch vectors are provided)
|
||||
if (batchPackets != nullptr && batchKernelNames != nullptr) {
|
||||
for (auto& packet : gpuPackets_) {
|
||||
batchPackets->push_back(packet);
|
||||
batchKernelNames->push_back(capturedKernelName_);
|
||||
}
|
||||
}
|
||||
|
||||
// Commands are captured and released. Clear them from the object.
|
||||
commands_.clear();
|
||||
|
||||
@@ -814,6 +830,9 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
|
||||
if (instantiateDeviceId_ != -1) {
|
||||
static_cast<ReferenceCountedObject*>(g_devices[instantiateDeviceId_])->release();
|
||||
}
|
||||
|
||||
packetBatches_.clear();
|
||||
nodeCaptureStatus_.clear();
|
||||
}
|
||||
|
||||
Node GetClonedNode(Node node) {
|
||||
@@ -849,7 +868,7 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
|
||||
}
|
||||
GraphKernelArgManager* GetKernelArgManager() { return kernArgManager_; }
|
||||
static void DecrementRefCount(cl_event event, cl_int command_exec_status, void* user_data);
|
||||
hipError_t AllocKernelArgForGraphNode();
|
||||
hipError_t CaptureAndFormPacketsForGraph();
|
||||
void GetKernelArgSizeForGraph(size_t& kernArgSizeForGraph);
|
||||
hipError_t EnqueueGraphWithSingleList(hip::Stream* hip_stream);
|
||||
bool TopologicalOrder() { return Graph::TopologicalOrder(topoOrder_); }
|
||||
@@ -863,6 +882,23 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
|
||||
int instantiateDeviceId_ = -1;
|
||||
bool hasHiddenHeap_ = false; //!< Hidden heap indicator for Kernel node
|
||||
bool repeatLaunch_ = false;
|
||||
|
||||
//! Structure for batch dispatch optimization - packets and kernel names in aligned memory
|
||||
struct PacketBatch {
|
||||
std::vector<uint8_t*> packets;
|
||||
std::vector<std::string> kernelNames;
|
||||
size_t capturedNodeCount; // Number of consecutive captured nodes in this batch
|
||||
|
||||
PacketBatch() : capturedNodeCount(0) {}
|
||||
PacketBatch(std::vector<uint8_t*>&& p, std::vector<std::string>&& k, size_t nodeCount)
|
||||
: packets(std::move(p)), kernelNames(std::move(k)), capturedNodeCount(nodeCount) {}
|
||||
};
|
||||
|
||||
//! Batches of accumulated packets and kernel names for batch dispatch optimization
|
||||
//! Each batch contains packets from consecutive captured nodes
|
||||
std::vector<PacketBatch> packetBatches_;
|
||||
//! Track which nodes were successfully captured (true) vs need individual execution (false)
|
||||
std::vector<bool> nodeCaptureStatus_;
|
||||
};
|
||||
|
||||
class ChildGraphNode : public GraphNode, public GraphExec {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
/* Copyright (c) 2008 - 2023 Advanced Micro Devices, Inc.
|
||||
/* Copyright (c) 2008 - 2025 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -1351,10 +1351,11 @@ class VirtualDevice : public amd::HeapObject {
|
||||
virtual bool isFenceDirty() const = 0;
|
||||
//! Init hidden heap for device memory allocations
|
||||
virtual void HiddenHeapInit() = 0;
|
||||
//! Dispatch captured AQL packet
|
||||
virtual bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName,
|
||||
amd::AccumulateCommand* vcmd = nullptr) = 0;
|
||||
|
||||
//! Dispatches multiple AQL packets in a single batch operation
|
||||
virtual bool dispatchAqlPacketBatch(const std::vector<uint8_t*>& packets,
|
||||
const std::vector<std::string>& kernelNames,
|
||||
amd::AccumulateCommand* vcmd = nullptr) = 0 ;
|
||||
//! Returns the number of outstanding HSA async handlers
|
||||
std::atomic<uint64_t>& QueuedAsyncHandlers() const { return queued_async_handlers_; }
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
/* Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc.
|
||||
/* Copyright (c) 2015 - 2025 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -355,9 +355,10 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
void HiddenHeapInit() {}
|
||||
|
||||
inline bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName,
|
||||
amd::AccumulateCommand* vcmd = nullptr) {
|
||||
vcmd->addKernelName(kernelName);
|
||||
//! Dispatches multiple AQL packets in a single batch operation
|
||||
bool dispatchAqlPacketBatch(const std::vector<uint8_t*>& packets,
|
||||
const std::vector<std::string>& kernelNames,
|
||||
amd::AccumulateCommand* vcmd = nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
/* Copyright (c) 2013 - 2023 Advanced Micro Devices, Inc.
|
||||
/* Copyright (c) 2013 - 2025 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -436,8 +436,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(hsa_signal_value_t init_va
|
||||
|
||||
// Peep signal +2 ahead to see if its done
|
||||
auto temp_id = (current_id_ + 2) % signal_list_.size();
|
||||
// If GPU is still busy with processing, then add more signals to avoid more frequent stalls
|
||||
if (hsa_signal_load_relaxed(signal_list_[temp_id]->signal_) > 0) {
|
||||
// If GPU is still busy with processing or if timestamps havent been saved out,
|
||||
// then add more signals to avoid more frequent stalls
|
||||
if (hsa_signal_load_relaxed(signal_list_[temp_id]->signal_) > 0 ||
|
||||
!signal_list_[temp_id]->flags_.done_) {
|
||||
std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
|
||||
if ((signal != nullptr) && CreateSignal(signal.get())) {
|
||||
// Find valid new index
|
||||
@@ -453,10 +455,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(hsa_signal_value_t init_va
|
||||
if (!new_signal) {
|
||||
// Find valid index
|
||||
++current_id_ %= signal_list_.size();
|
||||
|
||||
// Make sure the previous operation on the current signal is done
|
||||
WaitCurrent();
|
||||
|
||||
size_t next = (current_id_ + 1) % signal_list_.size();
|
||||
// Have to wait the next signal in the queue to avoid a race condition between
|
||||
// a GPU waiter(which may be not triggered yet) and CPU signal reset below
|
||||
WaitNext();
|
||||
@@ -1009,7 +1011,6 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui
|
||||
|
||||
// Check for queue full and wait if needed.
|
||||
uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1);
|
||||
uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
|
||||
fence_dirty_ = true;
|
||||
|
||||
if (addSystemScope_) {
|
||||
@@ -1055,16 +1056,16 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui
|
||||
current_signal->flags_.isPacketDispatch_ = true;
|
||||
}
|
||||
|
||||
|
||||
// Make sure the slot is free for usage
|
||||
while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= sw_queue_size) {
|
||||
amd::Os::yield();
|
||||
// Active spin - no yield
|
||||
}
|
||||
|
||||
// Add blocking command if the original value of read index was behind of the queue size.
|
||||
// Note: direct dispatch relies on the slot stall above to keep the forward progress
|
||||
// of the app if a dispatched kernel requires some CPU input for completion
|
||||
if (blocking || (!AMD_DIRECT_DISPATCH && (index - read) >= sw_queue_size)) {
|
||||
if (blocking || (!AMD_DIRECT_DISPATCH &&
|
||||
(index - hsa_queue_load_read_index_relaxed(gpu_queue_)) >= sw_queue_size)) {
|
||||
if (packet->completion_signal.handle == 0) {
|
||||
packet->completion_signal = Barriers().ActiveSignal();
|
||||
}
|
||||
@@ -1102,7 +1103,8 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernel_object,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernarg_address,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->completion_signal,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->reserved2, read, index);
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->reserved2,
|
||||
hsa_queue_load_read_index_scacquire(gpu_queue_), index);
|
||||
|
||||
hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index);
|
||||
|
||||
@@ -1158,34 +1160,229 @@ bool VirtualGPU::dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t he
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName,
|
||||
amd::AccumulateCommand* vcmd) {
|
||||
if (vcmd == nullptr) {
|
||||
template <typename AqlPacket>
|
||||
bool VirtualGPU::dispatchGenericAqlPacketBatch(const std::vector<AqlPacket*>& packets,
|
||||
bool blocking, bool attach_signal,
|
||||
const std::vector<std::string>* kernelNames) {
|
||||
if (packets.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
vcmd->addKernelName(kernelName);
|
||||
amd::ScopedLock lock(execution());
|
||||
const uint32_t queueSize = gpu_queue_->size;
|
||||
const uint32_t queueMask = queueSize - 1;
|
||||
const uint32_t sw_queue_size = queueMask;
|
||||
const size_t numPackets = packets.size();
|
||||
size_t kMaxBatchSize = DEBUG_HIP_GRAPH_BATCH_SIZE;
|
||||
const size_t kGpuLagPackets = 16;
|
||||
|
||||
// Staggered copy pattern: powers of 2 (1, 2, 4, 8.. to DEBUG_HIP_GRAPH_BATCH_SIZE
|
||||
size_t processedPackets = 0;
|
||||
size_t batchSize = 1;
|
||||
|
||||
while (processedPackets < numPackets) {
|
||||
uint64_t currentReadIndex = hsa_queue_load_read_index_scacquire(gpu_queue_);
|
||||
uint64_t currentWriteIndex = hsa_queue_load_write_index_relaxed(gpu_queue_);
|
||||
|
||||
if (currentWriteIndex - currentReadIndex >= kGpuLagPackets) {
|
||||
//GPU is busy, so we can copy more packets
|
||||
batchSize = DEBUG_HIP_GRAPH_BATCH_SIZE;
|
||||
}
|
||||
|
||||
// Process all remaining packets in one batch
|
||||
if (processedPackets + batchSize > numPackets) {
|
||||
batchSize = numPackets - processedPackets;
|
||||
}
|
||||
|
||||
// Check if we have enough space in the queue for this batch
|
||||
// If queue is full, reset batch size to 1 and wait
|
||||
if (currentWriteIndex + batchSize - currentReadIndex >= sw_queue_size) {
|
||||
batchSize = 1;
|
||||
}
|
||||
|
||||
// Now reserve space for the batch
|
||||
uint64_t startIndex = hsa_queue_add_write_index_screlease(gpu_queue_, batchSize);
|
||||
|
||||
// Make sure the slot is free for usage
|
||||
while ((startIndex - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= sw_queue_size) {
|
||||
// Active spin - no yield
|
||||
}
|
||||
|
||||
fence_dirty_ = true;
|
||||
|
||||
// Save header of first packet in this batch
|
||||
AqlPacket* firstPacket = packets[processedPackets];
|
||||
uint16_t firstPacketHeader = firstPacket->header;
|
||||
uint16_t firstPacketRest = firstPacket->setup;
|
||||
|
||||
// Process batchSize packets
|
||||
for (size_t i = 0; i < batchSize; ++i) {
|
||||
size_t packetIndex = processedPackets + i;
|
||||
uint64_t index = startIndex + i;
|
||||
|
||||
AqlPacket* packet = packets[packetIndex];
|
||||
uint16_t header = packet->header;
|
||||
|
||||
|
||||
bool attachSignal = timestamp_ != nullptr || attach_signal;
|
||||
|
||||
packet->completion_signal =
|
||||
Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, attachSignal);
|
||||
|
||||
if (std::is_same<decltype(packet), hsa_kernel_dispatch_packet_t*>::value &&
|
||||
timestamp_ != nullptr) {
|
||||
// If profiling is enabled, store the correlation ID in the dispatch packet
|
||||
if (amd::activity_prof::IsEnabled(OP_ID_DISPATCH)) {
|
||||
auto dispatchPacket = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet);
|
||||
dispatchPacket->reserved2 = timestamp_->command().profilingInfo().correlation_id_;
|
||||
}
|
||||
|
||||
ProfilingSignal* current_signal = Barriers().GetLastSignal();
|
||||
current_signal->flags_.isPacketDispatch_ = true;
|
||||
}
|
||||
|
||||
//Add blocking command if needed (only for the last packet)
|
||||
if (blocking && (packetIndex == numPackets - 1)) {
|
||||
if (packet->completion_signal.handle == 0) {
|
||||
packet->completion_signal = Barriers().ActiveSignal();
|
||||
}
|
||||
}
|
||||
|
||||
AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[index & queueMask];
|
||||
|
||||
// For first packet in batch, invalidate header before writing
|
||||
if (i == 0) {
|
||||
if (addSystemScope_) {
|
||||
// Add system scope on the acq on first packet
|
||||
firstPacketHeader &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
|
||||
firstPacketHeader |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
|
||||
}
|
||||
packet->header = (HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE);
|
||||
|
||||
// Copy the packet and then write the valid of the first packet
|
||||
*aql_loc = *packet;
|
||||
|
||||
// Restore the header of the first packet
|
||||
packet->header = firstPacketHeader;
|
||||
} else {
|
||||
// For the end packet in batch set flags
|
||||
if (i == batchSize - 1) {
|
||||
if (addSystemScope_) {
|
||||
// Add system scope on the release on last packet
|
||||
packet->header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
|
||||
packet->header |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
|
||||
addSystemScope_ = false;
|
||||
}
|
||||
auto expected_fence_state =
|
||||
extractAqlBits(packet->header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
|
||||
// Reset fence_dirty_ flag if we submit a packet with system scopes
|
||||
if (expected_fence_state == amd::Device::kCacheStateSystem) {
|
||||
fence_dirty_ = false;
|
||||
}
|
||||
fence_state_ = static_cast<Device::CacheState>(expected_fence_state);
|
||||
}
|
||||
|
||||
// Copy the packet to the queue
|
||||
*aql_loc = *packet;
|
||||
}
|
||||
|
||||
// Print kernel name for kernel dispatch packets
|
||||
if (kernelNames && packetIndex < kernelNames->size()) {
|
||||
uint8_t packetType =
|
||||
extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE);
|
||||
if (packetType == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL, "Graph shader name : %s",
|
||||
(*kernelNames)[packetIndex].c_str());
|
||||
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
|
||||
"SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = "
|
||||
"0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
|
||||
"setup=%d, grid=[%zu, %zu, %zu], workgroup=[%zu, %zu, %zu], "
|
||||
"private_seg_size=%zu, group_seg_size=%zu, kernel_obj=0x%zx, "
|
||||
"kernarg_address=0x%zx, completion_signal=0x%zx, correlation_id=%zu, "
|
||||
"rptr=%u, wptr=%u",
|
||||
gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header, packetType,
|
||||
extractAqlBits(header, HSA_PACKET_HEADER_BARRIER,
|
||||
HSA_PACKET_HEADER_WIDTH_BARRIER),
|
||||
extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
|
||||
extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
|
||||
packet->setup,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_x,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_y,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_z,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_x,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_y,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_z,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->private_segment_size,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->group_segment_size,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernel_object,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernarg_address,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->completion_signal,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->reserved2,
|
||||
hsa_queue_load_read_index_scacquire(gpu_queue_), index);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write valid header for the first packet in the batch
|
||||
AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[startIndex & queueMask];
|
||||
packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), firstPacketHeader, firstPacketRest);
|
||||
|
||||
// Ring doorbell for this batch
|
||||
hsa_signal_store_screlease(gpu_queue_->doorbell_signal, startIndex);
|
||||
|
||||
processedPackets += batchSize;
|
||||
|
||||
TrackQueueProgress(*packets[processedPackets - 1], startIndex + batchSize - 1);
|
||||
// Double the batch size for next iteration, cap at DEBUG_HIP_GRAPH_BATCH_SIZE
|
||||
if (batchSize < kMaxBatchSize) {
|
||||
batchSize *= 2;
|
||||
}
|
||||
}
|
||||
|
||||
// Mark the flag indicating if a dispatch is outstanding
|
||||
hasPendingDispatch_ = true;
|
||||
|
||||
// Wait on signal for the last packet if blocking
|
||||
if (blocking) {
|
||||
LogInfo("Runtime reached the AQL queue limit. SW is much ahead of HW. Blocking AQL queue!");
|
||||
if (!Barriers().WaitCurrent()) {
|
||||
LogPrintfError("Failed blocking queue wait with signal [0x%lx]",
|
||||
packets.back()->completion_signal.handle);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::dispatchAqlPacketBatch(const std::vector<uint8_t*>& packets,
|
||||
const std::vector<std::string>& kernelNames,
|
||||
amd::AccumulateCommand* vcmd) {
|
||||
if (vcmd == nullptr || packets.empty() || packets.size() != kernelNames.size()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
amd::ScopedLock lock(execution());
|
||||
profilingBegin(*vcmd);
|
||||
|
||||
dispatchBlockingWait();
|
||||
auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Graph shader name : %s", kernelName.c_str());
|
||||
|
||||
// The Aqlpacket with valid header will trigger the issue that AQL fill
|
||||
// the header before filling the body. However, the CP can handle the AQL package
|
||||
// after seeing the valid AQL header with the AQL package's body is NULL.
|
||||
// This patch fixes this potential issue that filling AQL header before
|
||||
// filling the AQL body.
|
||||
uint16_t packetHeader = packet->header;
|
||||
packet->header = (HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE);
|
||||
dispatchGenericAqlPacket(packet, packetHeader, packet->setup, false);
|
||||
packet->header = packetHeader;
|
||||
// Add all kernel names in bulk
|
||||
vcmd->addKernelNames(kernelNames);
|
||||
|
||||
// Dispatch all packets with a single doorbell ring
|
||||
// Cast packets vector to AQL packets vector on the fly
|
||||
const auto& aqlPackets =
|
||||
reinterpret_cast<const std::vector<hsa_kernel_dispatch_packet_t*>&>(packets);
|
||||
bool result = dispatchGenericAqlPacketBatch(aqlPackets, false, false, &kernelNames);
|
||||
|
||||
profilingEnd();
|
||||
|
||||
return true;
|
||||
return result;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
/* Copyright (c) 2008 - 2023 Advanced Micro Devices, Inc.
|
||||
/* Copyright (c) 2008 - 2025 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -468,16 +468,23 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//! Dispatches a barrier with blocking HSA signals
|
||||
void dispatchBlockingWait();
|
||||
|
||||
inline bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName,
|
||||
amd::AccumulateCommand* vcmd = nullptr);
|
||||
bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest,
|
||||
bool blocking = true, bool capturing = false,
|
||||
const uint8_t* aqlPacket = nullptr, bool attach_signal = false);
|
||||
bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest,
|
||||
bool blocking = true, bool attach_signal = false);
|
||||
|
||||
//! Dispatches multiple AQL packets in a single batch operation
|
||||
bool dispatchAqlPacketBatch(const std::vector<uint8_t*>& packets,
|
||||
const std::vector<std::string>& kernelNames,
|
||||
amd::AccumulateCommand* vcmd = nullptr);
|
||||
template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
|
||||
uint16_t rest, bool blocking,
|
||||
bool attach_signal = false);
|
||||
//! Dispatches multiple AQL packets with a single doorbell ring
|
||||
template <typename AqlPacket> bool dispatchGenericAqlPacketBatch(const std::vector<AqlPacket*>& packets,
|
||||
bool blocking, bool attach_signal = false,
|
||||
const std::vector<std::string>* kernelNames = nullptr);
|
||||
|
||||
bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion,
|
||||
bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
/* Copyright (c) 2010 - 2024 Advanced Micro Devices, Inc.
|
||||
/* Copyright (c) 2010 - 2025 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -1394,6 +1394,11 @@ class AccumulateCommand : public Command {
|
||||
//! Add kernel name to the list if available
|
||||
void addKernelName(const std::string& kernelName) { kernelNames_.push_back(kernelName); }
|
||||
|
||||
//! Add multiple kernel names in bulk
|
||||
void addKernelNames(const std::vector<std::string>& kernelNames) {
|
||||
kernelNames_.insert(kernelNames_.end(), kernelNames.begin(), kernelNames.end());
|
||||
}
|
||||
|
||||
//! Add kernel timestamp to the list if available
|
||||
void addTimestamps(uint64_t startTs, uint64_t endTs) {
|
||||
tsList_.push_back(std::make_pair(startTs, endTs));
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
/* Copyright (c) 2009 - 2021 Advanced Micro Devices, Inc.
|
||||
/* Copyright (c) 2009 - 2025 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -255,6 +255,8 @@ release(bool, DEBUG_HIP_FORCE_ASYNC_QUEUE, false, \
|
||||
"Forces grpahs into async queue mode. DEBUG_HIP_FORCE_GRAPH_QUEUES must be 1") \
|
||||
release(uint, DEBUG_HIP_FORCE_GRAPH_QUEUES, 4, \
|
||||
"Forces the number of streams for the graph parallel execution") \
|
||||
release(uint, DEBUG_HIP_GRAPH_BATCH_SIZE, 256, \
|
||||
"Number of graph nodes to batch at a time") \
|
||||
release(uint, DEBUG_HIP_BLOCK_SYNC, 50, \
|
||||
"Blocks synchronization on CPU until the callback processing is done")\
|
||||
release(uint, DEBUG_CLR_MAX_BATCH_SIZE, 1000, \
|
||||
|
||||
Ссылка в новой задаче
Block a user