*Lay foundation to batch packets efficiently for graphs
*Dynamically copy packets with max threshold set with
DEBUG_HIP_GRAPH_BATCH_SIZE, if not stagger packet copy with pow2
*Default threshold for DEBUG_HIP_GRAPH_BATCH_SIZE is 256
*If TS are not collected for a signal for reuse, create a new signal.
This can potentially increase signal footprint if the handler doesn't run
fast enough.
Этот коммит содержится в:
SaleelK
2025-09-18 15:02:10 -07:00
коммит произвёл GitHub
родитель 0cf70e85cf
Коммит 149dc17c90
8 изменённых файлов: 373 добавлений и 61 удалений
+82 -19
Просмотреть файл
@@ -1,4 +1,4 @@
/* Copyright (c) 2021 - 2021 Advanced Micro Devices, Inc.
/* Copyright (c) 2021 - 2025 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -394,9 +394,20 @@ void GraphExec::GetKernelArgSizeForGraph(size_t& kernArgSizeForGraph) {
}
// ================================================================================================
hipError_t GraphExec::AllocKernelArgForGraphNode() {
hipError_t GraphExec::CaptureAndFormPacketsForGraph() {
hipError_t status = hipSuccess;
for (auto& node : topoOrder_) {
// Clear previous capture status and batches
nodeCaptureStatus_.clear();
nodeCaptureStatus_.resize(topoOrder_.size(), false);
// Clear previous batches
packetBatches_.clear();
// Process nodes and create batches of consecutive captured nodes
for (size_t i = 0; i < topoOrder_.size(); ++i) {
auto& node = topoOrder_[i];
if (node->GetType() == hipGraphNodeTypeKernel) {
// Check if graph requires hidden heap and set as part of graphExec param.
static bool initialized = false;
@@ -405,19 +416,51 @@ hipError_t GraphExec::AllocKernelArgForGraphNode() {
initialized = true;
}
}
if (node->GraphCaptureEnabled()) {
status = node->CaptureAndFormPacket(GetKernelArgManager());
// Start of a potential batch - try to capture packets for this node
std::vector<uint8_t*> currentBatch;
std::vector<std::string> currentKernelNames;
// Collect packets from consecutive captured nodes
size_t j = i;
size_t capturedNodeCount = 0;
while (j < topoOrder_.size() && topoOrder_[j]->GraphCaptureEnabled()) {
auto& currentNode = topoOrder_[j];
status = currentNode->CaptureAndFormPacket(GetKernelArgManager(), &currentBatch,
&currentKernelNames);
if (status != hipSuccess || currentBatch.empty()) {
LogError("Packet capture failed");
return status;
}
// Mark this node as successfully captured
nodeCaptureStatus_[j] = true;
++j;
++capturedNodeCount;
}
// Add the batch if it has packets
if (!currentBatch.empty()) {
packetBatches_.emplace_back(std::move(currentBatch), std::move(currentKernelNames),
capturedNodeCount);
}
// Skip the nodes we just processed, the index will be incremented by the loop
i = j - 1;
} else if (node->GetType() == hipGraphNodeTypeGraph) {
auto childNode = reinterpret_cast<hip::ChildGraphNode*>(node);
if (childNode->GetChildGraph()->max_streams_ == 1) {
childNode->SetGraphCaptureStatus(true);
status = childNode->AllocKernelArgForGraphNode();
status = childNode->CaptureAndFormPacketsForGraph();
nodeCaptureStatus_[i] = (status == hipSuccess);
if (status != hipSuccess) {
return status;
status = hipSuccess; // Continue with other nodes
}
}
}
}
return status;
}
@@ -427,8 +470,8 @@ hipError_t GraphExec::CaptureAQLPackets() {
size_t kernArgSizeForGraph = 0;
GetKernelArgSizeForGraph(kernArgSizeForGraph);
// When we support multi device graph lauch we need to allocate the kenel args on respective
// device for each kernel Assume graph has nodes of same device allocate kernel args on the device
// from the first node
// device for each kernel Assume graph has nodes of same device allocate kernel args on the
// device from the first node
auto device = g_devices[topoOrder_[0]->GetDeviceId()]->devices()[0];
// Add a larger initial pool to accomodate for any updates to kernel args
bool bStatus =
@@ -437,10 +480,11 @@ hipError_t GraphExec::CaptureAQLPackets() {
return hipErrorMemoryAllocation;
}
status = AllocKernelArgForGraphNode();
status = CaptureAndFormPacketsForGraph();
if (status != hipSuccess) {
return status;
}
kernArgManager_->ReadBackOrFlush();
return status;
}
@@ -471,18 +515,37 @@ hipError_t GraphExec::EnqueueGraphWithSingleList(hip::Stream* hip_stream) {
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr);
}
for (int i = 0; i < topoOrder_.size(); i++) {
if (topoOrder_[i]->GraphCaptureEnabled()) {
if (topoOrder_[i]->GetEnabled()) {
std::vector<uint8_t*>& gpuPackets = topoOrder_[i]->GetAqlPackets();
for (auto& packet : gpuPackets) {
hip_stream->vdev()->dispatchAqlPacket(packet, topoOrder_[i]->GetKernelName(), accumulate);
size_t batchIndex = 0;
// Process nodes in topological order with mixed execution strategy
for (size_t i = 0; i < topoOrder_.size(); ++i) {
auto& node = topoOrder_[i];
if (!node->GraphCaptureEnabled()) {
// Node doesn't support capture - execute individually
node->SetStream(hip_stream);
status = node->CreateCommand(node->GetQueue());
node->EnqueueCommands(hip_stream);
} else if (i < nodeCaptureStatus_.size() && nodeCaptureStatus_[i]) {
// Node was successfully captured - find which batch it belongs to
// and dispatch the entire batch
if (batchIndex < packetBatches_.size()) {
// Dispatch this batch
bool batchStatus = hip_stream->vdev()->dispatchAqlPacketBatch(
packetBatches_[batchIndex].packets, packetBatches_[batchIndex].kernelNames, accumulate);
if (!batchStatus) {
status = hipErrorUnknown;
accumulate->release();
return status;
}
// Skip all consecutive captured nodes that belong to this batch
// Use the tracked node count to skip directly instead of parsing one by one
i += packetBatches_[batchIndex].capturedNodeCount - 1; // -1 because loop will increment
++batchIndex;
}
} else {
topoOrder_[i]->SetStream(hip_stream);
status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
topoOrder_[i]->EnqueueCommands(hip_stream);
}
}
+39 -3
Просмотреть файл
@@ -1,4 +1,4 @@
/* Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc.
/* Copyright (c) 2021 - 2025 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -245,15 +245,22 @@ class GraphNode : public hipGraphNodeDOTAttribute {
size_t GetKerArgSize() const { return alignedKernArgSize_; }
size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; }
size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; }
hipError_t CaptureAndFormPacket(GraphKernelArgManager* kernArgMgr) {
//! Capture packets and accumulate them into a batch if provided
hipError_t CaptureAndFormPacket(GraphKernelArgManager* kernArgMgr,
std::vector<uint8_t*>* batchPackets = nullptr,
std::vector<std::string>* batchKernelNames = nullptr) {
auto capture_stream = hip::getNullStream(g_devices[dev_id_]->devices()[0]->context(), false);
hipError_t status = CreateCommand(capture_stream);
if (status != hipSuccess) {
return status;
}
// Release last created packet memory before they are overwritten with new packets
std::for_each(gpuPackets_.begin(), gpuPackets_.end(), [](auto p) { delete[] p; });
// Clear the pointer array
gpuPackets_.clear();
for (auto& command : commands_) {
command->setPktCapturingState(true, &gpuPackets_, kernArgMgr, &capturedKernelName_);
// Enqueue command to capture GPU Packet. The packet is not submitted to the device.
@@ -261,6 +268,15 @@ class GraphNode : public hipGraphNodeDOTAttribute {
command->submit(*(command->queue())->vdev());
command->release();
}
// Accumulate packets directly into the batch (only if batch vectors are provided)
if (batchPackets != nullptr && batchKernelNames != nullptr) {
for (auto& packet : gpuPackets_) {
batchPackets->push_back(packet);
batchKernelNames->push_back(capturedKernelName_);
}
}
// Commands are captured and released. Clear them from the object.
commands_.clear();
@@ -814,6 +830,9 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
if (instantiateDeviceId_ != -1) {
static_cast<ReferenceCountedObject*>(g_devices[instantiateDeviceId_])->release();
}
packetBatches_.clear();
nodeCaptureStatus_.clear();
}
Node GetClonedNode(Node node) {
@@ -849,7 +868,7 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
}
GraphKernelArgManager* GetKernelArgManager() { return kernArgManager_; }
static void DecrementRefCount(cl_event event, cl_int command_exec_status, void* user_data);
hipError_t AllocKernelArgForGraphNode();
hipError_t CaptureAndFormPacketsForGraph();
void GetKernelArgSizeForGraph(size_t& kernArgSizeForGraph);
hipError_t EnqueueGraphWithSingleList(hip::Stream* hip_stream);
bool TopologicalOrder() { return Graph::TopologicalOrder(topoOrder_); }
@@ -863,6 +882,23 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
int instantiateDeviceId_ = -1;
bool hasHiddenHeap_ = false; //!< Hidden heap indicator for Kernel node
bool repeatLaunch_ = false;
//! Structure for batch dispatch optimization - packets and kernel names in aligned memory
struct PacketBatch {
std::vector<uint8_t*> packets;
std::vector<std::string> kernelNames;
size_t capturedNodeCount; // Number of consecutive captured nodes in this batch
PacketBatch() : capturedNodeCount(0) {}
PacketBatch(std::vector<uint8_t*>&& p, std::vector<std::string>&& k, size_t nodeCount)
: packets(std::move(p)), kernelNames(std::move(k)), capturedNodeCount(nodeCount) {}
};
//! Batches of accumulated packets and kernel names for batch dispatch optimization
//! Each batch contains packets from consecutive captured nodes
std::vector<PacketBatch> packetBatches_;
//! Track which nodes were successfully captured (true) vs need individual execution (false)
std::vector<bool> nodeCaptureStatus_;
};
class ChildGraphNode : public GraphNode, public GraphExec {
+5 -4
Просмотреть файл
@@ -1,4 +1,4 @@
/* Copyright (c) 2008 - 2023 Advanced Micro Devices, Inc.
/* Copyright (c) 2008 - 2025 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -1351,10 +1351,11 @@ class VirtualDevice : public amd::HeapObject {
virtual bool isFenceDirty() const = 0;
//! Init hidden heap for device memory allocations
virtual void HiddenHeapInit() = 0;
//! Dispatch captured AQL packet
virtual bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName,
amd::AccumulateCommand* vcmd = nullptr) = 0;
//! Dispatches multiple AQL packets in a single batch operation
virtual bool dispatchAqlPacketBatch(const std::vector<uint8_t*>& packets,
const std::vector<std::string>& kernelNames,
amd::AccumulateCommand* vcmd = nullptr) = 0 ;
//! Returns the number of outstanding HSA async handlers
std::atomic<uint64_t>& QueuedAsyncHandlers() const { return queued_async_handlers_; }
+5 -4
Просмотреть файл
@@ -1,4 +1,4 @@
/* Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc.
/* Copyright (c) 2015 - 2025 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -355,9 +355,10 @@ class VirtualGPU : public device::VirtualDevice {
void HiddenHeapInit() {}
inline bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName,
amd::AccumulateCommand* vcmd = nullptr) {
vcmd->addKernelName(kernelName);
//! Dispatches multiple AQL packets in a single batch operation
bool dispatchAqlPacketBatch(const std::vector<uint8_t*>& packets,
const std::vector<std::string>& kernelNames,
amd::AccumulateCommand* vcmd = nullptr) {
return false;
}
+223 -26
Просмотреть файл
@@ -1,4 +1,4 @@
/* Copyright (c) 2013 - 2023 Advanced Micro Devices, Inc.
/* Copyright (c) 2013 - 2025 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -436,8 +436,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(hsa_signal_value_t init_va
// Peep signal +2 ahead to see if its done
auto temp_id = (current_id_ + 2) % signal_list_.size();
// If GPU is still busy with processing, then add more signals to avoid more frequent stalls
if (hsa_signal_load_relaxed(signal_list_[temp_id]->signal_) > 0) {
// If GPU is still busy with processing or if timestamps havent been saved out,
// then add more signals to avoid more frequent stalls
if (hsa_signal_load_relaxed(signal_list_[temp_id]->signal_) > 0 ||
!signal_list_[temp_id]->flags_.done_) {
std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
if ((signal != nullptr) && CreateSignal(signal.get())) {
// Find valid new index
@@ -453,10 +455,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(hsa_signal_value_t init_va
if (!new_signal) {
// Find valid index
++current_id_ %= signal_list_.size();
// Make sure the previous operation on the current signal is done
WaitCurrent();
size_t next = (current_id_ + 1) % signal_list_.size();
// Have to wait the next signal in the queue to avoid a race condition between
// a GPU waiter(which may be not triggered yet) and CPU signal reset below
WaitNext();
@@ -1009,7 +1011,6 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui
// Check for queue full and wait if needed.
uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1);
uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
fence_dirty_ = true;
if (addSystemScope_) {
@@ -1055,16 +1056,16 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui
current_signal->flags_.isPacketDispatch_ = true;
}
// Make sure the slot is free for usage
while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= sw_queue_size) {
amd::Os::yield();
// Active spin - no yield
}
// Add blocking command if the original value of read index was behind of the queue size.
// Note: direct dispatch relies on the slot stall above to keep the forward progress
// of the app if a dispatched kernel requires some CPU input for completion
if (blocking || (!AMD_DIRECT_DISPATCH && (index - read) >= sw_queue_size)) {
if (blocking || (!AMD_DIRECT_DISPATCH &&
(index - hsa_queue_load_read_index_relaxed(gpu_queue_)) >= sw_queue_size)) {
if (packet->completion_signal.handle == 0) {
packet->completion_signal = Barriers().ActiveSignal();
}
@@ -1102,7 +1103,8 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernel_object,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernarg_address,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->completion_signal,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->reserved2, read, index);
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->reserved2,
hsa_queue_load_read_index_scacquire(gpu_queue_), index);
hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index);
@@ -1158,34 +1160,229 @@ bool VirtualGPU::dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t he
}
// ================================================================================================
inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName,
amd::AccumulateCommand* vcmd) {
if (vcmd == nullptr) {
template <typename AqlPacket>
bool VirtualGPU::dispatchGenericAqlPacketBatch(const std::vector<AqlPacket*>& packets,
bool blocking, bool attach_signal,
const std::vector<std::string>* kernelNames) {
if (packets.empty()) {
return false;
}
vcmd->addKernelName(kernelName);
amd::ScopedLock lock(execution());
const uint32_t queueSize = gpu_queue_->size;
const uint32_t queueMask = queueSize - 1;
const uint32_t sw_queue_size = queueMask;
const size_t numPackets = packets.size();
size_t kMaxBatchSize = DEBUG_HIP_GRAPH_BATCH_SIZE;
const size_t kGpuLagPackets = 16;
// Staggered copy pattern: powers of 2 (1, 2, 4, 8.. to DEBUG_HIP_GRAPH_BATCH_SIZE
size_t processedPackets = 0;
size_t batchSize = 1;
while (processedPackets < numPackets) {
uint64_t currentReadIndex = hsa_queue_load_read_index_scacquire(gpu_queue_);
uint64_t currentWriteIndex = hsa_queue_load_write_index_relaxed(gpu_queue_);
if (currentWriteIndex - currentReadIndex >= kGpuLagPackets) {
//GPU is busy, so we can copy more packets
batchSize = DEBUG_HIP_GRAPH_BATCH_SIZE;
}
// Process all remaining packets in one batch
if (processedPackets + batchSize > numPackets) {
batchSize = numPackets - processedPackets;
}
// Check if we have enough space in the queue for this batch
// If queue is full, reset batch size to 1 and wait
if (currentWriteIndex + batchSize - currentReadIndex >= sw_queue_size) {
batchSize = 1;
}
// Now reserve space for the batch
uint64_t startIndex = hsa_queue_add_write_index_screlease(gpu_queue_, batchSize);
// Make sure the slot is free for usage
while ((startIndex - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= sw_queue_size) {
// Active spin - no yield
}
fence_dirty_ = true;
// Save header of first packet in this batch
AqlPacket* firstPacket = packets[processedPackets];
uint16_t firstPacketHeader = firstPacket->header;
uint16_t firstPacketRest = firstPacket->setup;
// Process batchSize packets
for (size_t i = 0; i < batchSize; ++i) {
size_t packetIndex = processedPackets + i;
uint64_t index = startIndex + i;
AqlPacket* packet = packets[packetIndex];
uint16_t header = packet->header;
bool attachSignal = timestamp_ != nullptr || attach_signal;
packet->completion_signal =
Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, attachSignal);
if (std::is_same<decltype(packet), hsa_kernel_dispatch_packet_t*>::value &&
timestamp_ != nullptr) {
// If profiling is enabled, store the correlation ID in the dispatch packet
if (amd::activity_prof::IsEnabled(OP_ID_DISPATCH)) {
auto dispatchPacket = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet);
dispatchPacket->reserved2 = timestamp_->command().profilingInfo().correlation_id_;
}
ProfilingSignal* current_signal = Barriers().GetLastSignal();
current_signal->flags_.isPacketDispatch_ = true;
}
//Add blocking command if needed (only for the last packet)
if (blocking && (packetIndex == numPackets - 1)) {
if (packet->completion_signal.handle == 0) {
packet->completion_signal = Barriers().ActiveSignal();
}
}
AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[index & queueMask];
// For first packet in batch, invalidate header before writing
if (i == 0) {
if (addSystemScope_) {
// Add system scope on the acq on first packet
firstPacketHeader &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
firstPacketHeader |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
}
packet->header = (HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE);
// Copy the packet and then write the valid of the first packet
*aql_loc = *packet;
// Restore the header of the first packet
packet->header = firstPacketHeader;
} else {
// For the end packet in batch set flags
if (i == batchSize - 1) {
if (addSystemScope_) {
// Add system scope on the release on last packet
packet->header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
packet->header |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
addSystemScope_ = false;
}
auto expected_fence_state =
extractAqlBits(packet->header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
// Reset fence_dirty_ flag if we submit a packet with system scopes
if (expected_fence_state == amd::Device::kCacheStateSystem) {
fence_dirty_ = false;
}
fence_state_ = static_cast<Device::CacheState>(expected_fence_state);
}
// Copy the packet to the queue
*aql_loc = *packet;
}
// Print kernel name for kernel dispatch packets
if (kernelNames && packetIndex < kernelNames->size()) {
uint8_t packetType =
extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE);
if (packetType == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL, "Graph shader name : %s",
(*kernelNames)[packetIndex].c_str());
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
"SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = "
"0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
"setup=%d, grid=[%zu, %zu, %zu], workgroup=[%zu, %zu, %zu], "
"private_seg_size=%zu, group_seg_size=%zu, kernel_obj=0x%zx, "
"kernarg_address=0x%zx, completion_signal=0x%zx, correlation_id=%zu, "
"rptr=%u, wptr=%u",
gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header, packetType,
extractAqlBits(header, HSA_PACKET_HEADER_BARRIER,
HSA_PACKET_HEADER_WIDTH_BARRIER),
extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
packet->setup,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_x,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_y,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_z,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_x,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_y,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_z,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->private_segment_size,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->group_segment_size,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernel_object,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernarg_address,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->completion_signal,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->reserved2,
hsa_queue_load_read_index_scacquire(gpu_queue_), index);
}
}
}
// Write valid header for the first packet in the batch
AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[startIndex & queueMask];
packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), firstPacketHeader, firstPacketRest);
// Ring doorbell for this batch
hsa_signal_store_screlease(gpu_queue_->doorbell_signal, startIndex);
processedPackets += batchSize;
TrackQueueProgress(*packets[processedPackets - 1], startIndex + batchSize - 1);
// Double the batch size for next iteration, cap at DEBUG_HIP_GRAPH_BATCH_SIZE
if (batchSize < kMaxBatchSize) {
batchSize *= 2;
}
}
// Mark the flag indicating if a dispatch is outstanding
hasPendingDispatch_ = true;
// Wait on signal for the last packet if blocking
if (blocking) {
LogInfo("Runtime reached the AQL queue limit. SW is much ahead of HW. Blocking AQL queue!");
if (!Barriers().WaitCurrent()) {
LogPrintfError("Failed blocking queue wait with signal [0x%lx]",
packets.back()->completion_signal.handle);
return false;
}
}
return true;
}
// ================================================================================================
bool VirtualGPU::dispatchAqlPacketBatch(const std::vector<uint8_t*>& packets,
const std::vector<std::string>& kernelNames,
amd::AccumulateCommand* vcmd) {
if (vcmd == nullptr || packets.empty() || packets.size() != kernelNames.size()) {
return false;
}
amd::ScopedLock lock(execution());
profilingBegin(*vcmd);
dispatchBlockingWait();
auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);
ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Graph shader name : %s", kernelName.c_str());
// The Aqlpacket with valid header will trigger the issue that AQL fill
// the header before filling the body. However, the CP can handle the AQL package
// after seeing the valid AQL header with the AQL package's body is NULL.
// This patch fixes this potential issue that filling AQL header before
// filling the AQL body.
uint16_t packetHeader = packet->header;
packet->header = (HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE);
dispatchGenericAqlPacket(packet, packetHeader, packet->setup, false);
packet->header = packetHeader;
// Add all kernel names in bulk
vcmd->addKernelNames(kernelNames);
// Dispatch all packets with a single doorbell ring
// Cast packets vector to AQL packets vector on the fly
const auto& aqlPackets =
reinterpret_cast<const std::vector<hsa_kernel_dispatch_packet_t*>&>(packets);
bool result = dispatchGenericAqlPacketBatch(aqlPackets, false, false, &kernelNames);
profilingEnd();
return true;
return result;
}
// ================================================================================================
+10 -3
Просмотреть файл
@@ -1,4 +1,4 @@
/* Copyright (c) 2008 - 2023 Advanced Micro Devices, Inc.
/* Copyright (c) 2008 - 2025 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -468,16 +468,23 @@ class VirtualGPU : public device::VirtualDevice {
//! Dispatches a barrier with blocking HSA signals
void dispatchBlockingWait();
inline bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName,
amd::AccumulateCommand* vcmd = nullptr);
bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest,
bool blocking = true, bool capturing = false,
const uint8_t* aqlPacket = nullptr, bool attach_signal = false);
bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest,
bool blocking = true, bool attach_signal = false);
//! Dispatches multiple AQL packets in a single batch operation
bool dispatchAqlPacketBatch(const std::vector<uint8_t*>& packets,
const std::vector<std::string>& kernelNames,
amd::AccumulateCommand* vcmd = nullptr);
template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
uint16_t rest, bool blocking,
bool attach_signal = false);
//! Dispatches multiple AQL packets with a single doorbell ring
template <typename AqlPacket> bool dispatchGenericAqlPacketBatch(const std::vector<AqlPacket*>& packets,
bool blocking, bool attach_signal = false,
const std::vector<std::string>* kernelNames = nullptr);
bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion,
bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
+6 -1
Просмотреть файл
@@ -1,4 +1,4 @@
/* Copyright (c) 2010 - 2024 Advanced Micro Devices, Inc.
/* Copyright (c) 2010 - 2025 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -1394,6 +1394,11 @@ class AccumulateCommand : public Command {
//! Add kernel name to the list if available
void addKernelName(const std::string& kernelName) { kernelNames_.push_back(kernelName); }
//! Add multiple kernel names in bulk
void addKernelNames(const std::vector<std::string>& kernelNames) {
kernelNames_.insert(kernelNames_.end(), kernelNames.begin(), kernelNames.end());
}
//! Add kernel timestamp to the list if available
void addTimestamps(uint64_t startTs, uint64_t endTs) {
tsList_.push_back(std::make_pair(startTs, endTs));
+3 -1
Просмотреть файл
@@ -1,4 +1,4 @@
/* Copyright (c) 2009 - 2021 Advanced Micro Devices, Inc.
/* Copyright (c) 2009 - 2025 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -255,6 +255,8 @@ release(bool, DEBUG_HIP_FORCE_ASYNC_QUEUE, false, \
"Forces grpahs into async queue mode. DEBUG_HIP_FORCE_GRAPH_QUEUES must be 1") \
release(uint, DEBUG_HIP_FORCE_GRAPH_QUEUES, 4, \
"Forces the number of streams for the graph parallel execution") \
release(uint, DEBUG_HIP_GRAPH_BATCH_SIZE, 256, \
"Number of graph nodes to batch at a time") \
release(uint, DEBUG_HIP_BLOCK_SYNC, 50, \
"Blocks synchronization on CPU until the callback processing is done")\
release(uint, DEBUG_CLR_MAX_BATCH_SIZE, 1000, \