diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp index c64523b3dc..b23521e1fb 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.cpp +++ b/projects/clr/hipamd/src/hip_graph_internal.cpp @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 - 2021 Advanced Micro Devices, Inc. +/* Copyright (c) 2021 - 2025 Advanced Micro Devices, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -394,9 +394,20 @@ void GraphExec::GetKernelArgSizeForGraph(size_t& kernArgSizeForGraph) { } // ================================================================================================ -hipError_t GraphExec::AllocKernelArgForGraphNode() { +hipError_t GraphExec::CaptureAndFormPacketsForGraph() { hipError_t status = hipSuccess; - for (auto& node : topoOrder_) { + + // Clear previous capture status and batches + nodeCaptureStatus_.clear(); + nodeCaptureStatus_.resize(topoOrder_.size(), false); + + // Clear previous batches + packetBatches_.clear(); + + // Process nodes and create batches of consecutive captured nodes + for (size_t i = 0; i < topoOrder_.size(); ++i) { + auto& node = topoOrder_[i]; + if (node->GetType() == hipGraphNodeTypeKernel) { // Check if graph requires hidden heap and set as part of graphExec param. static bool initialized = false; @@ -405,19 +416,51 @@ hipError_t GraphExec::AllocKernelArgForGraphNode() { initialized = true; } } + if (node->GraphCaptureEnabled()) { - status = node->CaptureAndFormPacket(GetKernelArgManager()); + // Start of a potential batch - try to capture packets for this node + std::vector currentBatch; + std::vector currentKernelNames; + + // Collect packets from consecutive captured nodes + size_t j = i; + size_t capturedNodeCount = 0; + while (j < topoOrder_.size() && topoOrder_[j]->GraphCaptureEnabled()) { + auto& currentNode = topoOrder_[j]; + status = currentNode->CaptureAndFormPacket(GetKernelArgManager(), ¤tBatch, + ¤tKernelNames); + + if (status != hipSuccess || currentBatch.empty()) { + LogError("Packet capture failed"); + return status; + } + // Mark this node as successfully captured + nodeCaptureStatus_[j] = true; + ++j; + ++capturedNodeCount; + } + + // Add the batch if it has packets + if (!currentBatch.empty()) { + packetBatches_.emplace_back(std::move(currentBatch), std::move(currentKernelNames), + capturedNodeCount); + } + + // Skip the nodes we just processed, the index will be incremented by the loop + i = j - 1; } else if (node->GetType() == hipGraphNodeTypeGraph) { auto childNode = reinterpret_cast(node); if (childNode->GetChildGraph()->max_streams_ == 1) { childNode->SetGraphCaptureStatus(true); - status = childNode->AllocKernelArgForGraphNode(); + status = childNode->CaptureAndFormPacketsForGraph(); + nodeCaptureStatus_[i] = (status == hipSuccess); if (status != hipSuccess) { - return status; + status = hipSuccess; // Continue with other nodes } } } } + return status; } @@ -427,8 +470,8 @@ hipError_t GraphExec::CaptureAQLPackets() { size_t kernArgSizeForGraph = 0; GetKernelArgSizeForGraph(kernArgSizeForGraph); // When we support multi device graph lauch we need to allocate the kenel args on respective - // device for each kernel Assume graph has nodes of same device allocate kernel args on the device - // from the first node + // device for each kernel Assume graph has nodes of same device allocate kernel args on the + // device from the first node auto device = g_devices[topoOrder_[0]->GetDeviceId()]->devices()[0]; // Add a larger initial pool to accomodate for any updates to kernel args bool bStatus = @@ -437,10 +480,11 @@ hipError_t GraphExec::CaptureAQLPackets() { return hipErrorMemoryAllocation; } - status = AllocKernelArgForGraphNode(); + status = CaptureAndFormPacketsForGraph(); if (status != hipSuccess) { return status; } + kernArgManager_->ReadBackOrFlush(); return status; } @@ -471,18 +515,37 @@ hipError_t GraphExec::EnqueueGraphWithSingleList(hip::Stream* hip_stream) { if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr); } - for (int i = 0; i < topoOrder_.size(); i++) { - if (topoOrder_[i]->GraphCaptureEnabled()) { - if (topoOrder_[i]->GetEnabled()) { - std::vector& gpuPackets = topoOrder_[i]->GetAqlPackets(); - for (auto& packet : gpuPackets) { - hip_stream->vdev()->dispatchAqlPacket(packet, topoOrder_[i]->GetKernelName(), accumulate); + + size_t batchIndex = 0; + + // Process nodes in topological order with mixed execution strategy + for (size_t i = 0; i < topoOrder_.size(); ++i) { + auto& node = topoOrder_[i]; + + if (!node->GraphCaptureEnabled()) { + // Node doesn't support capture - execute individually + node->SetStream(hip_stream); + status = node->CreateCommand(node->GetQueue()); + node->EnqueueCommands(hip_stream); + } else if (i < nodeCaptureStatus_.size() && nodeCaptureStatus_[i]) { + // Node was successfully captured - find which batch it belongs to + // and dispatch the entire batch + if (batchIndex < packetBatches_.size()) { + // Dispatch this batch + bool batchStatus = hip_stream->vdev()->dispatchAqlPacketBatch( + packetBatches_[batchIndex].packets, packetBatches_[batchIndex].kernelNames, accumulate); + if (!batchStatus) { + status = hipErrorUnknown; + accumulate->release(); + return status; } + + // Skip all consecutive captured nodes that belong to this batch + // Use the tracked node count to skip directly instead of parsing one by one + i += packetBatches_[batchIndex].capturedNodeCount - 1; // -1 because loop will increment + + ++batchIndex; } - } else { - topoOrder_[i]->SetStream(hip_stream); - status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue()); - topoOrder_[i]->EnqueueCommands(hip_stream); } } diff --git a/projects/clr/hipamd/src/hip_graph_internal.hpp b/projects/clr/hipamd/src/hip_graph_internal.hpp index babcb5f87c..c6767508c9 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.hpp +++ b/projects/clr/hipamd/src/hip_graph_internal.hpp @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. +/* Copyright (c) 2021 - 2025 Advanced Micro Devices, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -245,15 +245,22 @@ class GraphNode : public hipGraphNodeDOTAttribute { size_t GetKerArgSize() const { return alignedKernArgSize_; } size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; } size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; } - hipError_t CaptureAndFormPacket(GraphKernelArgManager* kernArgMgr) { + + //! Capture packets and accumulate them into a batch if provided + hipError_t CaptureAndFormPacket(GraphKernelArgManager* kernArgMgr, + std::vector* batchPackets = nullptr, + std::vector* batchKernelNames = nullptr) { auto capture_stream = hip::getNullStream(g_devices[dev_id_]->devices()[0]->context(), false); hipError_t status = CreateCommand(capture_stream); if (status != hipSuccess) { return status; } + // Release last created packet memory before they are overwritten with new packets std::for_each(gpuPackets_.begin(), gpuPackets_.end(), [](auto p) { delete[] p; }); + // Clear the pointer array gpuPackets_.clear(); + for (auto& command : commands_) { command->setPktCapturingState(true, &gpuPackets_, kernArgMgr, &capturedKernelName_); // Enqueue command to capture GPU Packet. The packet is not submitted to the device. @@ -261,6 +268,15 @@ class GraphNode : public hipGraphNodeDOTAttribute { command->submit(*(command->queue())->vdev()); command->release(); } + + // Accumulate packets directly into the batch (only if batch vectors are provided) + if (batchPackets != nullptr && batchKernelNames != nullptr) { + for (auto& packet : gpuPackets_) { + batchPackets->push_back(packet); + batchKernelNames->push_back(capturedKernelName_); + } + } + // Commands are captured and released. Clear them from the object. commands_.clear(); @@ -814,6 +830,9 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph { if (instantiateDeviceId_ != -1) { static_cast(g_devices[instantiateDeviceId_])->release(); } + + packetBatches_.clear(); + nodeCaptureStatus_.clear(); } Node GetClonedNode(Node node) { @@ -849,7 +868,7 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph { } GraphKernelArgManager* GetKernelArgManager() { return kernArgManager_; } static void DecrementRefCount(cl_event event, cl_int command_exec_status, void* user_data); - hipError_t AllocKernelArgForGraphNode(); + hipError_t CaptureAndFormPacketsForGraph(); void GetKernelArgSizeForGraph(size_t& kernArgSizeForGraph); hipError_t EnqueueGraphWithSingleList(hip::Stream* hip_stream); bool TopologicalOrder() { return Graph::TopologicalOrder(topoOrder_); } @@ -863,6 +882,23 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph { int instantiateDeviceId_ = -1; bool hasHiddenHeap_ = false; //!< Hidden heap indicator for Kernel node bool repeatLaunch_ = false; + + //! Structure for batch dispatch optimization - packets and kernel names in aligned memory + struct PacketBatch { + std::vector packets; + std::vector kernelNames; + size_t capturedNodeCount; // Number of consecutive captured nodes in this batch + + PacketBatch() : capturedNodeCount(0) {} + PacketBatch(std::vector&& p, std::vector&& k, size_t nodeCount) + : packets(std::move(p)), kernelNames(std::move(k)), capturedNodeCount(nodeCount) {} + }; + + //! Batches of accumulated packets and kernel names for batch dispatch optimization + //! Each batch contains packets from consecutive captured nodes + std::vector packetBatches_; + //! Track which nodes were successfully captured (true) vs need individual execution (false) + std::vector nodeCaptureStatus_; }; class ChildGraphNode : public GraphNode, public GraphExec { diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index c6660bc30e..25d7ea9825 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -1,4 +1,4 @@ -/* Copyright (c) 2008 - 2023 Advanced Micro Devices, Inc. +/* Copyright (c) 2008 - 2025 Advanced Micro Devices, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -1351,10 +1351,11 @@ class VirtualDevice : public amd::HeapObject { virtual bool isFenceDirty() const = 0; //! Init hidden heap for device memory allocations virtual void HiddenHeapInit() = 0; - //! Dispatch captured AQL packet - virtual bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName, - amd::AccumulateCommand* vcmd = nullptr) = 0; + //! Dispatches multiple AQL packets in a single batch operation + virtual bool dispatchAqlPacketBatch(const std::vector& packets, + const std::vector& kernelNames, + amd::AccumulateCommand* vcmd = nullptr) = 0 ; //! Returns the number of outstanding HSA async handlers std::atomic& QueuedAsyncHandlers() const { return queued_async_handlers_; } diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp index dfb0b63632..115d929719 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/device/pal/palvirtual.hpp @@ -1,4 +1,4 @@ -/* Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc. +/* Copyright (c) 2015 - 2025 Advanced Micro Devices, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -355,9 +355,10 @@ class VirtualGPU : public device::VirtualDevice { void HiddenHeapInit() {} - inline bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName, - amd::AccumulateCommand* vcmd = nullptr) { - vcmd->addKernelName(kernelName); + //! Dispatches multiple AQL packets in a single batch operation + bool dispatchAqlPacketBatch(const std::vector& packets, + const std::vector& kernelNames, + amd::AccumulateCommand* vcmd = nullptr) { return false; } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 8dbdcd794d..89153e20fc 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -1,4 +1,4 @@ -/* Copyright (c) 2013 - 2023 Advanced Micro Devices, Inc. +/* Copyright (c) 2013 - 2025 Advanced Micro Devices, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -436,8 +436,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(hsa_signal_value_t init_va // Peep signal +2 ahead to see if its done auto temp_id = (current_id_ + 2) % signal_list_.size(); - // If GPU is still busy with processing, then add more signals to avoid more frequent stalls - if (hsa_signal_load_relaxed(signal_list_[temp_id]->signal_) > 0) { + // If GPU is still busy with processing or if timestamps havent been saved out, + // then add more signals to avoid more frequent stalls + if (hsa_signal_load_relaxed(signal_list_[temp_id]->signal_) > 0 || + !signal_list_[temp_id]->flags_.done_) { std::unique_ptr signal(new ProfilingSignal()); if ((signal != nullptr) && CreateSignal(signal.get())) { // Find valid new index @@ -453,10 +455,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(hsa_signal_value_t init_va if (!new_signal) { // Find valid index ++current_id_ %= signal_list_.size(); - // Make sure the previous operation on the current signal is done WaitCurrent(); + size_t next = (current_id_ + 1) % signal_list_.size(); // Have to wait the next signal in the queue to avoid a race condition between // a GPU waiter(which may be not triggered yet) and CPU signal reset below WaitNext(); @@ -1009,7 +1011,6 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui // Check for queue full and wait if needed. uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1); - uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); fence_dirty_ = true; if (addSystemScope_) { @@ -1055,16 +1056,16 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui current_signal->flags_.isPacketDispatch_ = true; } - // Make sure the slot is free for usage while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= sw_queue_size) { - amd::Os::yield(); + // Active spin - no yield } // Add blocking command if the original value of read index was behind of the queue size. // Note: direct dispatch relies on the slot stall above to keep the forward progress // of the app if a dispatched kernel requires some CPU input for completion - if (blocking || (!AMD_DIRECT_DISPATCH && (index - read) >= sw_queue_size)) { + if (blocking || (!AMD_DIRECT_DISPATCH && + (index - hsa_queue_load_read_index_relaxed(gpu_queue_)) >= sw_queue_size)) { if (packet->completion_signal.handle == 0) { packet->completion_signal = Barriers().ActiveSignal(); } @@ -1102,7 +1103,8 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui reinterpret_cast(packet)->kernel_object, reinterpret_cast(packet)->kernarg_address, reinterpret_cast(packet)->completion_signal, - reinterpret_cast(packet)->reserved2, read, index); + reinterpret_cast(packet)->reserved2, + hsa_queue_load_read_index_scacquire(gpu_queue_), index); hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index); @@ -1158,34 +1160,229 @@ bool VirtualGPU::dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t he } // ================================================================================================ -inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName, - amd::AccumulateCommand* vcmd) { - if (vcmd == nullptr) { +template +bool VirtualGPU::dispatchGenericAqlPacketBatch(const std::vector& packets, + bool blocking, bool attach_signal, + const std::vector* kernelNames) { + if (packets.empty()) { return false; } - vcmd->addKernelName(kernelName); - amd::ScopedLock lock(execution()); + const uint32_t queueSize = gpu_queue_->size; + const uint32_t queueMask = queueSize - 1; + const uint32_t sw_queue_size = queueMask; + const size_t numPackets = packets.size(); + size_t kMaxBatchSize = DEBUG_HIP_GRAPH_BATCH_SIZE; + const size_t kGpuLagPackets = 16; + // Staggered copy pattern: powers of 2 (1, 2, 4, 8.. to DEBUG_HIP_GRAPH_BATCH_SIZE + size_t processedPackets = 0; + size_t batchSize = 1; + + while (processedPackets < numPackets) { + uint64_t currentReadIndex = hsa_queue_load_read_index_scacquire(gpu_queue_); + uint64_t currentWriteIndex = hsa_queue_load_write_index_relaxed(gpu_queue_); + + if (currentWriteIndex - currentReadIndex >= kGpuLagPackets) { + //GPU is busy, so we can copy more packets + batchSize = DEBUG_HIP_GRAPH_BATCH_SIZE; + } + + // Process all remaining packets in one batch + if (processedPackets + batchSize > numPackets) { + batchSize = numPackets - processedPackets; + } + + // Check if we have enough space in the queue for this batch + // If queue is full, reset batch size to 1 and wait + if (currentWriteIndex + batchSize - currentReadIndex >= sw_queue_size) { + batchSize = 1; + } + + // Now reserve space for the batch + uint64_t startIndex = hsa_queue_add_write_index_screlease(gpu_queue_, batchSize); + + // Make sure the slot is free for usage + while ((startIndex - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= sw_queue_size) { + // Active spin - no yield + } + + fence_dirty_ = true; + + // Save header of first packet in this batch + AqlPacket* firstPacket = packets[processedPackets]; + uint16_t firstPacketHeader = firstPacket->header; + uint16_t firstPacketRest = firstPacket->setup; + + // Process batchSize packets + for (size_t i = 0; i < batchSize; ++i) { + size_t packetIndex = processedPackets + i; + uint64_t index = startIndex + i; + + AqlPacket* packet = packets[packetIndex]; + uint16_t header = packet->header; + + + bool attachSignal = timestamp_ != nullptr || attach_signal; + + packet->completion_signal = + Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, attachSignal); + + if (std::is_same::value && + timestamp_ != nullptr) { + // If profiling is enabled, store the correlation ID in the dispatch packet + if (amd::activity_prof::IsEnabled(OP_ID_DISPATCH)) { + auto dispatchPacket = reinterpret_cast(packet); + dispatchPacket->reserved2 = timestamp_->command().profilingInfo().correlation_id_; + } + + ProfilingSignal* current_signal = Barriers().GetLastSignal(); + current_signal->flags_.isPacketDispatch_ = true; + } + + //Add blocking command if needed (only for the last packet) + if (blocking && (packetIndex == numPackets - 1)) { + if (packet->completion_signal.handle == 0) { + packet->completion_signal = Barriers().ActiveSignal(); + } + } + + AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[index & queueMask]; + + // For first packet in batch, invalidate header before writing + if (i == 0) { + if (addSystemScope_) { + // Add system scope on the acq on first packet + firstPacketHeader &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE); + firstPacketHeader |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE); + } + packet->header = (HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE); + + // Copy the packet and then write the valid of the first packet + *aql_loc = *packet; + + // Restore the header of the first packet + packet->header = firstPacketHeader; + } else { + // For the end packet in batch set flags + if (i == batchSize - 1) { + if (addSystemScope_) { + // Add system scope on the release on last packet + packet->header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); + packet->header |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); + addSystemScope_ = false; + } + auto expected_fence_state = + extractAqlBits(packet->header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); + // Reset fence_dirty_ flag if we submit a packet with system scopes + if (expected_fence_state == amd::Device::kCacheStateSystem) { + fence_dirty_ = false; + } + fence_state_ = static_cast(expected_fence_state); + } + + // Copy the packet to the queue + *aql_loc = *packet; + } + + // Print kernel name for kernel dispatch packets + if (kernelNames && packetIndex < kernelNames->size()) { + uint8_t packetType = + extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE); + if (packetType == HSA_PACKET_TYPE_KERNEL_DISPATCH) { + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL, "Graph shader name : %s", + (*kernelNames)[packetIndex].c_str()); + + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL, + "SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = " + "0x%x (type=%d, barrier=%d, acquire=%d, release=%d), " + "setup=%d, grid=[%zu, %zu, %zu], workgroup=[%zu, %zu, %zu], " + "private_seg_size=%zu, group_seg_size=%zu, kernel_obj=0x%zx, " + "kernarg_address=0x%zx, completion_signal=0x%zx, correlation_id=%zu, " + "rptr=%u, wptr=%u", + gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header, packetType, + extractAqlBits(header, HSA_PACKET_HEADER_BARRIER, + HSA_PACKET_HEADER_WIDTH_BARRIER), + extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE), + extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE), + packet->setup, + reinterpret_cast(packet)->grid_size_x, + reinterpret_cast(packet)->grid_size_y, + reinterpret_cast(packet)->grid_size_z, + reinterpret_cast(packet)->workgroup_size_x, + reinterpret_cast(packet)->workgroup_size_y, + reinterpret_cast(packet)->workgroup_size_z, + reinterpret_cast(packet)->private_segment_size, + reinterpret_cast(packet)->group_segment_size, + reinterpret_cast(packet)->kernel_object, + reinterpret_cast(packet)->kernarg_address, + reinterpret_cast(packet)->completion_signal, + reinterpret_cast(packet)->reserved2, + hsa_queue_load_read_index_scacquire(gpu_queue_), index); + } + } + } + + // Write valid header for the first packet in the batch + AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[startIndex & queueMask]; + packet_store_release(reinterpret_cast(aql_loc), firstPacketHeader, firstPacketRest); + + // Ring doorbell for this batch + hsa_signal_store_screlease(gpu_queue_->doorbell_signal, startIndex); + + processedPackets += batchSize; + + TrackQueueProgress(*packets[processedPackets - 1], startIndex + batchSize - 1); + // Double the batch size for next iteration, cap at DEBUG_HIP_GRAPH_BATCH_SIZE + if (batchSize < kMaxBatchSize) { + batchSize *= 2; + } + } + + // Mark the flag indicating if a dispatch is outstanding + hasPendingDispatch_ = true; + + // Wait on signal for the last packet if blocking + if (blocking) { + LogInfo("Runtime reached the AQL queue limit. SW is much ahead of HW. Blocking AQL queue!"); + if (!Barriers().WaitCurrent()) { + LogPrintfError("Failed blocking queue wait with signal [0x%lx]", + packets.back()->completion_signal.handle); + return false; + } + } + + return true; +} + +// ================================================================================================ +bool VirtualGPU::dispatchAqlPacketBatch(const std::vector& packets, + const std::vector& kernelNames, + amd::AccumulateCommand* vcmd) { + if (vcmd == nullptr || packets.empty() || packets.size() != kernelNames.size()) { + return false; + } + + amd::ScopedLock lock(execution()); profilingBegin(*vcmd); dispatchBlockingWait(); - auto packet = reinterpret_cast(aqlpacket); - ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Graph shader name : %s", kernelName.c_str()); - // The Aqlpacket with valid header will trigger the issue that AQL fill - // the header before filling the body. However, the CP can handle the AQL package - // after seeing the valid AQL header with the AQL package's body is NULL. - // This patch fixes this potential issue that filling AQL header before - // filling the AQL body. - uint16_t packetHeader = packet->header; - packet->header = (HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE); - dispatchGenericAqlPacket(packet, packetHeader, packet->setup, false); - packet->header = packetHeader; + // Add all kernel names in bulk + vcmd->addKernelNames(kernelNames); + + // Dispatch all packets with a single doorbell ring + // Cast packets vector to AQL packets vector on the fly + const auto& aqlPackets = + reinterpret_cast&>(packets); + bool result = dispatchGenericAqlPacketBatch(aqlPackets, false, false, &kernelNames); profilingEnd(); - return true; + return result; } // ================================================================================================ diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index f12116e425..3ade98d3a7 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -1,4 +1,4 @@ -/* Copyright (c) 2008 - 2023 Advanced Micro Devices, Inc. +/* Copyright (c) 2008 - 2025 Advanced Micro Devices, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -468,16 +468,23 @@ class VirtualGPU : public device::VirtualDevice { //! Dispatches a barrier with blocking HSA signals void dispatchBlockingWait(); - inline bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName, - amd::AccumulateCommand* vcmd = nullptr); bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest, bool blocking = true, bool capturing = false, const uint8_t* aqlPacket = nullptr, bool attach_signal = false); bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest, bool blocking = true, bool attach_signal = false); + + //! Dispatches multiple AQL packets in a single batch operation + bool dispatchAqlPacketBatch(const std::vector& packets, + const std::vector& kernelNames, + amd::AccumulateCommand* vcmd = nullptr); template bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking, bool attach_signal = false); + //! Dispatches multiple AQL packets with a single doorbell ring + template bool dispatchGenericAqlPacketBatch(const std::vector& packets, + bool blocking, bool attach_signal = false, + const std::vector* kernelNames = nullptr); bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion, bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi); diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp index 6070f94548..73ed76ea34 100644 --- a/projects/clr/rocclr/platform/command.hpp +++ b/projects/clr/rocclr/platform/command.hpp @@ -1,4 +1,4 @@ -/* Copyright (c) 2010 - 2024 Advanced Micro Devices, Inc. +/* Copyright (c) 2010 - 2025 Advanced Micro Devices, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -1394,6 +1394,11 @@ class AccumulateCommand : public Command { //! Add kernel name to the list if available void addKernelName(const std::string& kernelName) { kernelNames_.push_back(kernelName); } + //! Add multiple kernel names in bulk + void addKernelNames(const std::vector& kernelNames) { + kernelNames_.insert(kernelNames_.end(), kernelNames.begin(), kernelNames.end()); + } + //! Add kernel timestamp to the list if available void addTimestamps(uint64_t startTs, uint64_t endTs) { tsList_.push_back(std::make_pair(startTs, endTs)); diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp index 45248fbbbb..a8e8daa7f2 100644 --- a/projects/clr/rocclr/utils/flags.hpp +++ b/projects/clr/rocclr/utils/flags.hpp @@ -1,4 +1,4 @@ -/* Copyright (c) 2009 - 2021 Advanced Micro Devices, Inc. +/* Copyright (c) 2009 - 2025 Advanced Micro Devices, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -255,6 +255,8 @@ release(bool, DEBUG_HIP_FORCE_ASYNC_QUEUE, false, \ "Forces grpahs into async queue mode. DEBUG_HIP_FORCE_GRAPH_QUEUES must be 1") \ release(uint, DEBUG_HIP_FORCE_GRAPH_QUEUES, 4, \ "Forces the number of streams for the graph parallel execution") \ +release(uint, DEBUG_HIP_GRAPH_BATCH_SIZE, 256, \ + "Number of graph nodes to batch at a time") \ release(uint, DEBUG_HIP_BLOCK_SYNC, 50, \ "Blocks synchronization on CPU until the callback processing is done")\ release(uint, DEBUG_CLR_MAX_BATCH_SIZE, 1000, \