SWDEV-524745 - Part-III Add multi device support for hip graph (#814)

- Retrieve the list of devices linked to each branch using stream ID x.
- Identify the necessary streams for each device to facilitate graph execution.
- Create the necessary streams for each device to ensure successful graph execution.
- Implement support for launching a multi-device, single-branch graph.

Co-authored-by: Anusha GodavarthySurya <Anusha.GodavarthySurya@amd.com>
このコミットが含まれているのは:
Godavarthy Surya, Anusha
2025-10-10 10:27:27 +05:30
committed by GitHub
コミット d3cc2c7668
4個のファイルの変更455行の追加134行の削除
+250 -91
ファイルの表示
@@ -189,7 +189,8 @@ void Graph::ScheduleOneNode(Node node, int stream_id) {
// Assign active stream to the current node
node->stream_id_ = stream_id;
max_streams_ = std::max(max_streams_, (stream_id + 1));
// Track which devices are used by each stream for multi-device graph execution
streams_dev_ids_[stream_id].insert(node->dev_id_);
// Process child graph separately, since, there is no connection
if (node->GetType() == hipGraphNodeTypeGraph) {
auto child = reinterpret_cast<hip::ChildGraphNode*>(node)->GetChildGraph();
@@ -317,7 +318,7 @@ void Graph::clone(Graph* newGraph, bool cloneNodes) const {
// ================================================================================================
Graph* Graph::clone() const {
Graph* newGraph = new Graph(device_);
Graph* newGraph = new Graph(getCurrentDevice());
clone(newGraph);
return newGraph;
}
@@ -332,39 +333,115 @@ bool GraphExec::isGraphExecValid(GraphExec* pGraphExec) {
}
// ================================================================================================
hipError_t GraphExec::CreateStreams(uint32_t num_streams) {
hipError_t GraphExec::CreateStreams(uint32_t num_streams, int devId) {
amd::ScopedLock lock(graphExecStreamCreateLock_);
parallel_streams_.reserve(num_streams);
// Validate input parameters
if (num_streams == 0) {
ClPrint(amd::LOG_WARNING, amd::LOG_CODE,
"[hipGraph] Attempting to create 0 streams for device %d", devId);
return hipSuccess;
}
if (devId < 0 || devId >= g_devices.size() || g_devices[devId] == nullptr) {
ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Invalid device ID %d for stream creation",
devId);
return hipErrorInvalidDevice;
}
// Check if streams already exist for this device
if (parallel_streams_.find(devId) != parallel_streams_.end() &&
!parallel_streams_[devId].empty()) {
ClPrint(amd::LOG_WARNING, amd::LOG_CODE,
"[hipGraph] Streams already exist for device %d, skipping creation", devId);
return hipSuccess;
}
parallel_streams_[devId].reserve(num_streams);
ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] Creating %u parallel streams for device %d",
num_streams, devId);
for (uint32_t i = 0; i < num_streams; ++i) {
auto stream = new hip::Stream(hip::getCurrentDevice(), hip::Stream::Priority::Normal,
hipStreamNonBlocking);
auto stream =
new hip::Stream(g_devices[devId], hip::Stream::Priority::Normal, hipStreamNonBlocking);
if (stream == nullptr || !stream->Create()) {
ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed to %s stream %u for device %d",
stream == nullptr ? "allocate" : "create", i, devId);
if (stream != nullptr) {
hip::Stream::Destroy(stream);
}
ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed to create parallel stream!");
// Clean up any previously created streams for this device
for (auto& created_stream : parallel_streams_[devId]) {
hip::Stream::Destroy(created_stream);
}
parallel_streams_[devId].clear();
return hipErrorOutOfMemory;
}
parallel_streams_.push_back(stream);
parallel_streams_[devId].push_back(stream);
}
return hipSuccess;
}
// Fills max_streams_dev_ with the number of streams each device needs for this graph.
// Own requirements come from streams_dev_ids_; child graph nodes are processed recursively
// and folded in by keeping the larger per-device count.
void GraphExec::FindStreamsReqPerDev() {
  // Each (stream id, device id) pairing recorded in streams_dev_ids_ adds one stream
  // to that device's count.
  for (const auto& stream_entry : streams_dev_ids_) {
    for (const auto device : stream_entry.second) {
      ++max_streams_dev_[device];
    }
  }

  for (auto vertex : vertices_) {
    if (vertex->GetType() != hipGraphNodeTypeGraph) {
      continue;
    }
    auto child = reinterpret_cast<ChildGraphNode*>(vertex);
    // Let the child graph compute its own per-device counts first.
    child->FindStreamsReqPerDev();

    // Fold the child's counts into ours: a device seen for the first time adopts the child's
    // count, an already known device keeps whichever count is larger.
    for (const auto& child_entry : child->max_streams_dev_) {
      const auto result = max_streams_dev_.try_emplace(child_entry.first, child_entry.second);
      if (!result.second) {
        auto& current = result.first->second;
        current = std::max(current, child_entry.second);
      }
    }
  }
}
// ================================================================================================
hipError_t GraphExec::Init() {
hipError_t status = hipSuccess;
// create extra stream to avoid queue collision with the default execution stream
if (max_streams_ > 1) {
status = CreateStreams(max_streams_);
}
if (status != hipSuccess) {
return status;
}
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
if (max_streams_ == 1) {
if (max_streams_ == 1) {
FindStreamsReqPerDev();
if (max_streams_dev_.size() > 1) {
// Multi-device graph detected - create parallel streams for each device
for (auto const& [dev_id, num_streams] : max_streams_dev_) {
ClPrint(amd::LOG_INFO, amd::LOG_API,
"[hipGraph] For device id :%d max streams :%d for execution.\n", dev_id,
num_streams);
status = CreateStreams(num_streams, dev_id);
if (status != hipSuccess) {
return status;
}
}
}
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
// For graph nodes capture AQL packets to dispatch them directly during graph launch.
status = CaptureAQLPackets();
}
} else {
status = CreateStreams(max_streams_, hip::getCurrentDevice()->deviceId());
}
instantiateDeviceId_ = hip::getCurrentDevice()->deviceId();
static_cast<ReferenceCountedObject*>(hip::getCurrentDevice())->retain();
@@ -438,8 +515,8 @@ hipError_t GraphExec::CaptureAndFormPacketsForGraph() {
for (size_t i = 0; i < topoOrder_.size(); ++i) {
auto& node = topoOrder_[i];
// Check if kernel node requires hidden heap and set it for the entire graph
if (node->GetType() == hipGraphNodeTypeKernel) {
// Check if graph requires hidden heap and set as part of graphExec param.
static bool initialized = false;
if (!initialized && reinterpret_cast<hip::GraphKernelNode*>(node)->HasHiddenHeap()) {
SetHiddenHeap();
@@ -447,54 +524,66 @@ hipError_t GraphExec::CaptureAndFormPacketsForGraph() {
}
}
// Handle nodes that support graph capture
if (node->GraphCaptureEnabled()) {
// Start of a new batch
PacketBatch newBatch;
size_t j = i;
// TODO: Add support for batching for multi-device linear graph
if (max_streams_dev_.size() == 1) {
// Single device - use batching optimization
// Start of a new batch
PacketBatch newBatch;
size_t j = i;
// Collect packets from consecutive captured nodes
while (j < topoOrder_.size() && topoOrder_[j]->GraphCaptureEnabled()) {
auto& currentNode = topoOrder_[j];
// Collect packets from consecutive captured nodes
while (j < topoOrder_.size() && topoOrder_[j]->GraphCaptureEnabled()) {
auto& currentNode = topoOrder_[j];
// Capture packets for this node
std::vector<uint8_t*> nodePackets;
std::vector<std::string> nodeKernelNames;
status = currentNode->CaptureAndFormPacket(GetKernelArgManager(), &nodePackets,
&nodeKernelNames);
// Capture packets for this node
std::vector<uint8_t*> nodePackets;
std::vector<std::string> nodeKernelNames;
status = currentNode->CaptureAndFormPacket(GetKernelArgManager(), &nodePackets,
&nodeKernelNames);
if (status != hipSuccess || nodePackets.empty()) {
LogError("Packet capture failed");
return status;
if (status != hipSuccess || nodePackets.empty()) {
LogError("Packet capture failed");
return status;
}
// Create NodeRange for this node
PacketBatch::NodeRange range;
range.startIndex = newBatch.dispatchPackets.size();
range.packetCount = nodePackets.size();
range.enabled = true;
// Add to dispatch lists (initially all enabled)
newBatch.dispatchPackets.insert(newBatch.dispatchPackets.end(), nodePackets.begin(),
nodePackets.end());
newBatch.dispatchKernelNames.insert(newBatch.dispatchKernelNames.end(),
nodeKernelNames.begin(), nodeKernelNames.end());
// Store node mapping
newBatch.nodeRanges.push_back(range);
newBatch.nodeToRangeIndex[currentNode] = newBatch.nodeRanges.size() - 1;
// Mark this node as successfully captured
nodeCaptureStatus_[j] = true;
++j;
}
// Create NodeRange for this node
PacketBatch::NodeRange range;
range.startIndex = newBatch.dispatchPackets.size();
range.packetCount = nodePackets.size();
range.enabled = true;
// Add the batch if it has packets
if (!newBatch.dispatchPackets.empty()) {
packetBatches_.emplace_back(std::move(newBatch));
}
// Add to dispatch lists (initially all enabled)
newBatch.dispatchPackets.insert(newBatch.dispatchPackets.end(),
nodePackets.begin(), nodePackets.end());
newBatch.dispatchKernelNames.insert(newBatch.dispatchKernelNames.end(),
nodeKernelNames.begin(), nodeKernelNames.end());
// Store node mapping
newBatch.nodeRanges.push_back(range);
newBatch.nodeToRangeIndex[currentNode] = newBatch.nodeRanges.size() - 1;
// Mark this node as successfully captured
nodeCaptureStatus_[j] = true;
++j;
// Skip the nodes we just processed, the index will be incremented by the loop
i = j - 1;
} else {
// Multi-device - capture individual packets without batching
status = node->CaptureAndFormPacket(GetKernelArgManager());
if (status != hipSuccess) {
LogError("Individual packet capture failed for multi-device node");
return status;
}
}
// Add the batch if it has packets
if (!newBatch.dispatchPackets.empty()) {
packetBatches_.emplace_back(std::move(newBatch));
}
// Skip the nodes we just processed, the index will be incremented by the loop
i = j - 1;
} else if (node->GetType() == hipGraphNodeTypeGraph) {
auto childNode = reinterpret_cast<hip::ChildGraphNode*>(node);
if (childNode->GetChildGraph()->max_streams_ == 1) {
@@ -502,7 +591,8 @@ hipError_t GraphExec::CaptureAndFormPacketsForGraph() {
status = childNode->CaptureAndFormPacketsForGraph();
nodeCaptureStatus_[i] = (status == hipSuccess);
if (status != hipSuccess) {
status = hipSuccess; // Continue with other nodes
LogWarning("Child graph packet capture failed continuing with other nodes");
status = hipSuccess; // Continue processing other nodes
}
}
}
@@ -555,38 +645,43 @@ hipError_t GraphExec::UpdateAQLPacket(hip::GraphNode* node) {
if (max_streams_ != 1 || !node->GraphCaptureEnabled()) {
return hipSuccess;
}
//ToDo: Add batching support for multi-device linear graph
if (max_streams_dev_.size() == 1) {
// Find which batch contains this node and update it
for (auto& batch : packetBatches_) {
auto it = batch.nodeToRangeIndex.find(node);
if (it != batch.nodeToRangeIndex.end()) {
// Found the batch containing this node - update packets
PacketBatch::NodeRange& range = batch.nodeRanges[it->second];
// Find which batch contains this node and update it
for (auto& batch : packetBatches_) {
auto it = batch.nodeToRangeIndex.find(node);
if (it != batch.nodeToRangeIndex.end()) {
// Found the batch containing this node - update packets
PacketBatch::NodeRange& range = batch.nodeRanges[it->second];
// Capture new packets for this node
std::vector<uint8_t*> newPackets;
std::vector<std::string> newKernelNames;
hipError_t status = node->CaptureAndFormPacket(kernArgManager_, &newPackets, &newKernelNames);
if (status != hipSuccess) {
return status;
// Capture new packets for this node
std::vector<uint8_t*> newPackets;
std::vector<std::string> newKernelNames;
hipError_t status =
node->CaptureAndFormPacket(kernArgManager_, &newPackets, &newKernelNames);
if (status != hipSuccess) {
return status;
}
// Update dispatch packets (always update regardless of enabled state)
// The enabled/disabled check happens during dispatch, not here
for (size_t i = 0; i < range.packetCount && i < newPackets.size(); ++i) {
size_t packetIndex = range.startIndex + i;
batch.dispatchPackets[packetIndex] = newPackets[i];
batch.dispatchKernelNames[packetIndex] = newKernelNames[i];
}
return hipSuccess;
}
// Update dispatch packets (always update regardless of enabled state)
// The enabled/disabled check happens during dispatch, not here
for (size_t i = 0; i < range.packetCount && i < newPackets.size(); ++i) {
size_t packetIndex = range.startIndex + i;
batch.dispatchPackets[packetIndex] = newPackets[i];
batch.dispatchKernelNames[packetIndex] = newKernelNames[i];
}
return hipSuccess;
}
} else {
return node->CaptureAndFormPacket(kernArgManager_);
}
return hipSuccess; // Node not in any batch
}
// ================================================================================================
hipError_t GraphExec::UpdatePacketBatchesForNodeEnableDisable(hip::GraphNode* node, bool isEnabled) {
if (max_streams_ != 1 || !node->GraphCaptureEnabled()) {
// Only handle single stream case with captured nodes
if (max_streams_ != 1 && max_streams_dev_.size() == 1 && !node->GraphCaptureEnabled()) {
// Only handle single stream and single device case with captured nodes
return hipSuccess;
}
// Find which batch contains this node and update its enabled state
@@ -687,10 +782,77 @@ hipError_t GraphExec::EnqueueGraphWithSingleList(hip::Stream* hip_stream) {
}
return status;
}
// ================================================================================================
// Launches a single-branch graph whose nodes are spread over several devices.
//
// Nodes are visited in topological order. Each node runs on the first parallel stream of its
// own device (parallel_streams_[dev_id][0]); whenever the stream changes, a marker pair makes
// the new stream wait for the previous one. After the last node the launch stream is made to
// wait for the stream that executed it.
//
// launch_stream: stream the application launched the graph on.
// Returns hipSuccess, hipErrorInvalidDevice when a node's device has no parallel stream, or
// the first failure reported by CreateCommand. On failure no further nodes are enqueued, but
// the launch stream is still joined with the work that was already submitted.
hipError_t GraphExec::EnqueueMultiDeviceLinearGraph(hip::Stream* launch_stream) {
  // Accumulate command tracks all the AQL packet batch that we submit to the HW. For now we track
  // only kernel nodes.
  amd::AccumulateCommand* accumulate = nullptr;
  hipError_t status = hipSuccess;
  if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
    accumulate = new amd::AccumulateCommand(*launch_stream, {}, nullptr);
  }

  // Enqueues a marker on fromStream and a second marker on toStream that waits for it.
  auto createMarkerAndWait = [](hip::Stream* fromStream, hip::Stream* toStream) {
    amd::Command::EventWaitList wait_list;
    auto marker = new amd::Marker(*fromStream, true, wait_list);
    marker->enqueue();
    wait_list.push_back(marker);
    auto wait_marker = new amd::Marker(*toStream, true, wait_list);
    wait_marker->enqueue();
    wait_marker->release();
    // Drop our reference only after the waiting marker was created from the wait list, so the
    // pointer stored in wait_list is never used after our reference is gone.
    marker->release();
  };

  hip::Stream* prevStream = launch_stream;
  for (auto& node : topoOrder_) {
    // Use find() instead of operator[]: the latter would insert an empty vector for an unknown
    // device and the [0] access on it would be out of bounds.
    const auto streams_it = parallel_streams_.find(node->dev_id_);
    if (streams_it == parallel_streams_.end() || streams_it->second.empty()) {
      LogPrintfError("No parallel stream available for device id:%d",
                     static_cast<int>(node->dev_id_));
      status = hipErrorInvalidDevice;
      break;
    }
    hip::Stream* currStream = streams_it->second.front();

    // Insert a dependency whenever execution moves to another stream. Comparing the streams
    // (not only the device ids) also orders the first node after work already queued on the
    // launch stream when both live on the same device.
    if (prevStream != currStream) {
      createMarkerAndWait(prevStream, currStream);
    }

    // ToDo : Add batching for multi device graph launch
    if (node->GraphCaptureEnabled()) {
      // Captured node: submit its pre-built AQL packets directly, unless the node is disabled.
      if (node->GetEnabled()) {
        std::vector<uint8_t*>& gpuPackets = node->GetAqlPackets();
        // One kernel name entry per packet, all taken from this node.
        std::vector<std::string> kernelNames(gpuPackets.size(), node->GetKernelName());
        currStream->vdev()->dispatchAqlPacketBatch(gpuPackets, kernelNames, accumulate);
      }
    } else {
      node->SetStream(currStream);
      status = node->CreateCommand(node->GetQueue());
      if (status != hipSuccess) {
        // Do not let a later node overwrite this failure, and do not enqueue a node whose
        // commands could not be created.
        LogError("Command creation failed for multi-device node");
        break;
      }
      node->EnqueueCommands(currStream);
    }
    prevStream = currStream;
  }

  // Make the launch stream wait for the last stream that received work.
  if (prevStream != launch_stream) {
    createMarkerAndWait(prevStream, launch_stream);
  }

  // accumulate is only allocated when packet capture is enabled.
  if (accumulate != nullptr) {
    accumulate->enqueue();
    accumulate->release();
  }
  return status;
}
// ================================================================================================
hipError_t Graph::UpdateStreams(hip::Stream* launch_stream,
const std::vector<hip::Stream*>& parallel_streams) {
void GraphExec::UpdateStreams(hip::Stream* launch_stream) {
int devId = launch_stream->vdev()->device().index();
if (parallel_streams_.find(devId) == parallel_streams_.end()) {
LogPrintfError("UpdateStreams failed for device id:%d", devId);
return;
}
auto parallel_streams = parallel_streams_[devId];
// Current stream is the default in the assignment
streams_.push_back(launch_stream);
std::unordered_map<int, int> unique_stream_ids;
@@ -710,7 +872,6 @@ hipError_t Graph::UpdateStreams(hip::Stream* launch_stream,
for (int i = streams_.size(), j = 0; i < max_streams_ && j < collided_streams.size(); i++, j++) {
streams_.push_back(collided_streams[j]);
}
return hipSuccess;
}
@@ -886,7 +1047,6 @@ bool Graph::RunNodes(int32_t base_stream, const std::vector<hip::Stream*>* paral
// ================================================================================================
hipError_t GraphExec::Run(hip::Stream* launch_stream) {
hipError_t status = hipSuccess;
if (flags_ & hipGraphInstantiateFlagAutoFreeOnLaunch) {
if (!topoOrder_.empty()) {
topoOrder_[0]->GetParentGraph()->FreeAllMemory(launch_stream);
@@ -908,13 +1068,13 @@ hipError_t GraphExec::Run(hip::Stream* launch_stream) {
} else {
repeatLaunch_ = true;
}
ClPrint(amd::LOG_DEBUG, amd::LOG_CODE,
"GraphExec::Run max_streams: %d, "
"on device: %d, total number of nodes: %d",
max_streams_, launch_stream->DeviceId(), topoOrder_.size());
if (max_streams_ == 1 && instantiateDeviceId_ == launch_stream->DeviceId()) {
if (max_streams_ == 1 && max_streams_dev_.size() == 1 &&
max_streams_dev_.begin()->first == launch_stream->DeviceId()) {
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
// If the graph has kernels that does device side allocation, during packet capture, heap is
// allocated because heap pointer has to be added to the AQL packet, and initialized during
@@ -926,6 +1086,8 @@ hipError_t GraphExec::Run(hip::Stream* launch_stream) {
}
}
status = EnqueueGraphWithSingleList(launch_stream);
} else if (max_streams_ == 1 && max_streams_dev_.size() > 1) {
status = EnqueueMultiDeviceLinearGraph(launch_stream);
} else if (max_streams_ == 1 && instantiateDeviceId_ != launch_stream->DeviceId()) {
for (int i = 0; i < topoOrder_.size(); i++) {
topoOrder_[i]->SetStream(launch_stream);
@@ -934,10 +1096,7 @@ hipError_t GraphExec::Run(hip::Stream* launch_stream) {
}
} else {
// Update streams for the graph execution
status = UpdateStreams(launch_stream, parallel_streams_);
if (status != hipSuccess) {
return status;
}
UpdateStreams(launch_stream);
// Execute all nodes in the graph
if (!RunNodes()) {
LogError("Failed to launch nodes!");
+30 -12
ファイルの表示
@@ -556,6 +556,7 @@ class Graph {
roots_.resize(DEBUG_HIP_FORCE_GRAPH_QUEUES);
leafs_.resize(DEBUG_HIP_FORCE_GRAPH_QUEUES);
wait_order_.resize(DEBUG_HIP_FORCE_GRAPH_QUEUES);
streams_dev_.reserve(g_devices.size());
}
void RemoveUserObjectFromOwingGraphs(UserObject* uObj) {
for (auto& g : uObj->owning_graphs_) {
@@ -673,12 +674,6 @@ class Graph {
//! Schedules all nodes in the graph into different streams
void ScheduleNodes();
//! Update streams for the graph execution
hipError_t UpdateStreams(
hip::Stream* launch_stream, //!< Launch stream from the application
const std::vector<hip::Stream*>& parallel_stream //!< The list of parallel streams
);
//! Runs one node on the assigned stream
bool RunOneNode(Node node, //!< Node for the execution on GPU
bool wait //!< Wait dependencies
@@ -780,6 +775,10 @@ class Graph {
protected:
int max_streams_ = 0; //!< Maximum number of streams used in the graph launch
//!< Maps stream ID to the set of device IDs that use that stream.
//!< Used to track which devices are accessed by each parallel stream
//!< during multi-device graph execution scheduling.
std::unordered_map<int, std::set<int>> streams_dev_ids_;
private:
friend class GraphExec;
@@ -802,6 +801,13 @@ class Graph {
std::unordered_set<GraphNode*> capturedNodes_;
bool graphInstantiated_;
std::unordered_map<Node, Node> clonedNodes_;
//! Map of device ID to vector of streams allocated for that device during graph execution.
//! Each device may require multiple streams to handle parallel execution of graph nodes.
std::unordered_map<int, std::vector<hip::Stream*>> streams_dev_;
//! Map tracking the maximum number of concurrent streams required per device for graph execution.
//! Key: device ID, Value: maximum number of streams needed for that device
std::unordered_map<int, int> max_streams_dev_;
};
class GraphExec : public amd::ReferenceCountedObject, public Graph {
@@ -816,13 +822,16 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
}
~GraphExec() {
for (auto stream : parallel_streams_) {
if (stream != nullptr) {
for (auto streams : parallel_streams_) {
for (auto stream : streams.second) {
if (stream != nullptr) {
stream->finish();
constexpr bool kForceDestroy = true;
hip::Stream::Destroy(stream, kForceDestroy);
constexpr bool kForceDestroy = true;
hip::Stream::Destroy(stream, kForceDestroy);
}
}
}
parallel_streams_.clear();
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
if (kernArgManager_ != nullptr) {
kernArgManager_->release();
@@ -856,7 +865,7 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
std::vector<Node>& GetNodes() { return topoOrder_; }
uint64_t GetFlags() const { return flags_; }
hipError_t Init();
hipError_t CreateStreams(uint32_t num_streams);
hipError_t CreateStreams(uint32_t num_streams, int devId = 0);
hipError_t Run(hip::Stream* stream);
// Capture GPU Packets from graph commands
hipError_t CaptureAQLPackets();
@@ -874,12 +883,21 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
hipError_t CaptureAndFormPacketsForGraph();
void GetKernelArgSizeForGraph(std::unordered_map<int, size_t>& kernArgSizeForGraph);
hipError_t EnqueueGraphWithSingleList(hip::Stream* hip_stream);
//! Enqueue a multi-device linear graph for execution
hipError_t EnqueueMultiDeviceLinearGraph(hip::Stream* hip_stream);
bool TopologicalOrder() { return Graph::TopologicalOrder(topoOrder_); }
//! Update streams for the graph execution with launch stream from application
void UpdateStreams(hip::Stream* launch_stream);
//! Find the number of streams required per device for multi-device graph execution
//! This method analyzes the stream-to-device mappings and recursively processes
//! child graphs to determine the maximum concurrent streams needed per device
void FindStreamsReqPerDev();
protected:
//! Topological order of the graph doesn't include nodes embedded as part of the child graph
std::vector<Node> topoOrder_;
std::vector<hip::Stream*> parallel_streams_;
//! parallel streams per device
std::unordered_map<int, std::vector<hip::Stream*>> parallel_streams_;
uint64_t flags_ = 0;
GraphKernelArgManager* kernArgManager_ = nullptr; //!< Kernel Arg manager for graph.
int instantiateDeviceId_ = -1;
+30 -31
ファイルの表示
@@ -1295,38 +1295,37 @@ bool VirtualGPU::dispatchGenericAqlPacketBatch(const std::vector<AqlPacket*>& pa
uint8_t packetType =
extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE);
if (packetType == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL, "Graph shader name : %s",
(*kernelNames)[packetIndex].c_str());
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL, "Graph shader name : %s, device id : %u",
(*kernelNames)[packetIndex].c_str(), dev().index());
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
"SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = "
"0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
"setup=%d, grid=[%u, %u, %u], workgroup=[%u, %u, %u], "
"private_seg_size=%u, group_seg_size=%u, kernel_obj=0x%zx, "
"kernarg_address=0x%zx, completion_signal=0x%zx, correlation_id=%zu, "
"rptr=%u, wptr=%u",
gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header, packetType,
extractAqlBits(header, HSA_PACKET_HEADER_BARRIER,
HSA_PACKET_HEADER_WIDTH_BARRIER),
extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
packet->setup,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_x,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_y,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_z,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_x,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_y,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_z,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->private_segment_size,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->group_segment_size,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernel_object,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernarg_address,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->completion_signal,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->reserved2,
Hsa::queue_load_read_index_scacquire(gpu_queue_), index);
}
ClPrint(
amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
"SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = "
"0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
"setup=%d, grid=[%u, %u, %u], workgroup=[%u, %u, %u], "
"private_seg_size=%u, group_seg_size=%u, kernel_obj=0x%zx, "
"kernarg_address=0x%zx, completion_signal=0x%zx, correlation_id=%zu, "
"rptr=%u, wptr=%u",
gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header, packetType,
extractAqlBits(header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER),
extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
packet->setup, reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_x,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_y,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_z,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_x,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_y,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_z,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->private_segment_size,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->group_segment_size,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernel_object,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernarg_address,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->completion_signal,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->reserved2,
Hsa::queue_load_read_index_scacquire(gpu_queue_), index);
}
}
}
+145
ファイルの表示
@@ -0,0 +1,145 @@
/*
Copyright (c) 2022-25 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <functional>
#include <hip_test_helper.hh>
#include <hip_test_common.hh>
#include <hip_test_checkers.hh>
#include <hip_test_kernels.hh>
#include "graph_memset_node_test_common.hh"
#include "graph_tests_common.hh"
/**
* @addtogroup hipGraphLaunch
* @{
* @ingroup GraphTest
* `hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream)` -
* Launches an executable graph across multiple devices
*/
/**
* Test Description
* ------------------------
* - Launches a single-branch graph that spans multiple devices and verifies the result
* ------------------------
* - catch/unit/graph/hipGraphMultiDevice.cc
* Test requirements
* ------------------------
* - Multi-device
* - HIP_VERSION >= 7.2
*/
/**
 * Checks that every element of `out` is the fourth power of the matching element of `inp`.
 *
 * @param inp   host buffer with the original input values
 * @param out   host buffer with the values read back after the graph ran
 * @param size  number of elements in both buffers
 */
static void check_output(int* inp, int* out, size_t size) {
  for (size_t i = 0; i < size; i++) {
    // Inputs are masked to 0..255 and 255^4 does not fit in a signed int, so the expected value
    // is computed with unsigned (wrapping) arithmetic instead of overflowing a signed int,
    // which is undefined behaviour on the host.
    const unsigned int value = static_cast<unsigned int>(inp[i]);
    const unsigned int squared = value * value;
    REQUIRE(out[i] == static_cast<int>(squared * squared));
  }
}
/**
 * Fills the first `size` elements of `a` with pseudo-random values in the range 0..255.
 * The generator is seeded from the current time, so the data differs between runs.
 */
static void init_input(int* a, size_t size) {
  unsigned int seed = time(nullptr);
  int* const end = a + size;
  for (int* cursor = a; cursor != end; ++cursor) {
    // Keep only the low byte of each generated number.
    *cursor = (HipTest::RAND_R(&seed) & 0xFF);
  }
}
// Captures a single-branch graph that squares a buffer on device 0, copies it to device 1 and
// squares it again there, then launches the graph and checks the result on the host.
// The test is skipped when fewer than two GPUs are present.
TEST_CASE("Unit_hipGraphMultiDevice") {
  int nGpus = 0;
  HIP_CHECK(hipGetDeviceCount(&nGpus));
  if (nGpus < 2) {
    fprintf(stderr, "Need at least 2 GPUs, skipped!\n");
    return;
  }

  hipStream_t streamdev1, streamdev2;
  hipEvent_t eventdev1, eventdev2;
  hipGraph_t graph = nullptr;
  hipGraphExec_t graph_exec = nullptr;
  constexpr size_t buffer_size = (1024 * 1024);
  constexpr auto blocksPerCU = 6;
  constexpr int block_size = 512;

  int *ibuf_h, *buf_d1, *buf_d2, *outbuf_h;
  ibuf_h = new int[buffer_size];
  outbuf_h = new int[buffer_size];
  REQUIRE(ibuf_h != nullptr);
  REQUIRE(outbuf_h != nullptr);

  // Per-device resources: one stream, one buffer and one event on each of device 0 and 1.
  HIP_CHECK(hipSetDevice(0));
  HIP_CHECK(hipStreamCreate(&streamdev1));
  HIP_CHECK(hipMalloc(&buf_d1, buffer_size * sizeof(int)));
  HIP_CHECK(hipEventCreate(&eventdev1));

  HIP_CHECK(hipSetDevice(1));
  HIP_CHECK(hipStreamCreate(&streamdev2));
  HIP_CHECK(hipMalloc(&buf_d2, buffer_size * sizeof(int)));
  HIP_CHECK(hipEventCreate(&eventdev2));

  HIP_CHECK(hipSetDevice(0));
  init_input(ibuf_h, buffer_size);
  unsigned grid_size = HipTest::setNumBlocks(blocksPerCU, block_size, buffer_size);

  // Capture starts on the device 0 stream: upload the input and square it in place.
  HIP_CHECK(hipStreamBeginCapture(streamdev1, hipStreamCaptureModeGlobal));
  HIP_CHECK(
      hipMemcpyAsync(buf_d1, ibuf_h, sizeof(int) * buffer_size, hipMemcpyHostToDevice, streamdev1));
  HipTest::vector_square<int>
      <<<grid_size, block_size, 0, streamdev1>>>(buf_d1, buf_d1, buffer_size);

  // Pull the device 1 stream into the capture by making it wait on an event from device 0.
  HIP_CHECK(hipEventRecord(eventdev1, streamdev1));
  HIP_CHECK(hipStreamWaitEvent(streamdev2, eventdev1));

  // Device 1 part: copy the squared data over and square it once more.
  HIP_CHECK(hipSetDevice(1));
  HIP_CHECK(hipMemcpyDtoDAsync(buf_d2, buf_d1, sizeof(int) * buffer_size, streamdev2));
  HipTest::vector_square<int>
      <<<grid_size, block_size, 0, streamdev2>>>(buf_d2, buf_d2, buffer_size);

  // Join the device 1 stream back into the origin stream before ending the capture.
  HIP_CHECK(hipEventRecord(eventdev2, streamdev2));
  HIP_CHECK(hipStreamWaitEvent(streamdev1, eventdev2));
  HIP_CHECK(hipStreamEndCapture(streamdev1, &graph));

  HIP_CHECK(hipSetDevice(0));
  HIP_CHECK(hipGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graph_exec, streamdev1));
  HIP_CHECK(hipStreamSynchronize(streamdev1));

  // Read the result back from device 1. The source is device memory and the destination is
  // host memory, so the copy kind must be device-to-host.
  HIP_CHECK(hipSetDevice(1));
  HIP_CHECK(hipMemcpy(outbuf_h, buf_d2, sizeof(int) * buffer_size, hipMemcpyDeviceToHost));
  check_output(ibuf_h, outbuf_h, buffer_size);

  HIP_CHECK(hipGraphExecDestroy(graph_exec));
  HIP_CHECK(hipGraphDestroy(graph));
  delete[] ibuf_h;
  delete[] outbuf_h;
  HIP_CHECK(hipFree(buf_d1));
  HIP_CHECK(hipFree(buf_d2));
  HIP_CHECK(hipStreamDestroy(streamdev1));
  HIP_CHECK(hipStreamDestroy(streamdev2));
  HIP_CHECK(hipEventDestroy(eventdev1));
  HIP_CHECK(hipEventDestroy(eventdev2));
}
/**
* End doxygen group hipGraphLaunch.
* @}
*/