From 799f0323cdaf5d020c9e8f8b24b2cd7e5571539f Mon Sep 17 00:00:00 2001 From: Laurent Morichetti Date: Tue, 18 Oct 2022 20:06:44 -0700 Subject: [PATCH] Report HSA_OPS activities using the ROCr driver_node_id instead of the device's index When multiple ranks are used, each rank's first logical device always has GPU ID 0, regardless of which physical device is selected with CUDA_VISIBLE_DEVICES. Because of this, when merging trace files from multiple ranks, GPU IDs from different processes may overlap. The long-term solution is to use the KFD's gpu_id, which is stable across APIs and processes. Unfortunately, the gpu_id is not yet exposed by ROCr, so for now use the driver's node id. Change-Id: I2f5af8d2a7e8a89efeb5e0a1b86bdfa547b25fc8 --- src/roctracer/hsa_support.cpp | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/roctracer/hsa_support.cpp b/src/roctracer/hsa_support.cpp index d6bfa7b4a0..31153987d3 100644 --- a/src/roctracer/hsa_support.cpp +++ b/src/roctracer/hsa_support.cpp @@ -61,7 +61,7 @@ AmdExtTable saved_amd_ext_api{}; hsa_ven_amd_loader_1_01_pfn_t hsa_loader_api{}; struct AgentInfo { - int index; + uint32_t id; hsa_device_type_t type; }; std::unordered_map agent_info_map; @@ -275,7 +275,7 @@ hsa_status_t MemoryPoolAllocateIntercept(hsa_amd_memory_pool_t pool, size_t size hsa_evt_data_t data{}; data.device.type = it->second.type; - data.device.id = it->second.index; + data.device.id = it->second.id; data.device.agent = agent; data.device.ptr = ptr; @@ -314,7 +314,7 @@ hsa_status_t AgentsAllowAccessIntercept(uint32_t num_agents, const hsa_agent_t* hsa_evt_data_t data{}; data.device.type = it->second.type; - data.device.id = it->second.index; + data.device.id = it->second.id; data.device.agent = agent; data.device.ptr = ptr; @@ -540,15 +540,20 @@ void Initialize(HsaApiTable* table) { switch (agent_info.type) { case HSA_DEVICE_TYPE_CPU: static int cpu_agent_count = 0; - agent_info.index = cpu_agent_count++; - 
break; - case HSA_DEVICE_TYPE_GPU: - static int gpu_agent_count = 0; - agent_info.index = gpu_agent_count++; + agent_info.id = cpu_agent_count++; break; + case HSA_DEVICE_TYPE_GPU: { + uint32_t driver_node_id; + if (hsa_support::saved_core_api.hsa_agent_get_info_fn( + agent, static_cast(HSA_AMD_AGENT_INFO_DRIVER_NODE_ID), + &driver_node_id) != HSA_STATUS_SUCCESS) + fatal("hsa_agent_get_info failed"); + + agent_info.id = driver_node_id; + } break; default: static int other_agent_count = 0; - agent_info.index = other_agent_count++; + agent_info.id = other_agent_count++; break; } hsa_support::agent_info_map.emplace(agent.handle, agent_info);