Report HSA_OPS activities using the ROCr driver_node_id instead of the device's index
When multiple ranks are used, each rank's first logical device always has GPU ID 0, regardless of which physical device is selected with CUDA_VISIBLE_DEVICES. Because of this, when merging trace files from multiple ranks, GPU IDs from different processes may overlap. The long term solution is to use the KFD's gpu_id which is stable across APIs and processes. Unfortunately the gpu_id is not yet exposed by the ROCr, so for now use the driver's node id. Change-Id: I2f5af8d2a7e8a89efeb5e0a1b86bdfa547b25fc8
This commit is contained in:
@@ -61,7 +61,7 @@ AmdExtTable saved_amd_ext_api{};
|
||||
hsa_ven_amd_loader_1_01_pfn_t hsa_loader_api{};
|
||||
|
||||
struct AgentInfo {
|
||||
int index;
|
||||
uint32_t id;
|
||||
hsa_device_type_t type;
|
||||
};
|
||||
std::unordered_map<decltype(hsa_agent_t::handle), AgentInfo> agent_info_map;
|
||||
@@ -275,7 +275,7 @@ hsa_status_t MemoryPoolAllocateIntercept(hsa_amd_memory_pool_t pool, size_t size
|
||||
|
||||
hsa_evt_data_t data{};
|
||||
data.device.type = it->second.type;
|
||||
data.device.id = it->second.index;
|
||||
data.device.id = it->second.id;
|
||||
data.device.agent = agent;
|
||||
data.device.ptr = ptr;
|
||||
|
||||
@@ -314,7 +314,7 @@ hsa_status_t AgentsAllowAccessIntercept(uint32_t num_agents, const hsa_agent_t*
|
||||
|
||||
hsa_evt_data_t data{};
|
||||
data.device.type = it->second.type;
|
||||
data.device.id = it->second.index;
|
||||
data.device.id = it->second.id;
|
||||
data.device.agent = agent;
|
||||
data.device.ptr = ptr;
|
||||
|
||||
@@ -540,15 +540,20 @@ void Initialize(HsaApiTable* table) {
|
||||
switch (agent_info.type) {
|
||||
case HSA_DEVICE_TYPE_CPU:
|
||||
static int cpu_agent_count = 0;
|
||||
agent_info.index = cpu_agent_count++;
|
||||
break;
|
||||
case HSA_DEVICE_TYPE_GPU:
|
||||
static int gpu_agent_count = 0;
|
||||
agent_info.index = gpu_agent_count++;
|
||||
agent_info.id = cpu_agent_count++;
|
||||
break;
|
||||
case HSA_DEVICE_TYPE_GPU: {
|
||||
uint32_t driver_node_id;
|
||||
if (hsa_support::saved_core_api.hsa_agent_get_info_fn(
|
||||
agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_DRIVER_NODE_ID),
|
||||
&driver_node_id) != HSA_STATUS_SUCCESS)
|
||||
fatal("hsa_agent_get_info failed");
|
||||
|
||||
agent_info.id = driver_node_id;
|
||||
} break;
|
||||
default:
|
||||
static int other_agent_count = 0;
|
||||
agent_info.index = other_agent_count++;
|
||||
agent_info.id = other_agent_count++;
|
||||
break;
|
||||
}
|
||||
hsa_support::agent_info_map.emplace(agent.handle, agent_info);
|
||||
|
||||
Reference in New Issue
Block a user