From 860e33dbed8a2e3e0414a2e15cd88ccc5bdbf578 Mon Sep 17 00:00:00 2001 From: Laurent Morichetti Date: Tue, 18 Oct 2022 19:51:02 -0700 Subject: [PATCH] SWDEV-362046 - Report HIP_OPS activities using the ROCr driver_node_id instead of the device's index The ROCclr assigns zero-based IDs to GPUs in the order they are discovered. That zero-based ID is what is used to identify the GPU on which the HIP_OPS activity took place. When multiple ranks are used, each rank's first logical device always has GPU ID 0, regardless of which physical device is selected with CUDA_VISIBLE_DEVICES. Because of this, when merging trace files from multiple ranks, GPU IDs from different processes may overlap. The long term solution is to use the KFD's gpu_id which is stable across APIs and processes. Unfortunately the gpu_id is not yet exposed by the ROCr, so for now use the driver's node id. Change-Id: Ib78854527d600d175bb76e2df0747c33f898c615 [ROCm/clr commit: 9a82118c85a69c25f5202dee84d59e5f306c1681] --- projects/clr/rocclr/device/device.hpp | 2 ++ projects/clr/rocclr/device/rocm/rocdevice.cpp | 7 +++++++ projects/clr/rocclr/platform/activity.cpp | 4 ++-- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index 8d20b3ad17..1d03f90afb 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -613,6 +613,8 @@ struct Info : public amd::EmbeddedObject { bool virtualMemoryManagement_; //!< Virtual memory management support size_t virtualMemAllocGranularity_; //!< virtual memory allocation size/addr granularity + + uint32_t driverNodeId_; }; //! 
Device settings diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 9dea3d444d..49616b8523 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -1179,6 +1179,13 @@ bool Device::populateOCLDeviceConstants() { } assert(info_.globalMemChannels_ > 0); + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(bkendDevice_, + static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_DRIVER_NODE_ID), + &info_.driverNodeId_)) { + return false; + } + setupCpuAgent(); checkAtomicSupport(); diff --git a/projects/clr/rocclr/platform/activity.cpp b/projects/clr/rocclr/platform/activity.cpp index d5678b4fce..2e6d61962e 100644 --- a/projects/clr/rocclr/platform/activity.cpp +++ b/projects/clr/rocclr/platform/activity.cpp @@ -73,8 +73,8 @@ void ReportActivity(const amd::Command& command) { command.profilingInfo().start_, // begin timestamp, ns command.profilingInfo().end_, // end timestamp, ns {{ - static_cast<int>(queue->device().index()), // device id - queue->vdev()->index() // queue id + static_cast<int>(queue->device().info().driverNodeId_), // device id + queue->vdev()->index() // queue id }}, {} // copied data size for memcpy, or kernel name for dispatch };