Profiler - Add partition support

NOTE: The GPU ordering used here is not the same as in HSA/HIP. GPUs are enumerated via amdsmi, and the GPU_ID (KFD ID) fields are then compared to map GPU partitions to the matching profiler agents. Change-Id: If379214f5281d7d5ee98515b3e5ba7affc2e2197 Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
2025-05-21 18:40:15 -05:00
@@ -45,6 +45,7 @@ FLD_DESC_ENT(RDC_FI_REV_ID,              "Revision ID of the device",
 FLD_DESC_ENT(RDC_FI_TARGET_GRAPHICS_VERSION, "GFX version of the device",               "GFX",              true)
 FLD_DESC_ENT(RDC_FI_NUM_OF_COMPUTE_UNITS, "Number of Compute Units",                    "COMPUTE_UNITS",    true)
 FLD_DESC_ENT(RDC_FI_UUID,                "Unique ID of the device AKA asic_serial",     "UUID",             true)
+FLD_DESC_ENT(RDC_FI_GPU_PARTITION_COUNT, "GPU partition count",                         "PARTITION_COUNT",  true)

 FLD_DESC_ENT(RDC_FI_GPU_CLOCK,           "Current GPU clock frequencies",               "GPU_CLOCK",        true)
 FLD_DESC_ENT(RDC_FI_MEM_CLOCK,           "Current Memory clock frequencies",            "MEM_CLOCK",        true)
@@ -136,25 +137,25 @@ FLD_DESC_ENT(RDC_FI_XGMI_TOTAL_WRITE_KB,    "XGMI accumlated data write size acr
 // This doesn't map to rocprofiler counters directly
 // See counter_map in rdc/include/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.h
 // See metrics.xml in rocprofiler
-FLD_DESC_ENT(RDC_FI_PROF_OCCUPANCY_PERCENT,     "Percent of GPU occupancy",              "OCCUPANCY_PERCENT", false)
-FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES,         "Number of Active Cycles",               "ACTIVE_CYCLES",     false)
-FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES,          "Number of Active Waves",                "ACTIVE_WAVES",      false)
-FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES,        "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES",    false)
-FLD_DESC_ENT(RDC_FI_PROF_TENSOR_ACTIVE_PERCENT, "Percent of Active Pipe Tensors",        "TENSOR_PERCENT",    false)
-FLD_DESC_ENT(RDC_FI_PROF_GPU_UTIL_PERCENT,      "Percent of GPU Utilization",            "GPU_UTIL_PERCENT",  false)
+FLD_DESC_ENT(RDC_FI_PROF_OCCUPANCY_PERCENT,     "Percent of GPU occupancy",              "OCCUPANCY_PERCENT", true)
+FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES,         "Number of Active Cycles",               "ACTIVE_CYCLES",     true)
+FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES,          "Number of Active Waves",                "ACTIVE_WAVES",      true)
+FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES,        "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES",    true)
+FLD_DESC_ENT(RDC_FI_PROF_TENSOR_ACTIVE_PERCENT, "Percent of Active Pipe Tensors",        "TENSOR_PERCENT",    true)
+FLD_DESC_ENT(RDC_FI_PROF_GPU_UTIL_PERCENT,      "Percent of GPU Utilization",            "GPU_UTIL_PERCENT",  true)
 // metrics with EVAL are divided by time passed
-FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW,         "Fetched from video memory kb / ms",     "MEM_R_BW",          false)
-FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW,         "Written to video memory kb / ms",       "MEM_W_BW",          false)
-FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16,         "Number of fp16 OPS / ms",               "FLOPS_16",          false)
-FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32,         "Number of fp32 OPS / ms",               "FLOPS_32",          false)
-FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64,         "Number of fp64 OPS / ms",               "FLOPS_64",          false)
-FLD_DESC_ENT(RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL,  "Percent of Active Pipe VALU",           "VALU_UTILIZATION",  false)
-FLD_DESC_ENT(RDC_FI_PROF_SM_ACTIVE,             "Ratio of Cycles with active warp on SM","VALUBusy",          false)
-FLD_DESC_ENT(RDC_FI_PROF_OCC_PER_ACTIVE_CU,     "Mean occ per active compute unit",      "OCC_CU",            false)
-FLD_DESC_ENT(RDC_FI_PROF_OCC_ELAPSED,           "Mean occ per active cu over elapsed",   "OCC_CU_ELAPSED",    false)
-FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16_PERCENT, "Number of fp16 OPS percent of max",     "FLOPS_16_PERCENT",  false)
-FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32_PERCENT, "Number of fp32 OPS percent of max",     "FLOPS_32_PERCENT",  false)
-FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64_PERCENT, "Number of fp64 OPS percent of max",     "FLOPS_64_PERCENT",  false)
+FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW,         "Fetched from video memory kb / ms",     "MEM_R_BW",          true)
+FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW,         "Written to video memory kb / ms",       "MEM_W_BW",          true)
+FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16,         "Number of fp16 OPS / ms",               "FLOPS_16",          true)
+FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32,         "Number of fp32 OPS / ms",               "FLOPS_32",          true)
+FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64,         "Number of fp64 OPS / ms",               "FLOPS_64",          true)
+FLD_DESC_ENT(RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL,  "Percent of Active Pipe VALU",           "VALU_UTILIZATION",  true)
+FLD_DESC_ENT(RDC_FI_PROF_SM_ACTIVE,             "Ratio of Cycles with active warp on SM","VALUBusy",          true)
+FLD_DESC_ENT(RDC_FI_PROF_OCC_PER_ACTIVE_CU,     "Mean occ per active compute unit",      "OCC_CU",            true)
+FLD_DESC_ENT(RDC_FI_PROF_OCC_ELAPSED,           "Mean occ per active cu over elapsed",   "OCC_CU_ELAPSED",    true)
+FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16_PERCENT, "Number of fp16 OPS percent of max",     "FLOPS_16_PERCENT",  true)
+FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32_PERCENT, "Number of fp32 OPS percent of max",     "FLOPS_32_PERCENT",  true)
+FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64_PERCENT, "Number of fp64 OPS percent of max",     "FLOPS_64_PERCENT",  true)
 // CPC
 FLD_DESC_ENT(RDC_FI_PROF_CPC_CPC_STAT_BUSY,                  "", "CPC_CPC_STAT_BUSY",                  false)
 FLD_DESC_ENT(RDC_FI_PROF_CPC_CPC_STAT_IDLE,                  "", "CPC_CPC_STAT_IDLE",                  false)
@@ -194,7 +195,8 @@ FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_IDLE,                  "", "CPF_CPF_TCIU_I
 FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_STALL,                 "", "CPF_CPF_TCIU_STALL",                 false)
 // Misc
 FLD_DESC_ENT(RDC_FI_PROF_SIMD_UTILIZATION, "Fraction of time the SIMDs are being utilized", "SIMD_UTILIZATION", false)
-FLD_DESC_ENT(RDC_FI_PROF_UUID,             "UUID from rocprofiler",                         "PROF_UUID",        false)
+FLD_DESC_ENT(RDC_FI_PROF_UUID,             "UUID from rocprofiler",                         "PROF_UUID",        true)
+FLD_DESC_ENT(RDC_FI_PROF_KFD_ID,           "GPU_ID from rocprofiler, same as KFD_ID",       "PROF_KFD_ID",      true)

 // Events
 FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX,     "NOPs sent to neighbor 0",                     "XGMI_NOP_0",       false)
@@ -173,6 +173,7 @@ typedef enum {
  RDC_FI_TARGET_GRAPHICS_VERSION,  //!< Target graphics version
  RDC_FI_NUM_OF_COMPUTE_UNITS,     //!< Number of compute units
  RDC_FI_UUID,                     //!< Device UUID
+  RDC_FI_GPU_PARTITION_COUNT,

  /**
   * @brief Frequency related fields
@@ -344,6 +345,7 @@ typedef enum {
  RDC_FI_PROF_CPF_CPF_TCIU_STALL,
  RDC_FI_PROF_SIMD_UTILIZATION,
  RDC_FI_PROF_UUID,
+  RDC_FI_PROF_KFD_ID,

  /**
   * @brief Raw XGMI counter events
@@ -69,18 +69,22 @@ class RdcRocpBase {
   */
  static const uint32_t collection_duration_us_k = 10000;

-  double read_feature(rocprofiler_record_counter_t* record, uint32_t gpu_index);
-
  /**
   * @brief By default all profiler values are read as doubles
   */
-  double run_profiler(uint32_t gpu_index, rdc_field_t field);
-  void map_smi_to_profiler_by_uuid();
+  double run_profiler(uint32_t agent_index, rdc_field_t field);
+
+  /**
+   * @brief Create a map from entity_id to profiler agent_index.
+   * This is required due to different structure and ordering.
+   * Populates entity_to_prof_map.
+   */
+  rdc_status_t map_entity_to_profiler();

  std::vector<rocprofiler_agent_v0_t> agents = {};
  std::vector<std::shared_ptr<CounterSampler>> samplers = {};
  std::map<rdc_field_t, const char*> field_to_metric = {};
-  std::map<uint32_t, uint32_t> smi_to_profiler_map = {};
+  std::map<uint32_t, uint32_t> entity_to_prof_map = {};

  // these fields must be divided by time passed
  std::unordered_set<rdc_field_t> eval_fields = {
@@ -178,6 +178,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_all(uint32_t gpu_index_list[RDC_
  if (!count) {
    return RDC_ST_BAD_PARAMETER;
  }
+
  rdc_field_value device_count;
  rdc_status_t status = metric_fetcher_->fetch_smi_field(0, RDC_FI_GPU_COUNT, &device_count);
  if (status != RDC_ST_OK) {
@@ -26,6 +26,8 @@ THE SOFTWARE.
 #include <sys/time.h>

 #include <chrono>  //NOLINT
+#include <cstddef>
+#include <cstdint>
 #include <set>
 #include <vector>

@@ -86,7 +88,7 @@ RdcMetricFetcherImpl::~RdcMetricFetcherImpl() {
 }

 uint64_t RdcMetricFetcherImpl::now() {
-  struct timeval tv;
+  struct timeval tv {};
  gettimeofday(&tv, NULL);
  return static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
 }
@@ -98,6 +100,7 @@ void RdcMetricFetcherImpl::get_ecc(uint32_t gpu_index, rdc_field_t field_id,

  amdsmi_processor_handle processor_handle;
  err = get_processor_handle_from_id(gpu_index, &processor_handle);
+  assert(err == AMDSMI_STATUS_SUCCESS);

  // because RDC already had an established order that is different from amd-smi : map blocks to
  // fields manually
@@ -521,9 +524,9 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
      case RDC_FI_GPU_CLOCK: {
        const uint16_t* clock_array = gpu_metrics.current_gfxclks;
        std::vector<uint16_t> valid_clocks;
-        valid_clocks.reserve(8);
+        valid_clocks.reserve(AMDSMI_MAX_NUM_GFX_CLKS);

-        for (uint32_t i = 0; i < 8; i++) {
+        for (uint32_t i = 0; i < AMDSMI_MAX_NUM_GFX_CLKS; i++) {
          uint16_t clk = clock_array[i];
          if (clk != 0 && clk != 0xFFFF) {
            valid_clocks.push_back(clk);
@@ -540,7 +543,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
        }

        if (vc == num_partitions) {
-          value->value.l_int = clock_array[info.instance_index] * 1000000;
+          value->value.l_int = static_cast<int64_t>(clock_array[info.instance_index]) * 1000000;
          value->type = INTEGER;
          value->status = RDC_ST_OK;
          return RDC_ST_OK;
@@ -620,10 +623,12 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
      }

      default:
-        // All other fields => N/A for partition
-        RDC_LOG(RDC_DEBUG, "Partition " << gpu_index << ": Field " << field_id_string(field_id)
-                                        << " not supported => NO_DATA.");
-        return RDC_ST_NO_DATA;
+        // For now, do not return NO_DATA here: break out so that other plugins can
+        // still return valid data for partition metrics.
+        // TODO: All other fields => N/A for partition IN AMDSMI
+        // RDC_LOG(RDC_DEBUG, "Partition " << gpu_index << ": Field " << field_id_string(field_id)
+        //                                 << " not supported => NO_DATA.");
+        break;
    }
  }  // end if partition

@@ -748,6 +753,17 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
        value->value.l_int = static_cast<int64_t>(socket_count);
      }
    } break;
+    case RDC_FI_GPU_PARTITION_COUNT: {
+      uint32_t partition_count = 0;
+      amdsmi_gpu_metrics_t metrics;
+      memset(&metrics, 0, sizeof(metrics));
+      value->status = get_metrics_info(processor_handle, &metrics);
+      partition_count = metrics.num_partition;
+      value->type = INTEGER;
+      if (value->status == AMDSMI_STATUS_SUCCESS) {
+        value->value.l_int = static_cast<int64_t>(partition_count);
+      }
+    } break;
    case RDC_FI_POWER_USAGE: {
      amdsmi_power_info_t power_info = {};
 // Handle API breaking change in amdsmi commit dc4a16da6fb45d581a6e23c78d340172989418a0
@@ -186,7 +186,7 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI
      RDC_FI_GPU_MEMORY_MAX_BANDWIDTH, RDC_FI_GPU_MEMORY_CUR_BANDWIDTH,
      RDC_FI_GPU_BUSY_PERCENT,         RDC_FI_GPU_PAGE_RETRIED,
      RDC_FI_DEV_ID,                   RDC_FI_REV_ID,                 RDC_FI_TARGET_GRAPHICS_VERSION,
-      RDC_FI_NUM_OF_COMPUTE_UNITS,     RDC_FI_UUID,
+      RDC_FI_NUM_OF_COMPUTE_UNITS,     RDC_FI_UUID, RDC_FI_GPU_PARTITION_COUNT,
  };
  // clang-format on
  std::copy(fields.begin(), fields.end(), field_ids);
@@ -228,42 +228,6 @@ rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id,
    return result;
  }

-  // Check for rocprof fields in partitions
-  rdc_group_info_t ginfo;
-  result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo);
-  if (result != RDC_ST_OK) {
-    return result;
-  }
-  bool groupHasPartition = false;
-  for (unsigned int i = 0; i < ginfo.count; i++) {
-    uint32_t entityId = ginfo.entity_ids[i];
-    rdc_entity_info_t info = rdc_get_info_from_entity_index(entityId);
-    if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) {
-      groupHasPartition = true;
-      break;
-    }
-  }
-
-  rdc_field_group_info_t field_info;
-  result = group_settings_->rdc_group_field_get_info(field_group_id, &field_info);
-  if (result != RDC_ST_OK) {
-    return result;
-  }
-  bool groupHasRocprof = false;
-  if (result == RDC_ST_OK) {
-    for (unsigned int i = 0; i < field_info.count; i++) {
-      rdc_field_t fid = field_info.field_ids[i];
-      if (fid >= 800 && fid < 900) {  // Rocprof fields in the 800's
-        groupHasRocprof = true;
-        break;
-      }
-    }
-  }
-
-  if (groupHasPartition && groupHasRocprof) {
-    return RDC_ST_NOT_SUPPORTED;
-  }
-
  // See if any of the fields are notification fields, and
  // set them up, if so.
  result = notifications_->set_listen_events(fields_in_watch);
@@ -240,6 +240,10 @@ amdsmi_status_t get_num_partition(uint32_t index, uint16_t* num_partition) {
    return ret;
  }

+  if (num_partition == nullptr) {
+    return AMDSMI_STATUS_INVAL;
+  }
+
  amdsmi_gpu_metrics_t metrics;
  memset(&metrics, 0, sizeof(metrics));
  ret = get_metrics_info(proc_handle, &metrics);
@@ -51,12 +51,12 @@ THE SOFTWARE.
 namespace amd {
 namespace rdc {

-double RdcRocpBase::run_profiler(uint32_t gpu_index, rdc_field_t field) {
+double RdcRocpBase::run_profiler(uint32_t agent_index, rdc_field_t field) {
  thread_local std::vector<rocprofiler_record_counter_t> records;

-  auto counter_sampler = CounterSampler::get_samplers()[gpu_index];
+  auto counter_sampler = CounterSampler::get_samplers()[agent_index];
  if (!counter_sampler) {
-    RDC_LOG(RDC_ERROR, "Error: Counter sampler not found for GPU index " << gpu_index);
+    RDC_LOG(RDC_ERROR, "Error: Counter sampler not found for GPU index " << agent_index);
    return RDC_ST_BAD_PARAMETER;
  }

@@ -116,53 +116,88 @@ std::string uuid_to_string(const uint64_t uuid) {

 std::string uuid_to_string(const rocprofiler_uuid_t& uuid) { return uuid_to_string(uuid.value); }

-void RdcRocpBase::map_smi_to_profiler_by_uuid() {
-  std::map<uint32_t, rocprofiler_uuid_t> index_to_prof_map;
-  std::map<uint32_t, rocprofiler_uuid_t> index_to_smi_map;
+rdc_status_t RdcRocpBase::map_entity_to_profiler() {
+  // std::map<uint32_t, uint32_t> entity_to_index_map;
+  // kfd_id_t is only used inside this function
+  typedef uint64_t kfd_id_t;
+  std::map<uint32_t, kfd_id_t> prof_kfd_map;

-  // find intersection of supported and requested fields
-  for (uint32_t gpu_index = 0; gpu_index < agents.size(); gpu_index++) {
-    index_to_prof_map.insert({gpu_index, agents[gpu_index].uuid});
-
-    amdsmi_processor_handle processor_handle = nullptr;
-    auto amdsmi_status = get_processor_handle_from_id(gpu_index, &processor_handle);
-    if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
-      continue;
-    }
-    amdsmi_asic_info_t asic_info;
-    amdsmi_status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
-    if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
-      continue;
-    }
-    rocprofiler_uuid_t temp_id = asic_serial_to_uuid(asic_info.asic_serial);
-    index_to_smi_map.insert({gpu_index, temp_id});
-
-    // clang-format off
-    RDC_LOG(RDC_DEBUG, "\n"
-        "ID[" << gpu_index << "]:\n"
-        "  PROF: " << uuid_to_string(index_to_prof_map[gpu_index]) << "\n"
-        "  SMI:  " << uuid_to_string(index_to_smi_map[gpu_index]));
-    // clang-format on
+  // populate profiler map
+  for (uint32_t prof_gpu_index = 0; prof_gpu_index < agents.size(); prof_gpu_index++) {
+    prof_kfd_map.insert({prof_gpu_index, agents[prof_gpu_index].gpu_id});
  }

-  // Create a mapping from SMI to ROCProfiler by comparing uuid
-  for (const auto& [smi_index, smi_uuid] : index_to_smi_map) {
-    for (const auto& [prof_index, prof_uuid] : index_to_prof_map) {
-      if (std::memcmp(&smi_uuid, &prof_uuid, sizeof(rocprofiler_uuid_t)) == 0) {
-        // match found
-        smi_to_profiler_map[smi_index] = prof_index;
-        break;
+  std::vector<amdsmi_socket_handle> sockets;
+  auto amdsmi_status = get_socket_handles(sockets);
+  if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
+    RDC_LOG(RDC_ERROR, "Failed to get socket handles: " << amdsmi_status);
+    return Smi2RdcError(amdsmi_status);
+  }
+
+  for (int socket_index = 0; socket_index < sockets.size(); socket_index++) {
+    auto* socket = sockets[socket_index];
+    std::vector<amdsmi_processor_handle> processors;
+    amdsmi_status = get_processor_handles(socket, processors);
+    if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
+      RDC_LOG(RDC_ERROR, "Failed to get processor handles for socket " << socket_index << ": "
+                                                                       << amdsmi_status);
+      return Smi2RdcError(amdsmi_status);
+    }
+
+    for (int processor_index = 0; processor_index < processors.size(); processor_index++) {
+      auto* processor = processors[processor_index];
+      processor_type_t processor_type = AMDSMI_PROCESSOR_TYPE_UNKNOWN;
+      amdsmi_status = amdsmi_get_processor_type(processor, &processor_type);
+      if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
+        RDC_LOG(RDC_ERROR, "Failed to get processor type for processor "
+                               << processor_index << " on socket " << socket_index << ": "
+                               << amdsmi_status);
+        return Smi2RdcError(amdsmi_status);
+      }
+      if (processor_type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
+        continue;
+      }
+
+      amdsmi_kfd_info_t kfd_info;
+      amdsmi_status = amdsmi_get_gpu_kfd_info(processor, &kfd_info);
+      if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
+        RDC_LOG(RDC_ERROR, "Failed to get KFD info for processor "
+                               << processor_index << " on socket " << socket_index << ": "
+                               << amdsmi_status);
+        return Smi2RdcError(amdsmi_status);
+      }
+
+      rdc_entity_info_t entity_info = {
+          .device_index = static_cast<uint32_t>(socket_index),
+          .instance_index = static_cast<uint32_t>(processor_index),
+          .entity_role = RDC_DEVICE_ROLE_PHYSICAL,
+          .device_type = RDC_DEVICE_TYPE_GPU,
+      };
+
+      uint32_t entity_index = rdc_get_entity_index_from_info(entity_info);
+
+      for (const auto& [prof_index, prof_id] : prof_kfd_map) {
+        if (std::memcmp(&kfd_info.kfd_id, &prof_id, sizeof(kfd_id_t)) == 0) {
+          // match found
+          // clang-format off
+          RDC_LOG(RDC_DEBUG, "SMI[" << entity_index << "] <-> Profiler[" << prof_index << "] = KFD_ID[" << prof_id << "]");
+          // clang-format on
+          if (entity_info.entity_role == RDC_DEVICE_ROLE_PHYSICAL) {
+            entity_index = rdc_get_entity_index_from_info(entity_info);
+            entity_to_prof_map.insert({entity_index, prof_index});
+          }
+          if (processors.size() > 1) {
+            // if there are multiple processors, also add entity with partition instance type
+            entity_info.entity_role = RDC_DEVICE_ROLE_PARTITION_INSTANCE;
+            entity_index = rdc_get_entity_index_from_info(entity_info);
+            entity_to_prof_map.insert({entity_index, prof_index});
+          }
+          break;
+        }
      }
    }
  }
-
-  for (const auto& [smi_index, prof_index] : smi_to_profiler_map) {
-    const auto& prof_uuid = index_to_prof_map[prof_index];
-    const auto& smi_uuid = index_to_smi_map[smi_index];
-    RDC_LOG(RDC_DEBUG, "SMI index " << smi_index << " maps to ROCProfiler index " << prof_index
-                                    << " with UUID: " << uuid_to_string(prof_uuid) << " = "
-                                    << uuid_to_string(smi_uuid));
-  }
+  return RDC_ST_OK;
 }

 RdcRocpBase::RdcRocpBase() {
@@ -226,7 +261,8 @@ RdcRocpBase::RdcRocpBase() {
      {RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "CPF_CPF_TCIU_IDLE"},
      {RDC_FI_PROF_CPF_CPF_TCIU_STALL, "CPF_CPF_TCIU_STALL"},
      {RDC_FI_PROF_SIMD_UTILIZATION, "SIMD_UTILIZATION"},
-      {RDC_FI_PROF_UUID, "SQ_WAVES"},  // dummy value,
+      {RDC_FI_PROF_UUID, "SQ_WAVES"},    // dummy value,
+      {RDC_FI_PROF_KFD_ID, "SQ_WAVES"},  // dummy value,
  };

  hsa_status_t status = hsa_init();
@@ -251,23 +287,22 @@ RdcRocpBase::RdcRocpBase() {
  RDC_LOG(RDC_DEBUG, "Agent count: " << agents.size());
  samplers = CounterSampler::get_samplers();

-  map_smi_to_profiler_by_uuid();
+  map_entity_to_profiler();

  // find intersection of supported and requested fields
-  for (uint32_t gpu_index = 0; gpu_index < agents.size(); gpu_index++) {
-    auto& cs = *samplers[gpu_index];
-    RDC_LOG(RDC_DEBUG, "gpu_index[" << gpu_index << "] = node_id[" << agents[gpu_index].node_id
-                                    << "] agent_id[" << agents[gpu_index].id.handle << "]");
+  uint32_t agent_index = 0;
+  auto& cs = *samplers[agent_index];
+  RDC_LOG(RDC_DEBUG, "agent_index[" << agent_index << "] location_id["
+                                    << agents[agent_index].location_id << "]");

-    for (auto& [str, id] : CounterSampler::get_supported_counters(cs.get_agent())) {
-      checked_fields.emplace_back(str);
-    }
+  for (auto& [str, id] : CounterSampler::get_supported_counters(cs.get_agent())) {
+    checked_fields.emplace_back(str);
+  }

-    for (const auto& [k, v] : temp_field_map_k) {
-      auto found = std::find(checked_fields.begin(), checked_fields.end(), v);
-      if (found != checked_fields.end()) {
-        field_to_metric.insert({k, v});
-      }
+  for (const auto& [k, v] : temp_field_map_k) {
+    auto found = std::find(checked_fields.begin(), checked_fields.end(), v);
+    if (found != checked_fields.end()) {
+      field_to_metric.insert({k, v});
    }
  }

@@ -276,7 +311,7 @@ RdcRocpBase::RdcRocpBase() {
    all_fields.emplace_back(v);
  }

-  RDC_LOG(RDC_DEBUG, "Rocprofiler supports " << field_to_metric.size() << " fields");
+  RDC_LOG(RDC_DEBUG, "Profiler supports " << field_to_metric.size() << " fields");
 }

 RdcRocpBase::~RdcRocpBase() {
@@ -292,7 +327,8 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
  // default type
  *type = DOUBLE;

-  const auto& gpu_index = smi_to_profiler_map[gpu_field.gpu_index];
+  // translate the RDC entity index into the profiler agent (flat) index
+  uint32_t agent_index = entity_to_prof_map[gpu_field.gpu_index];
  const auto& field = gpu_field.field_id;

  if (data == nullptr) {
@@ -303,7 +339,7 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value

  const auto start_time = std::chrono::high_resolution_clock::now();
  // direct read from rocprofiler
-  const double read_dbl = run_profiler(gpu_index, field);
+  const double read_dbl = run_profiler(agent_index, field);
  const auto stop_time = std::chrono::high_resolution_clock::now();
  const double elapsed = std::chrono::duration<double, std::milli>(stop_time - start_time).count();
  // divide by elapsed time if needed
@@ -330,8 +366,8 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
      // function
      const double active_cycles_val = read_dbl;
      if (active_cycles_val != 0.0) {
-        // read second value from rocprofiler
-        const double occupancy_val = run_profiler(gpu_index, RDC_FI_PROF_OCC_PER_ACTIVE_CU);
+        // read second value from profiler
+        const double occupancy_val = run_profiler(agent_index, RDC_FI_PROF_OCC_PER_ACTIVE_CU);
        data->dbl = occupancy_val / active_cycles_val;
      } else {
        return RDC_ST_BAD_PARAMETER;
@@ -343,14 +379,14 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
        return RDC_ST_BAD_PARAMETER;
      }
      // 1024, 2048, and 256 are taken from "INTRODUCING AMD CDNA 3 ARCHITECTURE" white paper
-      const std::string target_version = agents[gpu_index].name;
+      const std::string target_version = agents[agent_index].name;
      // TODO: Design a lookup table for other GPUs
      const bool isMI200 = (target_version.find("gfx90a") != std::string::npos);
      // FLOPS/clock/CU
      if (isMI200) {
-        data->dbl = divided_dbl / (1024.0F / static_cast<double>(agents[gpu_index].simd_per_cu));
+        data->dbl = divided_dbl / (1024.0F / static_cast<double>(agents[agent_index].simd_per_cu));
      } else {  // Assume mi300
-        data->dbl = divided_dbl / (2048.0F / static_cast<double>(agents[gpu_index].simd_per_cu));
+        data->dbl = divided_dbl / (2048.0F / static_cast<double>(agents[agent_index].simd_per_cu));
      }
    } break;
    case RDC_FI_PROF_EVAL_FLOPS_32_PERCENT:
@@ -360,15 +396,21 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
        return RDC_ST_BAD_PARAMETER;
      }
      // FLOPS/clock/CU
-      data->dbl = divided_dbl / (256.0F / static_cast<double>(agents[gpu_index].simd_per_cu));
+      data->dbl = divided_dbl / (256.0F / static_cast<double>(agents[agent_index].simd_per_cu));
      break;
    case RDC_FI_PROF_UUID: {
      // do not care what RDC_FI_PROF_UUID is mapped to. read value from agents
      *type = STRING;
-      std::string uuid_str = uuid_to_string(agents[gpu_index].uuid);
+      std::string uuid_str = uuid_to_string(agents[agent_index].uuid);
      strncpy_with_null(data->str, uuid_str.c_str(), uuid_str.length());
      break;
    }
+    case RDC_FI_PROF_KFD_ID: {
+      // do not care what RDC_FI_PROF_KFD_ID is mapped to. read value from agents
+      *type = INTEGER;
+      data->l_int = agents[agent_index].gpu_id;
+      break;
+    }
    default:
      // only support default fallback for doubles
      assert(*type == DOUBLE);