Profiler - Add partition support

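NOTE: The GPU ordering used here is not the same as in HSA/HIP.

GPUs are enumerated in amdsmi order, and each amdsmi processor is then
matched to a profiler agent by comparing their GPU_ID (KFD ID) values, so
that GPU partitions are mapped to the corresponding agent.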

Change-Id: If379214f5281d7d5ee98515b3e5ba7affc2e2197
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
このコミットが含まれているのは:
Galantsev, Dmitrii
2025-05-21 18:40:15 -05:00
committed by Galantsev, Dmitrii
コミット 85b619b2f0
9個のファイルの変更172行の追加137行の削除
+21 -19
ファイルの表示
@@ -45,6 +45,7 @@ FLD_DESC_ENT(RDC_FI_REV_ID, "Revision ID of the device",
FLD_DESC_ENT(RDC_FI_TARGET_GRAPHICS_VERSION, "GFX version of the device", "GFX", true)
FLD_DESC_ENT(RDC_FI_NUM_OF_COMPUTE_UNITS, "Number of Compute Units", "COMPUTE_UNITS", true)
FLD_DESC_ENT(RDC_FI_UUID, "Unique ID of the device AKA asic_serial", "UUID", true)
FLD_DESC_ENT(RDC_FI_GPU_PARTITION_COUNT, "GPU partition count", "PARTITION_COUNT", true)
FLD_DESC_ENT(RDC_FI_GPU_CLOCK, "Current GPU clock frequencies", "GPU_CLOCK", true)
FLD_DESC_ENT(RDC_FI_MEM_CLOCK, "Current Memory clock frequencies", "MEM_CLOCK", true)
@@ -136,25 +137,25 @@ FLD_DESC_ENT(RDC_FI_XGMI_TOTAL_WRITE_KB, "XGMI accumlated data write size acr
// This doesn't map to rocprofiler counters directly
// See counter_map in rdc/include/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.h
// See metrics.xml in rocprofiler
FLD_DESC_ENT(RDC_FI_PROF_OCCUPANCY_PERCENT, "Percent of GPU occupancy", "OCCUPANCY_PERCENT", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false)
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false)
FLD_DESC_ENT(RDC_FI_PROF_TENSOR_ACTIVE_PERCENT, "Percent of Active Pipe Tensors", "TENSOR_PERCENT", false)
FLD_DESC_ENT(RDC_FI_PROF_GPU_UTIL_PERCENT, "Percent of GPU Utilization", "GPU_UTIL_PERCENT", false)
FLD_DESC_ENT(RDC_FI_PROF_OCCUPANCY_PERCENT, "Percent of GPU occupancy", "OCCUPANCY_PERCENT", true)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", true)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", true)
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", true)
FLD_DESC_ENT(RDC_FI_PROF_TENSOR_ACTIVE_PERCENT, "Percent of Active Pipe Tensors", "TENSOR_PERCENT", true)
FLD_DESC_ENT(RDC_FI_PROF_GPU_UTIL_PERCENT, "Percent of GPU Utilization", "GPU_UTIL_PERCENT", true)
// metrics with EVAL are divided by time passed
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "Fetched from video memory kb / ms", "MEM_R_BW", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "Written to video memory kb / ms", "MEM_W_BW", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / ms", "FLOPS_16", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / ms", "FLOPS_32", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / ms", "FLOPS_64", false)
FLD_DESC_ENT(RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL, "Percent of Active Pipe VALU", "VALU_UTILIZATION", false)
FLD_DESC_ENT(RDC_FI_PROF_SM_ACTIVE, "Ratio of Cycles with active warp on SM","VALUBusy", false)
FLD_DESC_ENT(RDC_FI_PROF_OCC_PER_ACTIVE_CU, "Mean occ per active compute unit", "OCC_CU", false)
FLD_DESC_ENT(RDC_FI_PROF_OCC_ELAPSED, "Mean occ per active cu over elapsed", "OCC_CU_ELAPSED", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16_PERCENT, "Number of fp16 OPS percent of max", "FLOPS_16_PERCENT", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32_PERCENT, "Number of fp32 OPS percent of max", "FLOPS_32_PERCENT", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64_PERCENT, "Number of fp64 OPS percent of max", "FLOPS_64_PERCENT", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "Fetched from video memory kb / ms", "MEM_R_BW", true)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "Written to video memory kb / ms", "MEM_W_BW", true)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / ms", "FLOPS_16", true)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / ms", "FLOPS_32", true)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / ms", "FLOPS_64", true)
FLD_DESC_ENT(RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL, "Percent of Active Pipe VALU", "VALU_UTILIZATION", true)
FLD_DESC_ENT(RDC_FI_PROF_SM_ACTIVE, "Ratio of Cycles with active warp on SM","VALUBusy", true)
FLD_DESC_ENT(RDC_FI_PROF_OCC_PER_ACTIVE_CU, "Mean occ per active compute unit", "OCC_CU", true)
FLD_DESC_ENT(RDC_FI_PROF_OCC_ELAPSED, "Mean occ per active cu over elapsed", "OCC_CU_ELAPSED", true)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16_PERCENT, "Number of fp16 OPS percent of max", "FLOPS_16_PERCENT", true)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32_PERCENT, "Number of fp32 OPS percent of max", "FLOPS_32_PERCENT", true)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64_PERCENT, "Number of fp64 OPS percent of max", "FLOPS_64_PERCENT", true)
// CPC
FLD_DESC_ENT(RDC_FI_PROF_CPC_CPC_STAT_BUSY, "", "CPC_CPC_STAT_BUSY", false)
FLD_DESC_ENT(RDC_FI_PROF_CPC_CPC_STAT_IDLE, "", "CPC_CPC_STAT_IDLE", false)
@@ -194,7 +195,8 @@ FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "", "CPF_CPF_TCIU_I
FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_STALL, "", "CPF_CPF_TCIU_STALL", false)
// Misc
FLD_DESC_ENT(RDC_FI_PROF_SIMD_UTILIZATION, "Fraction of time the SIMDs are being utilized", "SIMD_UTILIZATION", false)
FLD_DESC_ENT(RDC_FI_PROF_UUID, "UUID from rocprofiler", "PROF_UUID", false)
FLD_DESC_ENT(RDC_FI_PROF_UUID, "UUID from rocprofiler", "PROF_UUID", true)
FLD_DESC_ENT(RDC_FI_PROF_KFD_ID, "GPU_ID from rocprofiler, same as KFD_ID", "PROF_KFD_ID", true)
// Events
FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false)
+2
ファイルの表示
@@ -173,6 +173,7 @@ typedef enum {
RDC_FI_TARGET_GRAPHICS_VERSION, //!< Target graphics version
RDC_FI_NUM_OF_COMPUTE_UNITS, //!< Number of compute units
RDC_FI_UUID, //!< Device UUID
RDC_FI_GPU_PARTITION_COUNT,
/**
* @brief Frequency related fields
@@ -344,6 +345,7 @@ typedef enum {
RDC_FI_PROF_CPF_CPF_TCIU_STALL,
RDC_FI_PROF_SIMD_UTILIZATION,
RDC_FI_PROF_UUID,
RDC_FI_PROF_KFD_ID,
/**
* @brief Raw XGMI counter events
+9 -5
ファイルの表示
@@ -69,18 +69,22 @@ class RdcRocpBase {
*/
static const uint32_t collection_duration_us_k = 10000;
double read_feature(rocprofiler_record_counter_t* record, uint32_t gpu_index);
/**
* @brief By default all profiler values are read as doubles
*/
double run_profiler(uint32_t gpu_index, rdc_field_t field);
void map_smi_to_profiler_by_uuid();
double run_profiler(uint32_t agent_index, rdc_field_t field);
/**
* @brief Create a map from entity_id to profiler agent_index.
* This is required because RDC entities and profiler agents are structured
* and ordered differently. Populates entity_to_prof_map.
*/
rdc_status_t map_entity_to_profiler();
std::vector<rocprofiler_agent_v0_t> agents = {};
std::vector<std::shared_ptr<CounterSampler>> samplers = {};
std::map<rdc_field_t, const char*> field_to_metric = {};
std::map<uint32_t, uint32_t> smi_to_profiler_map = {};
std::map<uint32_t, uint32_t> entity_to_prof_map = {};
// these fields must be divided by time passed
std::unordered_set<rdc_field_t> eval_fields = {
+1
ファイルの表示
@@ -178,6 +178,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_all(uint32_t gpu_index_list[RDC_
if (!count) {
return RDC_ST_BAD_PARAMETER;
}
rdc_field_value device_count;
rdc_status_t status = metric_fetcher_->fetch_smi_field(0, RDC_FI_GPU_COUNT, &device_count);
if (status != RDC_ST_OK) {
+24 -8
ファイルの表示
@@ -26,6 +26,8 @@ THE SOFTWARE.
#include <sys/time.h>
#include <chrono> //NOLINT
#include <cstddef>
#include <cstdint>
#include <set>
#include <vector>
@@ -86,7 +88,7 @@ RdcMetricFetcherImpl::~RdcMetricFetcherImpl() {
}
uint64_t RdcMetricFetcherImpl::now() {
struct timeval tv;
struct timeval tv {};
gettimeofday(&tv, NULL);
return static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
}
@@ -98,6 +100,7 @@ void RdcMetricFetcherImpl::get_ecc(uint32_t gpu_index, rdc_field_t field_id,
amdsmi_processor_handle processor_handle;
err = get_processor_handle_from_id(gpu_index, &processor_handle);
assert(err == AMDSMI_STATUS_SUCCESS);
// RDC already had an established order that is different from amd-smi, so map blocks to
// fields manually
@@ -521,9 +524,9 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
case RDC_FI_GPU_CLOCK: {
const uint16_t* clock_array = gpu_metrics.current_gfxclks;
std::vector<uint16_t> valid_clocks;
valid_clocks.reserve(8);
valid_clocks.reserve(AMDSMI_MAX_NUM_GFX_CLKS);
for (uint32_t i = 0; i < 8; i++) {
for (uint32_t i = 0; i < AMDSMI_MAX_NUM_GFX_CLKS; i++) {
uint16_t clk = clock_array[i];
if (clk != 0 && clk != 0xFFFF) {
valid_clocks.push_back(clk);
@@ -540,7 +543,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
}
if (vc == num_partitions) {
value->value.l_int = clock_array[info.instance_index] * 1000000;
value->value.l_int = static_cast<int64_t>(clock_array[info.instance_index]) * 1000000;
value->type = INTEGER;
value->status = RDC_ST_OK;
return RDC_ST_OK;
@@ -620,10 +623,12 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
}
default:
// All other fields => N/A for partition
RDC_LOG(RDC_DEBUG, "Partition " << gpu_index << ": Field " << field_id_string(field_id)
<< " not supported => NO_DATA.");
return RDC_ST_NO_DATA;
// for now we must let other plugins return valid data for partition metrics
// TODO: All other fields => N/A for partition IN AMDSMI
// RDC_LOG(RDC_DEBUG, "Partition " << gpu_index << ": Field " << field_id_string(field_id)
// << " not supported => NO_DATA.");
break;
}
} // end if partition
@@ -748,6 +753,17 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
value->value.l_int = static_cast<int64_t>(socket_count);
}
} break;
case RDC_FI_GPU_PARTITION_COUNT: {
uint32_t partition_count = 0;
amdsmi_gpu_metrics_t metrics;
memset(&metrics, 0, sizeof(metrics));
value->status = get_metrics_info(processor_handle, &metrics);
partition_count = metrics.num_partition;
value->type = INTEGER;
if (value->status == AMDSMI_STATUS_SUCCESS) {
value->value.l_int = static_cast<int64_t>(partition_count);
}
} break;
case RDC_FI_POWER_USAGE: {
amdsmi_power_info_t power_info = {};
// Handle API breaking change in amdsmi commit dc4a16da6fb45d581a6e23c78d340172989418a0
+1 -1
ファイルの表示
@@ -186,7 +186,7 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI
RDC_FI_GPU_MEMORY_MAX_BANDWIDTH, RDC_FI_GPU_MEMORY_CUR_BANDWIDTH,
RDC_FI_GPU_BUSY_PERCENT, RDC_FI_GPU_PAGE_RETRIED,
RDC_FI_DEV_ID, RDC_FI_REV_ID, RDC_FI_TARGET_GRAPHICS_VERSION,
RDC_FI_NUM_OF_COMPUTE_UNITS, RDC_FI_UUID,
RDC_FI_NUM_OF_COMPUTE_UNITS, RDC_FI_UUID, RDC_FI_GPU_PARTITION_COUNT,
};
// clang-format on
std::copy(fields.begin(), fields.end(), field_ids);
-36
ファイルの表示
@@ -228,42 +228,6 @@ rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id,
return result;
}
// Check for rocprof fields in partitions
rdc_group_info_t ginfo;
result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo);
if (result != RDC_ST_OK) {
return result;
}
bool groupHasPartition = false;
for (unsigned int i = 0; i < ginfo.count; i++) {
uint32_t entityId = ginfo.entity_ids[i];
rdc_entity_info_t info = rdc_get_info_from_entity_index(entityId);
if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) {
groupHasPartition = true;
break;
}
}
rdc_field_group_info_t field_info;
result = group_settings_->rdc_group_field_get_info(field_group_id, &field_info);
if (result != RDC_ST_OK) {
return result;
}
bool groupHasRocprof = false;
if (result == RDC_ST_OK) {
for (unsigned int i = 0; i < field_info.count; i++) {
rdc_field_t fid = field_info.field_ids[i];
if (fid >= 800 && fid < 900) { // Rocprof fields in the 800's
groupHasRocprof = true;
break;
}
}
}
if (groupHasPartition && groupHasRocprof) {
return RDC_ST_NOT_SUPPORTED;
}
// See if any of the fields are notification fields, and
// set them up, if so.
result = notifications_->set_listen_events(fields_in_watch);
+4
ファイルの表示
@@ -240,6 +240,10 @@ amdsmi_status_t get_num_partition(uint32_t index, uint16_t* num_partition) {
return ret;
}
if (num_partition == nullptr) {
return AMDSMI_STATUS_INVAL;
}
amdsmi_gpu_metrics_t metrics;
memset(&metrics, 0, sizeof(metrics));
ret = get_metrics_info(proc_handle, &metrics);
+110 -68
ファイルの表示
@@ -51,12 +51,12 @@ THE SOFTWARE.
namespace amd {
namespace rdc {
double RdcRocpBase::run_profiler(uint32_t gpu_index, rdc_field_t field) {
double RdcRocpBase::run_profiler(uint32_t agent_index, rdc_field_t field) {
thread_local std::vector<rocprofiler_record_counter_t> records;
auto counter_sampler = CounterSampler::get_samplers()[gpu_index];
auto counter_sampler = CounterSampler::get_samplers()[agent_index];
if (!counter_sampler) {
RDC_LOG(RDC_ERROR, "Error: Counter sampler not found for GPU index " << gpu_index);
RDC_LOG(RDC_ERROR, "Error: Counter sampler not found for GPU index " << agent_index);
return RDC_ST_BAD_PARAMETER;
}
@@ -116,53 +116,88 @@ std::string uuid_to_string(const uint64_t uuid) {
std::string uuid_to_string(const rocprofiler_uuid_t& uuid) { return uuid_to_string(uuid.value); }
void RdcRocpBase::map_smi_to_profiler_by_uuid() {
std::map<uint32_t, rocprofiler_uuid_t> index_to_prof_map;
std::map<uint32_t, rocprofiler_uuid_t> index_to_smi_map;
rdc_status_t RdcRocpBase::map_entity_to_profiler() {
// std::map<uint32_t, uint32_t> entity_to_index_map;
// kfd_id_t is only used inside this function
typedef uint64_t kfd_id_t;
std::map<uint32_t, kfd_id_t> prof_kfd_map;
// find intersection of supported and requested fields
for (uint32_t gpu_index = 0; gpu_index < agents.size(); gpu_index++) {
index_to_prof_map.insert({gpu_index, agents[gpu_index].uuid});
amdsmi_processor_handle processor_handle = nullptr;
auto amdsmi_status = get_processor_handle_from_id(gpu_index, &processor_handle);
if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
continue;
}
amdsmi_asic_info_t asic_info;
amdsmi_status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
continue;
}
rocprofiler_uuid_t temp_id = asic_serial_to_uuid(asic_info.asic_serial);
index_to_smi_map.insert({gpu_index, temp_id});
// clang-format off
RDC_LOG(RDC_DEBUG, "\n"
"ID[" << gpu_index << "]:\n"
" PROF: " << uuid_to_string(index_to_prof_map[gpu_index]) << "\n"
" SMI: " << uuid_to_string(index_to_smi_map[gpu_index]));
// clang-format on
// populate profiler map
for (uint32_t prof_gpu_index = 0; prof_gpu_index < agents.size(); prof_gpu_index++) {
prof_kfd_map.insert({prof_gpu_index, agents[prof_gpu_index].gpu_id});
}
// Create a mapping from SMI to ROCProfiler by comparing uuid
for (const auto& [smi_index, smi_uuid] : index_to_smi_map) {
for (const auto& [prof_index, prof_uuid] : index_to_prof_map) {
if (std::memcmp(&smi_uuid, &prof_uuid, sizeof(rocprofiler_uuid_t)) == 0) {
// match found
smi_to_profiler_map[smi_index] = prof_index;
break;
std::vector<amdsmi_socket_handle> sockets;
auto amdsmi_status = get_socket_handles(sockets);
if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "Failed to get socket handles: " << amdsmi_status);
return Smi2RdcError(amdsmi_status);
}
for (int socket_index = 0; socket_index < sockets.size(); socket_index++) {
auto* socket = sockets[socket_index];
std::vector<amdsmi_processor_handle> processors;
amdsmi_status = get_processor_handles(socket, processors);
if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "Failed to get processor handles for socket " << socket_index << ": "
<< amdsmi_status);
return Smi2RdcError(amdsmi_status);
}
for (int processor_index = 0; processor_index < processors.size(); processor_index++) {
auto* processor = processors[processor_index];
processor_type_t processor_type = AMDSMI_PROCESSOR_TYPE_UNKNOWN;
amdsmi_status = amdsmi_get_processor_type(processor, &processor_type);
if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "Failed to get processor type for processor "
<< processor_index << " on socket " << socket_index << ": "
<< amdsmi_status);
return Smi2RdcError(amdsmi_status);
}
if (processor_type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
continue;
}
amdsmi_kfd_info_t kfd_info;
amdsmi_status = amdsmi_get_gpu_kfd_info(processor, &kfd_info);
if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "Failed to get KFD info for processor "
<< processor_index << " on socket " << socket_index << ": "
<< amdsmi_status);
return Smi2RdcError(amdsmi_status);
}
rdc_entity_info_t entity_info = {
.device_index = static_cast<uint32_t>(socket_index),
.instance_index = static_cast<uint32_t>(processor_index),
.entity_role = RDC_DEVICE_ROLE_PHYSICAL,
.device_type = RDC_DEVICE_TYPE_GPU,
};
uint32_t entity_index = rdc_get_entity_index_from_info(entity_info);
for (const auto& [prof_index, prof_id] : prof_kfd_map) {
if (std::memcmp(&kfd_info.kfd_id, &prof_id, sizeof(kfd_id_t)) == 0) {
// match found
// clang-format off
RDC_LOG(RDC_DEBUG, "SMI[" << entity_index << "] <-> Profiler[" << prof_index << "] = KFD_ID[" << prof_id << "]");
// clang-format on
if (entity_info.entity_role == RDC_DEVICE_ROLE_PHYSICAL) {
entity_index = rdc_get_entity_index_from_info(entity_info);
entity_to_prof_map.insert({entity_index, prof_index});
}
if (processors.size() > 1) {
// if there are multiple processors, also add entity with partition instance type
entity_info.entity_role = RDC_DEVICE_ROLE_PARTITION_INSTANCE;
entity_index = rdc_get_entity_index_from_info(entity_info);
entity_to_prof_map.insert({entity_index, prof_index});
}
break;
}
}
}
}
for (const auto& [smi_index, prof_index] : smi_to_profiler_map) {
const auto& prof_uuid = index_to_prof_map[prof_index];
const auto& smi_uuid = index_to_smi_map[smi_index];
RDC_LOG(RDC_DEBUG, "SMI index " << smi_index << " maps to ROCProfiler index " << prof_index
<< " with UUID: " << uuid_to_string(prof_uuid) << " = "
<< uuid_to_string(smi_uuid));
}
return RDC_ST_OK;
}
RdcRocpBase::RdcRocpBase() {
@@ -226,7 +261,8 @@ RdcRocpBase::RdcRocpBase() {
{RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "CPF_CPF_TCIU_IDLE"},
{RDC_FI_PROF_CPF_CPF_TCIU_STALL, "CPF_CPF_TCIU_STALL"},
{RDC_FI_PROF_SIMD_UTILIZATION, "SIMD_UTILIZATION"},
{RDC_FI_PROF_UUID, "SQ_WAVES"}, // dummy value,
{RDC_FI_PROF_UUID, "SQ_WAVES"}, // dummy value,
{RDC_FI_PROF_KFD_ID, "SQ_WAVES"}, // dummy value,
};
hsa_status_t status = hsa_init();
@@ -251,23 +287,22 @@ RdcRocpBase::RdcRocpBase() {
RDC_LOG(RDC_DEBUG, "Agent count: " << agents.size());
samplers = CounterSampler::get_samplers();
map_smi_to_profiler_by_uuid();
map_entity_to_profiler();
// find intersection of supported and requested fields
for (uint32_t gpu_index = 0; gpu_index < agents.size(); gpu_index++) {
auto& cs = *samplers[gpu_index];
RDC_LOG(RDC_DEBUG, "gpu_index[" << gpu_index << "] = node_id[" << agents[gpu_index].node_id
<< "] agent_id[" << agents[gpu_index].id.handle << "]");
uint32_t agent_index = 0;
auto& cs = *samplers[agent_index];
RDC_LOG(RDC_DEBUG, "agent_index[" << agent_index << "] location_id["
<< agents[agent_index].location_id << "]");
for (auto& [str, id] : CounterSampler::get_supported_counters(cs.get_agent())) {
checked_fields.emplace_back(str);
}
for (auto& [str, id] : CounterSampler::get_supported_counters(cs.get_agent())) {
checked_fields.emplace_back(str);
}
for (const auto& [k, v] : temp_field_map_k) {
auto found = std::find(checked_fields.begin(), checked_fields.end(), v);
if (found != checked_fields.end()) {
field_to_metric.insert({k, v});
}
for (const auto& [k, v] : temp_field_map_k) {
auto found = std::find(checked_fields.begin(), checked_fields.end(), v);
if (found != checked_fields.end()) {
field_to_metric.insert({k, v});
}
}
@@ -276,7 +311,7 @@ RdcRocpBase::RdcRocpBase() {
all_fields.emplace_back(v);
}
RDC_LOG(RDC_DEBUG, "Rocprofiler supports " << field_to_metric.size() << " fields");
RDC_LOG(RDC_DEBUG, "Profiler supports " << field_to_metric.size() << " fields");
}
RdcRocpBase::~RdcRocpBase() {
@@ -292,7 +327,8 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
// default type
*type = DOUBLE;
const auto& gpu_index = smi_to_profiler_map[gpu_field.gpu_index];
// convert from entity to flat index
uint32_t agent_index = entity_to_prof_map[gpu_field.gpu_index];
const auto& field = gpu_field.field_id;
if (data == nullptr) {
@@ -303,7 +339,7 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
const auto start_time = std::chrono::high_resolution_clock::now();
// direct read from rocprofiler
const double read_dbl = run_profiler(gpu_index, field);
const double read_dbl = run_profiler(agent_index, field);
const auto stop_time = std::chrono::high_resolution_clock::now();
const double elapsed = std::chrono::duration<double, std::milli>(stop_time - start_time).count();
// divide by elapsed time if needed
@@ -330,8 +366,8 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
// function
const double active_cycles_val = read_dbl;
if (active_cycles_val != 0.0) {
// read second value from rocprofiler
const double occupancy_val = run_profiler(gpu_index, RDC_FI_PROF_OCC_PER_ACTIVE_CU);
// read second value from profiler
const double occupancy_val = run_profiler(agent_index, RDC_FI_PROF_OCC_PER_ACTIVE_CU);
data->dbl = occupancy_val / active_cycles_val;
} else {
return RDC_ST_BAD_PARAMETER;
@@ -343,14 +379,14 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
return RDC_ST_BAD_PARAMETER;
}
// 1024, 2048, and 256 are taken from "INTRODUCING AMD CDNA 3 ARCHITECTURE" white paper
const std::string target_version = agents[gpu_index].name;
const std::string target_version = agents[agent_index].name;
// TODO: Design a lookup table for other GPUs
const bool isMI200 = (target_version.find("gfx90a") != std::string::npos);
// FLOPS/clock/CU
if (isMI200) {
data->dbl = divided_dbl / (1024.0F / static_cast<double>(agents[gpu_index].simd_per_cu));
data->dbl = divided_dbl / (1024.0F / static_cast<double>(agents[agent_index].simd_per_cu));
} else { // Assume mi300
data->dbl = divided_dbl / (2048.0F / static_cast<double>(agents[gpu_index].simd_per_cu));
data->dbl = divided_dbl / (2048.0F / static_cast<double>(agents[agent_index].simd_per_cu));
}
} break;
case RDC_FI_PROF_EVAL_FLOPS_32_PERCENT:
@@ -360,15 +396,21 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
return RDC_ST_BAD_PARAMETER;
}
// FLOPS/clock/CU
data->dbl = divided_dbl / (256.0F / static_cast<double>(agents[gpu_index].simd_per_cu));
data->dbl = divided_dbl / (256.0F / static_cast<double>(agents[agent_index].simd_per_cu));
break;
case RDC_FI_PROF_UUID: {
// do not care what RDC_FI_PROF_UUID is mapped to. read value from agents
*type = STRING;
std::string uuid_str = uuid_to_string(agents[gpu_index].uuid);
std::string uuid_str = uuid_to_string(agents[agent_index].uuid);
strncpy_with_null(data->str, uuid_str.c_str(), uuid_str.length());
break;
}
case RDC_FI_PROF_KFD_ID: {
// do not care what RDC_FI_PROF_KFD_ID is mapped to. read value from agents
*type = INTEGER;
data->l_int = agents[agent_index].gpu_id;
break;
}
default:
// only support default fallback for doubles
assert(*type == DOUBLE);