Profiler - Align SMI and Profiler indices

Change-Id: If2bb850ffd1c1b8b16a8f5963a0f6971f82d4863
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>


[ROCm/rdc commit: eff955fdf7]
Этот коммит содержится в:
Galantsev, Dmitrii
2025-04-16 21:00:19 +00:00
коммит произвёл Galantsev, Dmitrii
родитель 71c654b0ee
Коммит 0d352c515e
8 изменённых файлов: 164 добавлений и 45 удалений
+9
Просмотреть файл
@@ -39,6 +39,13 @@ FLD_DESC_ENT(RDC_FI_INVALID, "Unknown/Invalid field",
FLD_DESC_ENT(RDC_FI_GPU_COUNT, "GPU count in the system", "GPU_COUNT", true)
FLD_DESC_ENT(RDC_FI_DEV_NAME, "Name of the device", "DEV_NAME", true)
FLD_DESC_ENT(RDC_FI_OAM_ID, "OAM ID of the device", "OAM_ID", true)
FLD_DESC_ENT(RDC_FI_DEV_ID, "ID of the device", "DEV_ID", true)
FLD_DESC_ENT(RDC_FI_REV_ID, "Revision ID of the device", "REV_ID", true)
FLD_DESC_ENT(RDC_FI_TARGET_GRAPHICS_VERSION, "GFX version of the device", "GFX", true)
FLD_DESC_ENT(RDC_FI_NUM_OF_COMPUTE_UNITS, "Number of Compute Units", "COMPUTE_UNITS", true)
FLD_DESC_ENT(RDC_FI_UUID, "Unique ID of the device AKA asic_serial", "UUID", true)
FLD_DESC_ENT(RDC_FI_GPU_CLOCK, "Current GPU clock frequencies", "GPU_CLOCK", true)
FLD_DESC_ENT(RDC_FI_MEM_CLOCK, "Current Memory clock frequencies", "MEM_CLOCK", true)
FLD_DESC_ENT(RDC_FI_MEMORY_TEMP, "Memory temperature in millidegrees Celsius", "MEMORY_TEMP", true)
@@ -185,7 +192,9 @@ FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_STAT_STALL, "", "CPF_CPF_STAT_S
FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_BUSY, "", "CPF_CPF_TCIU_BUSY", false)
FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "", "CPF_CPF_TCIU_IDLE", false)
FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_STALL, "", "CPF_CPF_TCIU_STALL", false)
// Misc
FLD_DESC_ENT(RDC_FI_PROF_SIMD_UTILIZATION, "Fraction of time the SIMDs are being utilized", "SIMD_UTILIZATION", false)
FLD_DESC_ENT(RDC_FI_PROF_UUID, "UUID from rocprofiler", "PROF_UUID", false)
// Events
FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false)
+2
Просмотреть файл
@@ -172,6 +172,7 @@ typedef enum {
RDC_FI_REV_ID, //!<
RDC_FI_TARGET_GRAPHICS_VERSION, //!< Target graphics version
RDC_FI_NUM_OF_COMPUTE_UNITS, //!< Number of compute units
RDC_FI_UUID, //!< Device UUID
/**
* @brief Frequency related fields
@@ -342,6 +343,7 @@ typedef enum {
RDC_FI_PROF_CPF_CPF_TCIU_IDLE,
RDC_FI_PROF_CPF_CPF_TCIU_STALL,
RDC_FI_PROF_SIMD_UTILIZATION,
RDC_FI_PROF_UUID,
/**
* @brief Raw XGMI counter events
+8 -1
Просмотреть файл
@@ -56,7 +56,8 @@ class RdcRocpBase {
* @retval ::ROCMTOOLS_STATUS_SUCCESS The function has been executed
* successfully.
*/
rdc_status_t rocp_lookup(rdc_gpu_field_t gpu_field, double* value);
rdc_status_t rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value_data* value,
rdc_field_type_t* type);
const char* get_field_id_from_name(rdc_field_t);
const std::vector<rdc_field_t> get_field_ids();
@@ -69,11 +70,17 @@ class RdcRocpBase {
static const uint32_t collection_duration_us_k = 10000;
double read_feature(rocprofiler_record_counter_t* record, uint32_t gpu_index);
/**
* @brief By default all profiler values are read as doubles
*/
double run_profiler(uint32_t gpu_index, rdc_field_t field);
void map_smi_to_profiler_by_uuid();
std::vector<rocprofiler_agent_v0_t> agents = {};
std::vector<std::shared_ptr<CounterSampler>> samplers = {};
std::map<rdc_field_t, const char*> field_to_metric = {};
std::map<uint32_t, uint32_t> smi_to_profiler_map = {};
// these fields must be divided by time passed
std::unordered_set<rdc_field_t> eval_fields = {
+5 -1
Просмотреть файл
@@ -851,7 +851,8 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
case RDC_FI_DEV_ID:
case RDC_FI_REV_ID:
case RDC_FI_TARGET_GRAPHICS_VERSION:
case RDC_FI_NUM_OF_COMPUTE_UNITS: {
case RDC_FI_NUM_OF_COMPUTE_UNITS:
case RDC_FI_UUID: {
amdsmi_asic_info_t asic_info;
value->status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
value->type = INTEGER;
@@ -881,6 +882,9 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
} else {
value->value.l_int = asic_info.num_of_compute_units;
}
} else if (field_id == RDC_FI_UUID) {
value->type = STRING;
memcpy(value->value.str, asic_info.asic_serial, sizeof(asic_info.asic_serial));
} else {
// this should never happen as all fields are handled above
RDC_LOG(RDC_ERROR, "Unexpected field id: " << field_id);
+3 -1
Просмотреть файл
@@ -184,7 +184,9 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI
RDC_HEALTH_PENDING_PAGE_NUM, RDC_HEALTH_RETIRED_PAGE_LIMIT, RDC_HEALTH_EEPROM_CONFIG_VALID,
RDC_HEALTH_POWER_THROTTLE_TIME, RDC_HEALTH_THERMAL_THROTTLE_TIME,
RDC_FI_GPU_MEMORY_MAX_BANDWIDTH, RDC_FI_GPU_MEMORY_CUR_BANDWIDTH,
RDC_FI_GPU_BUSY_PERCENT, RDC_FI_GPU_PAGE_RETRIED
RDC_FI_GPU_BUSY_PERCENT, RDC_FI_GPU_PAGE_RETRIED,
RDC_FI_DEV_ID, RDC_FI_REV_ID, RDC_FI_TARGET_GRAPHICS_VERSION,
RDC_FI_NUM_OF_COMPUTE_UNITS, RDC_FI_UUID,
};
// clang-format on
std::copy(fields.begin(), fields.end(), field_ids);
+9 -4
Просмотреть файл
@@ -9,8 +9,12 @@ set(RDC_ROCP_LIB_COMPONENT "lib${RDC_ROCP_LIB}")
set(RDC_ROCP_LIB_SRC_LIST "${BOOTSTRAP_LIB_SRC_DIR}/RdcLogger.cc" "${SRC_DIR}/RdcTelemetryLib.cc"
"${SRC_DIR}/RdcRocpCounterSampler.cc" "${SRC_DIR}/RdcRocpBase.cc")
set(RDC_ROCP_LIB_INC_LIST
"${PROJECT_SOURCE_DIR}/include/rdc/rdc.h" "${RDC_LIB_INC_DIR}/RdcDiagnosticLibInterface.h"
"${RDC_LIB_INC_DIR}/rdc_common.h" "${RDC_LIB_INC_DIR}/RdcLogger.h" "${INC_DIR}/RdcRocpBase.h"
"${PROJECT_SOURCE_DIR}/include/rdc/rdc.h"
"${RDC_LIB_INC_DIR}/RdcDiagnosticLibInterface.h"
"${RDC_LIB_INC_DIR}/rdc_common.h"
"${RDC_LIB_INC_DIR}/RdcLogger.h"
"${SRC_DIR}/../../rdc/src/SmiUtils.cc"
"${INC_DIR}/RdcRocpBase.h"
"${INC_DIR}/RdcRocpCounterSampler.h")
if(BUILD_PROFILER)
@@ -31,8 +35,9 @@ if(BUILD_PROFILER)
${RDC_LIB_MODULES} ${RDC_ROCP_LIB}
PARENT_SCOPE)
add_library(${RDC_ROCP_LIB} SHARED ${RDC_ROCP_LIB_SRC_LIST} ${RDC_ROCP_LIB_INC_LIST})
target_link_libraries(${RDC_ROCP_LIB} PRIVATE hsa-runtime64::hsa-runtime64
rocprofiler-sdk::rocprofiler-sdk pthread dl)
target_link_libraries(
${RDC_ROCP_LIB} PRIVATE hsa-runtime64::hsa-runtime64 rocprofiler-sdk::rocprofiler-sdk
pthread dl amd_smi)
target_include_directories(
${RDC_ROCP_LIB}
PRIVATE "${PROJECT_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}/include" "${COMMON_DIR}"
+109 -33
Просмотреть файл
@@ -35,13 +35,17 @@ THE SOFTWARE.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <iomanip>
#include <stdexcept>
#include <vector>
// #include "hsa.h"
#include "amd_smi/amdsmi.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/RdcTelemetryLibInterface.h"
#include "rdc_lib/impl/SmiUtils.h"
#include "rdc_lib/rdc_common.h"
#include "rdc_modules/rdc_rocp/RdcRocpCounterSampler.h"
namespace amd {
@@ -97,6 +101,70 @@ const std::vector<rdc_field_t> RdcRocpBase::get_field_ids() {
return field_ids;
}
// Converts the hexadecimal ASIC serial string reported by amd-smi into a
// rocprofiler UUID by parsing it as a base-16 number.
//
// The serial is parsed numerically instead of being compared as text; per the
// original author's note this works around amd-smi ignoring leading zeroes.
//
// Throws std::invalid_argument when asic_serial is null or does not begin with
// a hexadecimal number, and std::out_of_range when the number does not fit in
// unsigned long long (both conditions are reported by std::stoull).
rocprofiler_uuid_t asic_serial_to_uuid(const char* asic_serial) {
  if (asic_serial == nullptr) {
    // std::stoull would build a std::string from the pointer, which is
    // undefined behaviour for null; fail the same way unparsable input does
    throw std::invalid_argument("asic_serial_to_uuid: asic_serial is null");
  }

  rocprofiler_uuid_t uuid = {0};
  uuid.value = std::stoull(asic_serial, nullptr, 16);
  return uuid;
}
// Renders a 64-bit UUID as "0x" followed by the value in hexadecimal,
// left-padded with '0' to a width of 16 characters.
std::string uuid_to_string(const uint64_t uuid) {
  std::ostringstream formatted;
  formatted << "0x";
  // configure the stream through its member functions, then emit the number;
  // width() only applies to the next formatted insertion, i.e. the UUID itself
  formatted.setf(std::ios_base::hex, std::ios_base::basefield);
  formatted.fill('0');
  formatted.width(16);
  formatted << uuid;
  return formatted.str();
}
// Convenience overload: formats a rocprofiler UUID by forwarding its `value`
// member to the 64-bit overload.
std::string uuid_to_string(const rocprofiler_uuid_t& uuid) {
  const uint64_t raw_value = uuid.value;
  return uuid_to_string(raw_value);
}
void RdcRocpBase::map_smi_to_profiler_by_uuid() {
std::map<uint32_t, rocprofiler_uuid_t> index_to_prof_map;
std::map<uint32_t, rocprofiler_uuid_t> index_to_smi_map;
// find intersection of supported and requested fields
for (uint32_t gpu_index = 0; gpu_index < agents.size(); gpu_index++) {
index_to_prof_map.insert({gpu_index, agents[gpu_index].uuid});
amdsmi_processor_handle processor_handle = nullptr;
auto amdsmi_status = get_processor_handle_from_id(gpu_index, &processor_handle);
if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
continue;
}
amdsmi_asic_info_t asic_info;
amdsmi_status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
continue;
}
rocprofiler_uuid_t temp_id = asic_serial_to_uuid(asic_info.asic_serial);
index_to_smi_map.insert({gpu_index, temp_id});
// clang-format off
RDC_LOG(RDC_DEBUG, "\n"
"ID[" << gpu_index << "]:\n"
" PROF: " << uuid_to_string(index_to_prof_map[gpu_index]) << "\n"
" SMI: " << uuid_to_string(index_to_smi_map[gpu_index]));
// clang-format on
}
// Create a mapping from SMI to ROCProfiler by comparing uuid
for (const auto& [smi_index, smi_uuid] : index_to_smi_map) {
for (const auto& [prof_index, prof_uuid] : index_to_prof_map) {
if (std::memcmp(&smi_uuid, &prof_uuid, sizeof(rocprofiler_uuid_t)) == 0) {
// match found
smi_to_profiler_map[smi_index] = prof_index;
break;
}
}
}
for (const auto& [smi_index, prof_index] : smi_to_profiler_map) {
const auto& prof_uuid = index_to_prof_map[prof_index];
const auto& smi_uuid = index_to_smi_map[smi_index];
RDC_LOG(RDC_DEBUG, "SMI index " << smi_index << " maps to ROCProfiler index " << prof_index
<< " with UUID: " << uuid_to_string(prof_uuid) << " = "
<< uuid_to_string(smi_uuid));
}
}
RdcRocpBase::RdcRocpBase() {
// all fields
static const std::map<rdc_field_t, const char*> temp_field_map_k = {
@@ -120,8 +188,8 @@ RdcRocpBase::RdcRocpBase() {
{RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL, "ValuPipeIssueUtil"},
{RDC_FI_PROF_SM_ACTIVE, "VALUBusy"},
{RDC_FI_PROF_OCC_PER_ACTIVE_CU, "MeanOccupancyPerActiveCU"},
{RDC_FI_PROF_OCC_ELAPSED,
"GRBM_GUI_ACTIVE"}, // this metric is derived from OCC_PER_ACTIVE_CU and ACTIVE_CYCLES
{RDC_FI_PROF_OCC_ELAPSED, "GRBM_GUI_ACTIVE"}, // this metric is derived from
// OCC_PER_ACTIVE_CU and ACTIVE_CYCLES
{RDC_FI_PROF_CPC_CPC_STAT_BUSY, "CPC_CPC_STAT_BUSY"},
{RDC_FI_PROF_CPC_CPC_STAT_IDLE, "CPC_CPC_STAT_IDLE"},
{RDC_FI_PROF_CPC_CPC_STAT_STALL, "CPC_CPC_STAT_STALL"},
@@ -158,7 +226,7 @@ RdcRocpBase::RdcRocpBase() {
{RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "CPF_CPF_TCIU_IDLE"},
{RDC_FI_PROF_CPF_CPF_TCIU_STALL, "CPF_CPF_TCIU_STALL"},
{RDC_FI_PROF_SIMD_UTILIZATION, "SIMD_UTILIZATION"},
{RDC_FI_PROF_UUID, "SQ_WAVES"}, // dummy value,
};
hsa_status_t status = hsa_init();
@@ -183,16 +251,14 @@ RdcRocpBase::RdcRocpBase() {
RDC_LOG(RDC_DEBUG, "Agent count: " << agents.size());
samplers = CounterSampler::get_samplers();
// populate fields
for (const auto& [k, v] : temp_field_map_k) {
all_fields.emplace_back(v);
}
map_smi_to_profiler_by_uuid();
// find intersection of supported and requested fields
for (uint32_t gpu_index = 0; gpu_index < agents.size(); gpu_index++) {
auto& cs = *samplers[gpu_index];
RDC_LOG(RDC_DEBUG,
"gpu_index[" << gpu_index << "] = node_id[" << agents[gpu_index].node_id << "]");
RDC_LOG(RDC_DEBUG, "gpu_index[" << gpu_index << "] = node_id[" << agents[gpu_index].node_id
<< "] agent_id[" << agents[gpu_index].id.handle << "]");
for (auto& [str, id] : CounterSampler::get_supported_counters(cs.get_agent())) {
checked_fields.emplace_back(str);
}
@@ -205,6 +271,11 @@ RdcRocpBase::RdcRocpBase() {
}
}
// populate fields
for (const auto& [k, v] : temp_field_map_k) {
all_fields.emplace_back(v);
}
RDC_LOG(RDC_DEBUG, "Rocprofiler supports " << field_to_metric.size() << " fields");
}
@@ -216,26 +287,31 @@ RdcRocpBase::~RdcRocpBase() {
assert(status == HSA_STATUS_ERROR_NOT_INITIALIZED);
}
rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, double* value) {
const auto& gpu_index = gpu_field.gpu_index;
rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value_data* data,
rdc_field_type_t* type) {
// default type
*type = DOUBLE;
const auto& gpu_index = smi_to_profiler_map[gpu_field.gpu_index];
const auto& field = gpu_field.field_id;
if (value == nullptr) {
if (data == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
const bool is_eval_field = (eval_fields.find(field) != eval_fields.end());
const auto start_time = std::chrono::high_resolution_clock::now();
const double read_value = run_profiler(gpu_index, field);
// direct read from rocprofiler
const double read_dbl = run_profiler(gpu_index, field);
const auto stop_time = std::chrono::high_resolution_clock::now();
const double elapsed = std::chrono::duration<double, std::milli>(stop_time - start_time).count();
double divided_value = NAN;
double final_value = NAN;
// divide by elapsed time if needed
double divided_dbl = NAN;
if (is_eval_field) {
if (elapsed != 0.0) {
divided_value = read_value / (elapsed / 1000.0);
divided_dbl = read_dbl / (elapsed / 1000.0);
} else {
RDC_LOG(RDC_ERROR, "Error: Elapsed time is zero. Cannot divide by zero.");
return RDC_ST_BAD_PARAMETER;
@@ -247,16 +323,16 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, double* value)
// RDC_FI_PROF_GPU_UTIL_PERCENT is mapped to GPU_UTIL
// GPU_UTIL metric is available on more GPUs than ENGINE_ACTIVE.
// ENGINE_ACTIVE = GPU_UTIL/100, so do the math ourselves
final_value = read_value / 100.0F;
data->dbl = read_dbl / 100.0F;
break;
case RDC_FI_PROF_OCC_ELAPSED: {
// RDC_FI_PROF_OCC_ELAPSED is mapped to GRBM_GUI_ACTIVE, the read happens earlier in this
// function
const double active_cycles_val = read_value;
const double active_cycles_val = read_dbl;
if (active_cycles_val != 0.0) {
// read second value from rocprofiler
const double occupancy_val = run_profiler(gpu_index, RDC_FI_PROF_OCC_PER_ACTIVE_CU);
final_value = occupancy_val / active_cycles_val;
data->dbl = occupancy_val / active_cycles_val;
} else {
return RDC_ST_BAD_PARAMETER;
}
@@ -272,11 +348,9 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, double* value)
const bool isMI200 = (target_version.find("gfx90a") != std::string::npos);
// FLOPS/clock/CU
if (isMI200) {
final_value =
divided_value / (1024.0F / static_cast<double>(agents[gpu_index].simd_per_cu));
data->dbl = divided_dbl / (1024.0F / static_cast<double>(agents[gpu_index].simd_per_cu));
} else { // Assume mi300
final_value =
divided_value / (2048.0F / static_cast<double>(agents[gpu_index].simd_per_cu));
data->dbl = divided_dbl / (2048.0F / static_cast<double>(agents[gpu_index].simd_per_cu));
}
} break;
case RDC_FI_PROF_EVAL_FLOPS_32_PERCENT:
@@ -286,24 +360,26 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, double* value)
return RDC_ST_BAD_PARAMETER;
}
// FLOPS/clock/CU
final_value = divided_value / (256.0F / static_cast<double>(agents[gpu_index].simd_per_cu));
data->dbl = divided_dbl / (256.0F / static_cast<double>(agents[gpu_index].simd_per_cu));
break;
case RDC_FI_PROF_UUID: {
// do not care what RDC_FI_PROF_UUID is mapped to. read value from agents
*type = STRING;
std::string uuid_str = uuid_to_string(agents[gpu_index].uuid);
strncpy_with_null(data->str, uuid_str.c_str(), uuid_str.length());
break;
}
default:
// only support default fallback for doubles
assert(*type == DOUBLE);
if (is_eval_field) {
final_value = divided_value;
data->dbl = divided_dbl;
} else {
final_value = read_value;
data->dbl = read_dbl;
}
break;
}
if (final_value == NAN) {
RDC_LOG(RDC_ERROR, "Error: Final value is NaN.");
return RDC_ST_BAD_PARAMETER;
}
*value = final_value;
return RDC_ST_OK;
}
+19 -5
Просмотреть файл
@@ -98,7 +98,7 @@ rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, const uint3
// Bulk fetch fields
std::vector<rdc_gpu_field_value_t> bulk_results;
struct timeval tv{};
struct timeval tv {};
gettimeofday(&tv, nullptr);
const uint64_t curTime = static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
@@ -107,7 +107,8 @@ rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, const uint3
rdc_gpu_field_value_t values[BULK_FIELDS_MAX];
uint32_t bulk_count = 0;
rdc_status_t status = RDC_ST_UNKNOWN_ERROR;
double data = NAN;
rdc_field_value_data data;
rdc_field_type_t type = DOUBLE;
for (uint32_t i = 0; i < fields_count; i++) {
if (bulk_count >= BULK_FIELDS_MAX) {
@@ -119,14 +120,27 @@ rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, const uint3
bulk_count = 0;
}
status = rocp_p->rocp_lookup(fields[i], &data);
status = rocp_p->rocp_lookup(fields[i], &data, &type);
// get value
values[bulk_count].gpu_index = fields[i].gpu_index;
values[bulk_count].field_value.type = DOUBLE;
values[bulk_count].field_value.status = status;
values[bulk_count].field_value.ts = curTime;
values[bulk_count].field_value.value.dbl = data;
values[bulk_count].field_value.type = type;
values[bulk_count].field_value.field_id = fields[i].field_id;
switch (type) {
case DOUBLE:
values[bulk_count].field_value.value.dbl = data.dbl;
break;
case INTEGER:
values[bulk_count].field_value.value.l_int = data.l_int;
break;
case STRING:
case BLOB:
strncpy_with_null(values[bulk_count].field_value.value.str, data.str, RDC_MAX_STR_LENGTH);
break;
default:
break;
}
bulk_count++;
}
if (bulk_count != 0) {