Profiler - Fix eval fields
The 'value' pointer was being written to a lot and then used for reading within the same function. This likely caused issues all over RDC when reading the metrics. This commit changes it so *value is written to only once. Change-Id: I83c158c1e46c6ce46ff87d8a2e769f26ffa8c0da Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
This commit is contained in:
committad av
Galantsev, Dmitrii
förälder
24024f0e4f
incheckning
91be467cad
@@ -77,8 +77,10 @@ class RdcRocpBase {
|
||||
|
||||
// these fields must be divided by time passed
|
||||
std::unordered_set<rdc_field_t> eval_fields = {
|
||||
RDC_FI_PROF_EVAL_MEM_R_BW, RDC_FI_PROF_EVAL_MEM_W_BW, RDC_FI_PROF_EVAL_FLOPS_16,
|
||||
RDC_FI_PROF_EVAL_FLOPS_32, RDC_FI_PROF_EVAL_FLOPS_64,
|
||||
RDC_FI_PROF_EVAL_MEM_R_BW, RDC_FI_PROF_EVAL_MEM_W_BW,
|
||||
RDC_FI_PROF_EVAL_FLOPS_16, RDC_FI_PROF_EVAL_FLOPS_32,
|
||||
RDC_FI_PROF_EVAL_FLOPS_64, RDC_FI_PROF_EVAL_FLOPS_16_PERCENT,
|
||||
RDC_FI_PROF_EVAL_FLOPS_32_PERCENT, RDC_FI_PROF_EVAL_FLOPS_64_PERCENT,
|
||||
};
|
||||
|
||||
/**
|
||||
|
||||
@@ -50,28 +50,24 @@ namespace rdc {
|
||||
double RdcRocpBase::run_profiler(uint32_t gpu_index, rdc_field_t field) {
|
||||
thread_local std::vector<rocprofiler_record_counter_t> records;
|
||||
|
||||
// initialize hsa. hsa_init() will also load the profiler libs under the hood
|
||||
hsa_status_t status = HSA_STATUS_SUCCESS;
|
||||
|
||||
auto counter_sampler = CounterSampler::get_samplers()[gpu_index];
|
||||
if (!counter_sampler) {
|
||||
RDC_LOG(RDC_ERROR, "Error: Counter sampler not found for GPU index " << gpu_index << std::endl);
|
||||
throw std::runtime_error("Counter sampler not found");
|
||||
RDC_LOG(RDC_ERROR, "Error: Counter sampler not found for GPU index " << gpu_index);
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
|
||||
auto field_it = field_to_metric.find(field);
|
||||
if (field_it == field_to_metric.end()) {
|
||||
RDC_LOG(RDC_ERROR,
|
||||
"Error: Field " << field << " not found in field_to_metric map." << std::endl);
|
||||
throw std::out_of_range("Field not found in field_to_metric map");
|
||||
RDC_LOG(RDC_ERROR, "Error: Field " << field << " not found in field_to_metric map.");
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
const std::string& metric_id = field_it->second;
|
||||
|
||||
try {
|
||||
counter_sampler->sample_counter_values({metric_id}, records, collection_duration_us_k);
|
||||
} catch (const std::exception& e) {
|
||||
RDC_LOG(RDC_ERROR, "Error while sampling counter values: " << e.what() << std::endl);
|
||||
throw;
|
||||
RDC_LOG(RDC_ERROR, "Error while sampling counter values: " << e.what());
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
|
||||
// Aggregate counter values. Rocprof v1/v2 summed values across dimensions.
|
||||
@@ -86,9 +82,8 @@ double RdcRocpBase::run_profiler(uint32_t gpu_index, rdc_field_t field) {
|
||||
const char* RdcRocpBase::get_field_id_from_name(rdc_field_t field) {
|
||||
auto it = field_to_metric.find(field);
|
||||
if (it == field_to_metric.end()) {
|
||||
RDC_LOG(RDC_ERROR,
|
||||
"Error: Field ID " << field << " not found in field_to_metric map." << std::endl);
|
||||
throw std::out_of_range("Field ID not found in field_to_metric map");
|
||||
RDC_LOG(RDC_ERROR, "Error: Field ID " << field << " not found in field_to_metric map.");
|
||||
return "";
|
||||
}
|
||||
|
||||
return field_to_metric.at(field);
|
||||
@@ -111,7 +106,7 @@ RdcRocpBase::RdcRocpBase() {
|
||||
{RDC_FI_PROF_ELAPSED_CYCLES, "GRBM_COUNT"},
|
||||
{RDC_FI_PROF_TENSOR_ACTIVE_PERCENT,
|
||||
"MfmaUtil"}, // same as TENSOR_ACTIVE but available for more GPUs
|
||||
{RDC_FI_PROF_GPU_UTIL_PERCENT, "GPU_UTIL"},
|
||||
{RDC_FI_PROF_GPU_UTIL_PERCENT, "GPU_UTIL"}, // metric is divided by 100 to get percent
|
||||
// metrics below are divided by time passed
|
||||
{RDC_FI_PROF_EVAL_MEM_R_BW, "FETCH_SIZE"},
|
||||
{RDC_FI_PROF_EVAL_MEM_W_BW, "WRITE_SIZE"},
|
||||
@@ -125,7 +120,8 @@ RdcRocpBase::RdcRocpBase() {
|
||||
{RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL, "ValuPipeIssueUtil"},
|
||||
{RDC_FI_PROF_SM_ACTIVE, "VALUBusy"},
|
||||
{RDC_FI_PROF_OCC_PER_ACTIVE_CU, "MeanOccupancyPerActiveCU"},
|
||||
// RDC_FI_PROF_OCC_ELAPSED is derived from OCC_PER_ACTIVE_CU and ACTIVE_CYCLES
|
||||
{RDC_FI_PROF_OCC_ELAPSED,
|
||||
"GRBM_GUI_ACTIVE"}, // this metric is derived from OCC_PER_ACTIVE_CU and ACTIVE_CYCLES
|
||||
{RDC_FI_PROF_CPC_CPC_STAT_BUSY, "CPC_CPC_STAT_BUSY"},
|
||||
{RDC_FI_PROF_CPC_CPC_STAT_IDLE, "CPC_CPC_STAT_IDLE"},
|
||||
{RDC_FI_PROF_CPC_CPC_STAT_STALL, "CPC_CPC_STAT_STALL"},
|
||||
@@ -187,8 +183,8 @@ RdcRocpBase::RdcRocpBase() {
|
||||
samplers = CounterSampler::get_samplers();
|
||||
|
||||
// populate fields
|
||||
for (auto& [k, v] : temp_field_map_k) {
|
||||
all_fields.push_back(v);
|
||||
for (const auto& [k, v] : temp_field_map_k) {
|
||||
all_fields.emplace_back(v);
|
||||
}
|
||||
|
||||
// find intersection of supported and requested fields
|
||||
@@ -196,11 +192,11 @@ RdcRocpBase::RdcRocpBase() {
|
||||
auto& cs = *samplers[gpu_index];
|
||||
RDC_LOG(RDC_DEBUG,
|
||||
"gpu_index[" << gpu_index << "] = node_id[" << agents[gpu_index].node_id << "]");
|
||||
for (auto& [str, id] : cs.get_supported_counters(cs.get_agent())) {
|
||||
for (auto& [str, id] : CounterSampler::get_supported_counters(cs.get_agent())) {
|
||||
checked_fields.emplace_back(str);
|
||||
}
|
||||
|
||||
for (auto& [k, v] : temp_field_map_k) {
|
||||
for (const auto& [k, v] : temp_field_map_k) {
|
||||
auto found = std::find(checked_fields.begin(), checked_fields.end(), v);
|
||||
if (found != checked_fields.end()) {
|
||||
field_to_metric.insert({k, v});
|
||||
@@ -227,57 +223,86 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, double* value)
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
|
||||
const auto start_time = std::chrono::high_resolution_clock::now();
|
||||
*value = run_profiler(gpu_index, field);
|
||||
const auto stop_time = std::chrono::high_resolution_clock::now();
|
||||
// extra processing required
|
||||
if (eval_fields.find(field) != eval_fields.end()) {
|
||||
const auto elapsed =
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
|
||||
*value = *value / elapsed;
|
||||
}
|
||||
const bool is_eval_field = (eval_fields.find(field) != eval_fields.end());
|
||||
|
||||
auto simd_to_cu = [this, gpu_index](double prof_value, double matrix_fp) {
|
||||
// profiler result of RDC_OPS_*_PER_SIMDCYCLE is per SIMD, RDC needs it per CU
|
||||
//
|
||||
return prof_value / (matrix_fp / static_cast<double>(agents[gpu_index].simd_per_cu));
|
||||
};
|
||||
const auto start_time = std::chrono::high_resolution_clock::now();
|
||||
const double read_value = run_profiler(gpu_index, field);
|
||||
const auto stop_time = std::chrono::high_resolution_clock::now();
|
||||
const double elapsed = std::chrono::duration<double, std::milli>(stop_time - start_time).count();
|
||||
double divided_value = NAN;
|
||||
double final_value = NAN;
|
||||
|
||||
if (is_eval_field) {
|
||||
if (elapsed != 0.0) {
|
||||
divided_value = read_value / (elapsed / 1000.0);
|
||||
} else {
|
||||
RDC_LOG(RDC_ERROR, "Error: Elapsed time is zero. Cannot divide by zero.");
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
}
|
||||
|
||||
switch (field) {
|
||||
case RDC_FI_PROF_GPU_UTIL_PERCENT:
|
||||
// RDC_FI_PROF_GPU_UTIL_PERCENT is mapped to GPU_UTIL
|
||||
// GPU_UTIL metric is available on more GPUs than ENGINE_ACTIVE.
|
||||
// ENGINE_ACTIVE = GPU_UTIL/100, so do the math ourselves
|
||||
*value = *value / 100.0F;
|
||||
final_value = read_value / 100.0F;
|
||||
break;
|
||||
case RDC_FI_PROF_OCC_ELAPSED: {
|
||||
const double occupancy_val = run_profiler(gpu_index, RDC_FI_PROF_OCC_PER_ACTIVE_CU);
|
||||
const double active_cycles_val = run_profiler(gpu_index, RDC_FI_PROF_ACTIVE_CYCLES);
|
||||
// RDC_FI_PROF_OCC_ELAPSED is mapped to GRBM_GUI_ACTIVE, the read happens earlier in this
|
||||
// function
|
||||
const double active_cycles_val = read_value;
|
||||
if (active_cycles_val != 0.0) {
|
||||
*value = occupancy_val / active_cycles_val;
|
||||
return RDC_ST_OK;
|
||||
// read second value from rocprofiler
|
||||
const double occupancy_val = run_profiler(gpu_index, RDC_FI_PROF_OCC_PER_ACTIVE_CU);
|
||||
final_value = occupancy_val / active_cycles_val;
|
||||
} else {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
} break;
|
||||
case RDC_FI_PROF_EVAL_FLOPS_16_PERCENT: {
|
||||
if (!is_eval_field) {
|
||||
RDC_LOG(RDC_ERROR, "Field expected to be in the eval_fields list but it isn't!");
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
// 1024, 2048, and 256 are taken from "INTRODUCING AMD CDNA 3 ARCHITECTURE" white paper
|
||||
const std::string target_version = agents[gpu_index].name;
|
||||
// TODO: Design a lookup table for other GPUs
|
||||
const bool isMI200 = (target_version.find("gfx90a") != std::string::npos);
|
||||
// FLOPS/clock/CU
|
||||
if (isMI200) {
|
||||
*value = simd_to_cu(*value, 1024.0F); // FLOPS/clock/CU
|
||||
} else { // Assume mi300
|
||||
*value = simd_to_cu(*value, 2048.0F); // FLOPS/clock/CU
|
||||
final_value =
|
||||
divided_value / (1024.0F / static_cast<double>(agents[gpu_index].simd_per_cu));
|
||||
} else { // Assume mi300
|
||||
final_value =
|
||||
divided_value / (2048.0F / static_cast<double>(agents[gpu_index].simd_per_cu));
|
||||
}
|
||||
} break;
|
||||
case RDC_FI_PROF_EVAL_FLOPS_32_PERCENT:
|
||||
case RDC_FI_PROF_EVAL_FLOPS_64_PERCENT:
|
||||
*value = simd_to_cu(*value, 256.0F); // FLOPS/clock/CU
|
||||
if (!is_eval_field) {
|
||||
RDC_LOG(RDC_ERROR, "Field expected to be in the eval_fields list but it isn't!");
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
// FLOPS/clock/CU
|
||||
final_value = divided_value / (256.0F / static_cast<double>(agents[gpu_index].simd_per_cu));
|
||||
break;
|
||||
default:
|
||||
if (is_eval_field) {
|
||||
final_value = divided_value;
|
||||
} else {
|
||||
final_value = read_value;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (final_value == NAN) {
|
||||
RDC_LOG(RDC_ERROR, "Error: Final value is NaN.");
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
|
||||
*value = final_value;
|
||||
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
|
||||
@@ -64,7 +64,7 @@ rdc_status_t rdc_module_init(uint64_t /*flags*/) {
|
||||
// before the tests are. Utilize an env var instead.
|
||||
return RDC_ST_DISABLED_MODULE;
|
||||
}
|
||||
rocp_p = std::unique_ptr<amd::rdc::RdcRocpBase>(new amd::rdc::RdcRocpBase);
|
||||
rocp_p = std::make_unique<amd::rdc::RdcRocpBase>();
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
rdc_status_t rdc_module_destroy() {
|
||||
@@ -98,7 +98,7 @@ rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, const uint3
|
||||
// Bulk fetch fields
|
||||
std::vector<rdc_gpu_field_value_t> bulk_results;
|
||||
|
||||
struct timeval tv {};
|
||||
struct timeval tv{};
|
||||
gettimeofday(&tv, nullptr);
|
||||
const uint64_t curTime = static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
|
||||
|
||||
|
||||
Referens i nytt ärende
Block a user