Finalize the rocprofiler fields

Change-Id: I4ed1c4309f21bdcc7281d911663036caf5947182
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
Tento commit je obsažen v:
Galantsev, Dmitrii
2024-06-04 19:37:42 -05:00
rodič e11afbf60f
revize 07c414af5e
Změněno 5 souborů: 79 přidání a 57 odebrání
+12 -13
Zobrazit soubor
@@ -102,19 +102,18 @@ FLD_DESC_ENT(RDC_FI_XGMI_7_WRITE_KB, "XGMI7 accumulated data write size (KB)
// The profiler fields below don't map directly to rocprofiler counters
// See counter_map in rdc/include/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.h
// See metrics.xml in rocprofiler
FLD_DESC_ENT(RDC_FI_PROF_CU_UTILIZATION, "Active Cycles / total Elapsed Cycles", "CU_UTILIZATION", false)
FLD_DESC_ENT(RDC_FI_PROF_CU_OCCUPANCY, "Active Waves / maximum Active Waves per CU", "CU_OCCUPANCY", false)
FLD_DESC_ENT(RDC_FI_PROF_FLOPS_16, "Number of fp16 OPS / second", "FLOPS_16", false)
FLD_DESC_ENT(RDC_FI_PROF_FLOPS_32, "Number of fp32 OPS / second", "FLOPS_32", false)
FLD_DESC_ENT(RDC_FI_PROF_FLOPS_64, "Number of fp64 OPS / second", "FLOPS_64", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false)
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false)
FLD_DESC_ENT(RDC_FI_PROF_FETCH_SIZE, "kb fetched from video memory", "FETCH_SIZE", false)
FLD_DESC_ENT(RDC_FI_PROF_WRITE_SIZE, "kb written to video memory", "WRITE_SIZE", false)
FLD_DESC_ENT(RDC_FI_PROF_GRBM_COUNT, "", "GRBM_COUNT", false)
FLD_DESC_ENT(RDC_FI_PROF_SQ_WAVES, "", "SQ_WAVES", false)
FLD_DESC_ENT(RDC_FI_PROF_TA_BUSY_AVR, "", "TA_BUSY_avr", false)
FLD_DESC_ENT(RDC_FI_PROF_CU_OCCUPANCY, "Active Waves / maximum Active Waves per CU", "CU_OCCUPANCY", false)
FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "Mean occupancy per CU", "MEAN_OCCUPANCY_PER_CU", false)
FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "Mean occupancy per active CU", "MEAN_OCCUPANCY_PER_ACTIVE_CU", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false)
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false)
// metrics below are divided by time passed
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "kbps fetched from video memory", "MEM_R_BW", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "kbps written to video memory", "MEM_W_BW", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / second", "FLOPS_16", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / second", "FLOPS_32", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / second", "FLOPS_64", false)
// Events
FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false)
+9 -9
Zobrazit soubor
@@ -127,19 +127,19 @@ int run() {
field_ids.push_back(RDC_FI_GPU_MEMORY_USAGE);
field_ids.push_back(RDC_FI_POWER_USAGE);
field_ids.push_back(RDC_FI_PROF_CU_UTILIZATION);
// profiler metrics
field_ids.push_back(RDC_FI_PROF_CU_OCCUPANCY);
field_ids.push_back(RDC_FI_PROF_FLOPS_16);
field_ids.push_back(RDC_FI_PROF_FLOPS_32);
field_ids.push_back(RDC_FI_PROF_FLOPS_64);
field_ids.push_back(RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU);
field_ids.push_back(RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU);
field_ids.push_back(RDC_FI_PROF_ACTIVE_CYCLES);
field_ids.push_back(RDC_FI_PROF_ACTIVE_WAVES);
field_ids.push_back(RDC_FI_PROF_ELAPSED_CYCLES);
field_ids.push_back(RDC_FI_PROF_FETCH_SIZE);
field_ids.push_back(RDC_FI_PROF_WRITE_SIZE);
field_ids.push_back(RDC_FI_PROF_GRBM_COUNT);
field_ids.push_back(RDC_FI_PROF_SQ_WAVES);
field_ids.push_back(RDC_FI_PROF_TA_BUSY_AVR);
// profiler metrics divided over time
field_ids.push_back(RDC_FI_PROF_EVAL_MEM_R_BW);
field_ids.push_back(RDC_FI_PROF_EVAL_MEM_W_BW);
field_ids.push_back(RDC_FI_PROF_EVAL_FLOPS_16);
field_ids.push_back(RDC_FI_PROF_EVAL_FLOPS_32);
field_ids.push_back(RDC_FI_PROF_EVAL_FLOPS_64);
result = rdc_group_field_create(rdc_handle, field_ids.size(), field_ids.data(), field_group_name,
&field_group_id);
if (result != RDC_ST_OK) {
+14 -14
Zobrazit soubor
@@ -150,7 +150,7 @@ typedef enum {
RDC_FI_GPU_COUNT = 1, //!< GPU count in the system
RDC_FI_DEV_NAME, //!< Name of the device
/*
/**
* @brief Frequency related fields
*/
RDC_FI_GPU_CLOCK = 100, //!< The current clock for the GPU
@@ -163,7 +163,7 @@ typedef enum {
RDC_FI_GPU_TEMP, //!< Current temperature for the device
RDC_FI_POWER_USAGE = 300, //!< Power usage for the device
/*
/**
* @brief PCIe related fields
*/
RDC_FI_PCIE_TX = 400, //!< PCIe Tx utilization information
@@ -172,7 +172,7 @@ typedef enum {
// The RDC_FI_PCIE_BANDWIDTH should be used
RDC_FI_PCIE_BANDWIDTH, //!< PCIe bandwidth in GB/sec
/*
/**
* @brief GPU usage related fields
*/
RDC_FI_GPU_UTIL = 500, //!< GPU Utilization
@@ -250,21 +250,21 @@ typedef enum {
/**
* @brief ROC-profiler related fields
*/
RDC_FI_PROF_CU_UTILIZATION = 800,
RDC_FI_PROF_CU_OCCUPANCY,
RDC_FI_PROF_FLOPS_16,
RDC_FI_PROF_FLOPS_32,
RDC_FI_PROF_FLOPS_64,
RDC_FI_PROF_CU_OCCUPANCY = 800,
RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU,
RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU,
RDC_FI_PROF_ACTIVE_CYCLES,
RDC_FI_PROF_ACTIVE_WAVES,
RDC_FI_PROF_ELAPSED_CYCLES,
RDC_FI_PROF_FETCH_SIZE,
RDC_FI_PROF_WRITE_SIZE,
RDC_FI_PROF_GRBM_COUNT,
RDC_FI_PROF_SQ_WAVES,
RDC_FI_PROF_TA_BUSY_AVR,
/*
// metrics below are divided by time passed
RDC_FI_PROF_EVAL_MEM_R_BW,
RDC_FI_PROF_EVAL_MEM_W_BW,
RDC_FI_PROF_EVAL_FLOPS_16,
RDC_FI_PROF_EVAL_FLOPS_32,
RDC_FI_PROF_EVAL_FLOPS_64,
/**
* @brief Raw XGMI counter events
*/
RDC_EVNT_XGMI_0_NOP_TX = 1000, //!< NOPs sent to neighbor 0
+9 -2
Zobrazit soubor
@@ -24,8 +24,10 @@ THE SOFTWARE.
#define RDC_MODULES_RDC_ROCP_RDCROCPBASE_H_
#include <rocprofiler/rocprofiler.h>
#include <chrono>
#include <cstdint>
#include <map>
#include <unordered_set>
#include <vector>
#include "rdc/rdc.h"
@@ -69,12 +71,17 @@ class RdcRocpBase {
std::map<const char*, double> metric_to_value = {};
// one rocprofiler feature per device, keyed by GPU index
std::map<uint32_t, rocprofiler_feature_t> feature;
// rocprofiler_feature_t features[dev_count][features_count] = {};
void read_feature(rocprofiler_t* context, const unsigned feature_count);
void read_feature(rocprofiler_t* context, const unsigned feature_count, uint32_t gpu_index);
int run_profiler(uint32_t gpu_index, rdc_field_t field);
std::vector<hsa_queue_t*> queues;
hsa_agent_arr_t agent_arr = {};
std::map<rdc_field_t, const char*> field_to_metric = {};
// fields whose raw value rocp_lookup() divides by the elapsed time
std::unordered_set<rdc_field_t> eval_fields = {
RDC_FI_PROF_EVAL_MEM_R_BW, RDC_FI_PROF_EVAL_MEM_W_BW, RDC_FI_PROF_EVAL_FLOPS_16,
RDC_FI_PROF_EVAL_FLOPS_32, RDC_FI_PROF_EVAL_FLOPS_64,
};
std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds> start_time;
/**
* @brief Convert from rocmtools status into RDC status
+35 -19
Zobrazit soubor
@@ -28,6 +28,7 @@ THE SOFTWARE.
#include <algorithm>
#include <cassert>
#include <chrono>
#include <csignal>
#include <cstdint>
#include <cstdio>
@@ -71,25 +72,26 @@ static hsa_status_t get_agent_handle_cb(hsa_agent_t agent, void* agent_arr) {
return HSA_STATUS_SUCCESS;
}
void RdcRocpBase::read_feature(rocprofiler_t* context, const unsigned feature_count) {
void RdcRocpBase::read_feature(rocprofiler_t* context, const unsigned feature_count,
uint32_t gpu_index) {
hsa_status_t status = rocprofiler_read(context, 0);
assert(status == HSA_STATUS_SUCCESS);
status = rocprofiler_get_data(context, 0);
assert(status == HSA_STATUS_SUCCESS);
status = rocprofiler_get_metrics(context);
assert(status == HSA_STATUS_SUCCESS);
switch (feature[0].data.kind) {
switch (feature[gpu_index].data.kind) {
case ROCPROFILER_DATA_KIND_DOUBLE:
metric_to_value[feature[0].name] = feature[0].data.result_double;
metric_to_value[feature[gpu_index].name] = feature[gpu_index].data.result_double;
break;
case ROCPROFILER_DATA_KIND_INT32:
metric_to_value[feature[0].name] = feature[0].data.result_int32;
metric_to_value[feature[gpu_index].name] = feature[gpu_index].data.result_int32;
break;
case ROCPROFILER_DATA_KIND_INT64:
metric_to_value[feature[0].name] = feature[0].data.result_int64;
metric_to_value[feature[gpu_index].name] = feature[gpu_index].data.result_int64;
break;
default:
RDC_LOG(RDC_ERROR, "ERROR: Unexpected feature kind: " << feature[0].data.kind);
RDC_LOG(RDC_ERROR, "ERROR: Unexpected feature kind: " << feature[gpu_index].data.kind);
}
}
@@ -168,7 +170,7 @@ int RdcRocpBase::run_profiler(uint32_t gpu_index, rdc_field_t field) {
status = rocprofiler_stop(contexts[gpu_index], 0);
assert(status == HSA_STATUS_SUCCESS);
read_feature(contexts[gpu_index], 1);
read_feature(contexts[gpu_index], 1, gpu_index);
usleep(100);
@@ -211,9 +213,14 @@ void check_metrics_supported(uint32_t node_id, std::vector<std::string>& metrics
payload_t payload = {&metrics_all, &metrics_good, node_id};
hsa_status_t status =
rocprofiler_iterate_info(NULL, ROCPROFILER_INFO_KIND_METRIC, info_callback, &payload);
for (auto& iter : *(payload.metrics_good_)) {
RDC_LOG(RDC_DEBUG, iter << " : exists");
if (status != HSA_STATUS_SUCCESS) {
const char* errstr = nullptr;
hsa_status_string(status, &errstr);
RDC_LOG(RDC_ERROR, "hsa error: " << std::to_string(status) << " " << errstr);
} else {
for (auto& iter : *(payload.metrics_good_)) {
RDC_LOG(RDC_DEBUG, iter << " : exists");
}
}
}
@@ -226,6 +233,7 @@ const std::vector<rdc_field_t> RdcRocpBase::get_field_ids() {
}
RdcRocpBase::RdcRocpBase() {
start_time = std::chrono::high_resolution_clock::now();
hsa_status_t status = hsa_init();
if (status != HSA_STATUS_SUCCESS) {
const char* errstr = nullptr;
@@ -235,19 +243,18 @@ RdcRocpBase::RdcRocpBase() {
// all fields
static const std::map<rdc_field_t, const char*> temp_field_map_k = {
{RDC_FI_PROF_CU_UTILIZATION, "CU_UTILIZATION"},
{RDC_FI_PROF_CU_OCCUPANCY, "CU_OCCUPANCY"},
{RDC_FI_PROF_FLOPS_16, "FLOPS_16"},
{RDC_FI_PROF_FLOPS_32, "FLOPS_32"},
{RDC_FI_PROF_FLOPS_64, "FLOPS_64"},
{RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "MEAN_OCCUPANCY_PER_CU"},
{RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "MEAN_OCCUPANCY_PER_ACTIVE_CU"},
{RDC_FI_PROF_ACTIVE_CYCLES, "ACTIVE_CYCLES"},
{RDC_FI_PROF_ACTIVE_WAVES, "ACTIVE_WAVES"},
{RDC_FI_PROF_ELAPSED_CYCLES, "ELAPSED_CYCLES"},
{RDC_FI_PROF_FETCH_SIZE, "FETCH_SIZE"},
{RDC_FI_PROF_WRITE_SIZE, "WRITE_SIZE"},
{RDC_FI_PROF_GRBM_COUNT, "GRBM_COUNT"},
{RDC_FI_PROF_SQ_WAVES, "SQ_WAVES"},
{RDC_FI_PROF_TA_BUSY_AVR, "TA_BUSY_avr"},
// metrics below are divided by time passed
{RDC_FI_PROF_EVAL_MEM_R_BW, "FETCH_SIZE"},
{RDC_FI_PROF_EVAL_MEM_W_BW, "WRITE_SIZE"},
{RDC_FI_PROF_EVAL_FLOPS_16, "TOTAL_16_OPS"},
{RDC_FI_PROF_EVAL_FLOPS_32, "TOTAL_32_OPS"},
{RDC_FI_PROF_EVAL_FLOPS_64, "TOTAL_64_OPS"},
};
std::vector<std::string> unchecked_fields;
@@ -333,8 +340,17 @@ rdc_status_t RdcRocpBase::rocp_lookup(uint32_t gpu_index, rdc_field_t field, dou
}
switch (field) {
default:
const auto stop_time = std::chrono::high_resolution_clock::now();
run_profiler(gpu_index, field);
*value = metric_to_value[field_to_metric[field]];
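// eval fields are rates: divide the raw value by the milliseconds elapsed since start_time
// NOTE(review): elapsed is an integer millisecond count and can be 0 right after construction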
if (eval_fields.find(field) != eval_fields.end()) {
const auto elapsed =
std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
RDC_LOG(RDC_DEBUG, "INDEX: " << gpu_index << " before[" << *value << "] after["
<< (*value / elapsed) << "]");
*value = *value / elapsed;
}
break;
}
return Rocp2RdcError(status);