Finalize the rocprofiler fields
Change-Id: I4ed1c4309f21bdcc7281d911663036caf5947182
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
Tento commit je obsažen v:
+12
-13
@@ -102,19 +102,18 @@ FLD_DESC_ENT(RDC_FI_XGMI_7_WRITE_KB, "XGMI7 accumulated data write size (KB)
|
||||
// This doesn't map to rocprofiler counters directly
|
||||
// See counter_map in rdc/include/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.h
|
||||
// See metrics.xml in rocprofiler
|
||||
FLD_DESC_ENT(RDC_FI_PROF_CU_UTILIZATION, "Active Cycles / total Elapsed Cycles", "CU_UTILIZATION", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_CU_OCCUPANCY, "Active Waves / maximum Active Waves per CU", "CU_OCCUPANCY", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_FLOPS_16, "Number of fp16 OPS / second", "FLOPS_16", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_FLOPS_32, "Number of fp32 OPS / second", "FLOPS_32", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_FLOPS_64, "Number of fp64 OPS / second", "FLOPS_64", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_FETCH_SIZE, "kb fetched from video memory", "FETCH_SIZE", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_WRITE_SIZE, "kb written to video memory", "WRITE_SIZE", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_GRBM_COUNT, "", "GRBM_COUNT", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_SQ_WAVES, "", "SQ_WAVES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_TA_BUSY_AVR, "", "TA_BUSY_avr", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_CU_OCCUPANCY, "Active Waves / maximum Active Waves per CU", "CU_OCCUPANCY", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "Mean occupancy per CU", "MEAN_OCCUPANCY_PER_CU", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "Mean occupancy per active CU", "MEAN_OCCUPANCY_PER_ACTIVE_CU", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false)
|
||||
// metrics below are divided by time passed
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "kbps fetched from video memory", "MEM_R_BW", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "kbps written to video memory", "MEM_W_BW", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / second", "FLOPS_16", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / second", "FLOPS_32", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / second", "FLOPS_64", false)
|
||||
|
||||
// Events
|
||||
FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false)
|
||||
|
||||
@@ -127,19 +127,19 @@ int run() {
|
||||
|
||||
field_ids.push_back(RDC_FI_GPU_MEMORY_USAGE);
|
||||
field_ids.push_back(RDC_FI_POWER_USAGE);
|
||||
field_ids.push_back(RDC_FI_PROF_CU_UTILIZATION);
|
||||
// profiler metrics
|
||||
field_ids.push_back(RDC_FI_PROF_CU_OCCUPANCY);
|
||||
field_ids.push_back(RDC_FI_PROF_FLOPS_16);
|
||||
field_ids.push_back(RDC_FI_PROF_FLOPS_32);
|
||||
field_ids.push_back(RDC_FI_PROF_FLOPS_64);
|
||||
field_ids.push_back(RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU);
|
||||
field_ids.push_back(RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU);
|
||||
field_ids.push_back(RDC_FI_PROF_ACTIVE_CYCLES);
|
||||
field_ids.push_back(RDC_FI_PROF_ACTIVE_WAVES);
|
||||
field_ids.push_back(RDC_FI_PROF_ELAPSED_CYCLES);
|
||||
field_ids.push_back(RDC_FI_PROF_FETCH_SIZE);
|
||||
field_ids.push_back(RDC_FI_PROF_WRITE_SIZE);
|
||||
field_ids.push_back(RDC_FI_PROF_GRBM_COUNT);
|
||||
field_ids.push_back(RDC_FI_PROF_SQ_WAVES);
|
||||
field_ids.push_back(RDC_FI_PROF_TA_BUSY_AVR);
|
||||
// profiler metrics divided over time
|
||||
field_ids.push_back(RDC_FI_PROF_EVAL_MEM_R_BW);
|
||||
field_ids.push_back(RDC_FI_PROF_EVAL_MEM_W_BW);
|
||||
field_ids.push_back(RDC_FI_PROF_EVAL_FLOPS_16);
|
||||
field_ids.push_back(RDC_FI_PROF_EVAL_FLOPS_32);
|
||||
field_ids.push_back(RDC_FI_PROF_EVAL_FLOPS_64);
|
||||
result = rdc_group_field_create(rdc_handle, field_ids.size(), field_ids.data(), field_group_name,
|
||||
&field_group_id);
|
||||
if (result != RDC_ST_OK) {
|
||||
|
||||
+14
-14
@@ -150,7 +150,7 @@ typedef enum {
|
||||
RDC_FI_GPU_COUNT = 1, //!< GPU count in the system
|
||||
RDC_FI_DEV_NAME, //!< Name of the device
|
||||
|
||||
/*
|
||||
/**
|
||||
* @brief Frequency related fields
|
||||
*/
|
||||
RDC_FI_GPU_CLOCK = 100, //!< The current clock for the GPU
|
||||
@@ -163,7 +163,7 @@ typedef enum {
|
||||
RDC_FI_GPU_TEMP, //!< Current temperature for the device
|
||||
RDC_FI_POWER_USAGE = 300, //!< Power usage for the device
|
||||
|
||||
/*
|
||||
/**
|
||||
* @brief PCIe related fields
|
||||
*/
|
||||
RDC_FI_PCIE_TX = 400, //!< PCIe Tx utilization information
|
||||
@@ -172,7 +172,7 @@ typedef enum {
|
||||
// The RDC_FI_PCIE_BANDWIDTH should be used
|
||||
RDC_FI_PCIE_BANDWIDTH, //!< PCIe bandwidth in GB/sec
|
||||
|
||||
/*
|
||||
/**
|
||||
* @brief GPU usage related fields
|
||||
*/
|
||||
RDC_FI_GPU_UTIL = 500, //!< GPU Utilization
|
||||
@@ -250,21 +250,21 @@ typedef enum {
|
||||
/**
|
||||
* @brief ROC-profiler related fields
|
||||
*/
|
||||
RDC_FI_PROF_CU_UTILIZATION = 800,
|
||||
RDC_FI_PROF_CU_OCCUPANCY,
|
||||
RDC_FI_PROF_FLOPS_16,
|
||||
RDC_FI_PROF_FLOPS_32,
|
||||
RDC_FI_PROF_FLOPS_64,
|
||||
RDC_FI_PROF_CU_OCCUPANCY = 800,
|
||||
RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU,
|
||||
RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU,
|
||||
RDC_FI_PROF_ACTIVE_CYCLES,
|
||||
RDC_FI_PROF_ACTIVE_WAVES,
|
||||
RDC_FI_PROF_ELAPSED_CYCLES,
|
||||
RDC_FI_PROF_FETCH_SIZE,
|
||||
RDC_FI_PROF_WRITE_SIZE,
|
||||
RDC_FI_PROF_GRBM_COUNT,
|
||||
RDC_FI_PROF_SQ_WAVES,
|
||||
RDC_FI_PROF_TA_BUSY_AVR,
|
||||
|
||||
/*
|
||||
// metrics below are divided by time passed
|
||||
RDC_FI_PROF_EVAL_MEM_R_BW,
|
||||
RDC_FI_PROF_EVAL_MEM_W_BW,
|
||||
RDC_FI_PROF_EVAL_FLOPS_16,
|
||||
RDC_FI_PROF_EVAL_FLOPS_32,
|
||||
RDC_FI_PROF_EVAL_FLOPS_64,
|
||||
|
||||
/**
|
||||
* @brief Raw XGMI counter events
|
||||
*/
|
||||
RDC_EVNT_XGMI_0_NOP_TX = 1000, //!< NOPs sent to neighbor 0
|
||||
|
||||
@@ -24,8 +24,10 @@ THE SOFTWARE.
|
||||
#define RDC_MODULES_RDC_ROCP_RDCROCPBASE_H_
|
||||
#include <rocprofiler/rocprofiler.h>
|
||||
|
||||
#include <chrono>
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
#include "rdc/rdc.h"
|
||||
@@ -69,12 +71,17 @@ class RdcRocpBase {
|
||||
std::map<const char*, double> metric_to_value = {};
|
||||
// array of features for each device
|
||||
std::map<uint32_t, rocprofiler_feature_t> feature;
|
||||
// rocprofiler_feature_t features[dev_count][features_count] = {};
|
||||
void read_feature(rocprofiler_t* context, const unsigned feature_count);
|
||||
void read_feature(rocprofiler_t* context, const unsigned feature_count, uint32_t gpu_index);
|
||||
int run_profiler(uint32_t gpu_index, rdc_field_t field);
|
||||
std::vector<hsa_queue_t*> queues;
|
||||
hsa_agent_arr_t agent_arr = {};
|
||||
std::map<rdc_field_t, const char*> field_to_metric = {};
|
||||
// these fields must be divided by time passed
|
||||
std::unordered_set<rdc_field_t> eval_fields = {
|
||||
RDC_FI_PROF_EVAL_MEM_R_BW, RDC_FI_PROF_EVAL_MEM_W_BW, RDC_FI_PROF_EVAL_FLOPS_16,
|
||||
RDC_FI_PROF_EVAL_FLOPS_32, RDC_FI_PROF_EVAL_FLOPS_64,
|
||||
};
|
||||
std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds> start_time;
|
||||
|
||||
/**
|
||||
* @brief Convert from rocmtools status into RDC status
|
||||
|
||||
@@ -28,6 +28,7 @@ THE SOFTWARE.
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <chrono>
|
||||
#include <csignal>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
@@ -71,25 +72,26 @@ static hsa_status_t get_agent_handle_cb(hsa_agent_t agent, void* agent_arr) {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void RdcRocpBase::read_feature(rocprofiler_t* context, const unsigned feature_count) {
|
||||
void RdcRocpBase::read_feature(rocprofiler_t* context, const unsigned feature_count,
|
||||
uint32_t gpu_index) {
|
||||
hsa_status_t status = rocprofiler_read(context, 0);
|
||||
assert(status == HSA_STATUS_SUCCESS);
|
||||
status = rocprofiler_get_data(context, 0);
|
||||
assert(status == HSA_STATUS_SUCCESS);
|
||||
status = rocprofiler_get_metrics(context);
|
||||
assert(status == HSA_STATUS_SUCCESS);
|
||||
switch (feature[0].data.kind) {
|
||||
switch (feature[gpu_index].data.kind) {
|
||||
case ROCPROFILER_DATA_KIND_DOUBLE:
|
||||
metric_to_value[feature[0].name] = feature[0].data.result_double;
|
||||
metric_to_value[feature[gpu_index].name] = feature[gpu_index].data.result_double;
|
||||
break;
|
||||
case ROCPROFILER_DATA_KIND_INT32:
|
||||
metric_to_value[feature[0].name] = feature[0].data.result_int32;
|
||||
metric_to_value[feature[gpu_index].name] = feature[gpu_index].data.result_int32;
|
||||
break;
|
||||
case ROCPROFILER_DATA_KIND_INT64:
|
||||
metric_to_value[feature[0].name] = feature[0].data.result_int64;
|
||||
metric_to_value[feature[gpu_index].name] = feature[gpu_index].data.result_int64;
|
||||
break;
|
||||
default:
|
||||
RDC_LOG(RDC_ERROR, "ERROR: Unexpected feature kind: " << feature[0].data.kind);
|
||||
RDC_LOG(RDC_ERROR, "ERROR: Unexpected feature kind: " << feature[gpu_index].data.kind);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -168,7 +170,7 @@ int RdcRocpBase::run_profiler(uint32_t gpu_index, rdc_field_t field) {
|
||||
status = rocprofiler_stop(contexts[gpu_index], 0);
|
||||
assert(status == HSA_STATUS_SUCCESS);
|
||||
|
||||
read_feature(contexts[gpu_index], 1);
|
||||
read_feature(contexts[gpu_index], 1, gpu_index);
|
||||
|
||||
usleep(100);
|
||||
|
||||
@@ -211,9 +213,14 @@ void check_metrics_supported(uint32_t node_id, std::vector<std::string>& metrics
|
||||
payload_t payload = {&metrics_all, &metrics_good, node_id};
|
||||
hsa_status_t status =
|
||||
rocprofiler_iterate_info(NULL, ROCPROFILER_INFO_KIND_METRIC, info_callback, &payload);
|
||||
|
||||
for (auto& iter : *(payload.metrics_good_)) {
|
||||
RDC_LOG(RDC_DEBUG, iter << " : exists");
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
const char* errstr = nullptr;
|
||||
hsa_status_string(status, &errstr);
|
||||
RDC_LOG(RDC_ERROR, "hsa error: " << std::to_string(status) << " " << errstr);
|
||||
} else {
|
||||
for (auto& iter : *(payload.metrics_good_)) {
|
||||
RDC_LOG(RDC_DEBUG, iter << " : exists");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -226,6 +233,7 @@ const std::vector<rdc_field_t> RdcRocpBase::get_field_ids() {
|
||||
}
|
||||
|
||||
RdcRocpBase::RdcRocpBase() {
|
||||
start_time = std::chrono::high_resolution_clock::now();
|
||||
hsa_status_t status = hsa_init();
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
const char* errstr = nullptr;
|
||||
@@ -235,19 +243,18 @@ RdcRocpBase::RdcRocpBase() {
|
||||
|
||||
// all fields
|
||||
static const std::map<rdc_field_t, const char*> temp_field_map_k = {
|
||||
{RDC_FI_PROF_CU_UTILIZATION, "CU_UTILIZATION"},
|
||||
{RDC_FI_PROF_CU_OCCUPANCY, "CU_OCCUPANCY"},
|
||||
{RDC_FI_PROF_FLOPS_16, "FLOPS_16"},
|
||||
{RDC_FI_PROF_FLOPS_32, "FLOPS_32"},
|
||||
{RDC_FI_PROF_FLOPS_64, "FLOPS_64"},
|
||||
{RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "MEAN_OCCUPANCY_PER_CU"},
|
||||
{RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "MEAN_OCCUPANCY_PER_ACTIVE_CU"},
|
||||
{RDC_FI_PROF_ACTIVE_CYCLES, "ACTIVE_CYCLES"},
|
||||
{RDC_FI_PROF_ACTIVE_WAVES, "ACTIVE_WAVES"},
|
||||
{RDC_FI_PROF_ELAPSED_CYCLES, "ELAPSED_CYCLES"},
|
||||
{RDC_FI_PROF_FETCH_SIZE, "FETCH_SIZE"},
|
||||
{RDC_FI_PROF_WRITE_SIZE, "WRITE_SIZE"},
|
||||
{RDC_FI_PROF_GRBM_COUNT, "GRBM_COUNT"},
|
||||
{RDC_FI_PROF_SQ_WAVES, "SQ_WAVES"},
|
||||
{RDC_FI_PROF_TA_BUSY_AVR, "TA_BUSY_avr"},
|
||||
// metrics below are divided by time passed
|
||||
{RDC_FI_PROF_EVAL_MEM_R_BW, "FETCH_SIZE"},
|
||||
{RDC_FI_PROF_EVAL_MEM_W_BW, "WRITE_SIZE"},
|
||||
{RDC_FI_PROF_EVAL_FLOPS_16, "TOTAL_16_OPS"},
|
||||
{RDC_FI_PROF_EVAL_FLOPS_32, "TOTAL_32_OPS"},
|
||||
{RDC_FI_PROF_EVAL_FLOPS_64, "TOTAL_64_OPS"},
|
||||
};
|
||||
|
||||
std::vector<std::string> unchecked_fields;
|
||||
@@ -333,8 +340,17 @@ rdc_status_t RdcRocpBase::rocp_lookup(uint32_t gpu_index, rdc_field_t field, dou
|
||||
}
|
||||
switch (field) {
|
||||
default:
|
||||
const auto stop_time = std::chrono::high_resolution_clock::now();
|
||||
run_profiler(gpu_index, field);
|
||||
*value = metric_to_value[field_to_metric[field]];
|
||||
// extra processing required
|
||||
if (eval_fields.find(field) != eval_fields.end()) {
|
||||
const auto elapsed =
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
|
||||
RDC_LOG(RDC_DEBUG, "INDEX: " << gpu_index << " before[" << *value << "] after["
|
||||
<< (*value / elapsed) << "]");
|
||||
*value = *value / elapsed;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return Rocp2RdcError(status);
|
||||
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele