From 07c414af5e87f2db4d25caecf8a01b5c18981004 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Tue, 4 Jun 2024 19:37:42 -0500 Subject: [PATCH] Finalize the rocprofiler fields Change-Id: I4ed1c4309f21bdcc7281d911663036caf5947182 Signed-off-by: Galantsev, Dmitrii --- common/rdc_field.data | 25 +++++---- example/rocprofiler_example.cc | 18 +++---- include/rdc/rdc.h | 28 +++++----- include/rdc_modules/rdc_rocp/RdcRocpBase.h | 11 +++- rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc | 54 +++++++++++++------- 5 files changed, 79 insertions(+), 57 deletions(-) diff --git a/common/rdc_field.data b/common/rdc_field.data index 01d7fb71fc..e10d57b266 100644 --- a/common/rdc_field.data +++ b/common/rdc_field.data @@ -102,19 +102,18 @@ FLD_DESC_ENT(RDC_FI_XGMI_7_WRITE_KB, "XGMI7 accumulated data write size (KB) // This doesn't map to rocprofiler counters directly // See counter_map in rdc/include/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.h // See metrics.xml in rocprofiler -FLD_DESC_ENT(RDC_FI_PROF_CU_UTILIZATION, "Active Cycles / total Elapsed Cycles", "CU_UTILIZATION", false) -FLD_DESC_ENT(RDC_FI_PROF_CU_OCCUPANCY, "Active Waves / maximum Active Waves per CU", "CU_OCCUPANCY", false) -FLD_DESC_ENT(RDC_FI_PROF_FLOPS_16, "Number of fp16 OPS / second", "FLOPS_16", false) -FLD_DESC_ENT(RDC_FI_PROF_FLOPS_32, "Number of fp32 OPS / second", "FLOPS_32", false) -FLD_DESC_ENT(RDC_FI_PROF_FLOPS_64, "Number of fp64 OPS / second", "FLOPS_64", false) -FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false) -FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false) -FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false) -FLD_DESC_ENT(RDC_FI_PROF_FETCH_SIZE, "kb fetched from video memory", "FETCH_SIZE", false) -FLD_DESC_ENT(RDC_FI_PROF_WRITE_SIZE, "kb written to video memory", "WRITE_SIZE", false) -FLD_DESC_ENT(RDC_FI_PROF_GRBM_COUNT, "", "GRBM_COUNT", false) -FLD_DESC_ENT(RDC_FI_PROF_SQ_WAVES, "", "SQ_WAVES", false) -FLD_DESC_ENT(RDC_FI_PROF_TA_BUSY_AVR, "", "TA_BUSY_avr", false) +FLD_DESC_ENT(RDC_FI_PROF_CU_OCCUPANCY, "Active Waves / maximum Active Waves per CU", "CU_OCCUPANCY", false) +FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "Mean occupancy per CU", "MEAN_OCCUPANCY_PER_CU", false) +FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "Mean occupancy per active CU", "MEAN_OCCUPANCY_PER_ACTIVE_CU", false) +FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false) +FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false) +FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false) +// metrics below are divided by time passed +FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "kbps fetched from video memory", "MEM_R_BW", false) +FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "kbps written to video memory", "MEM_W_BW", false) +FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / second", "FLOPS_16", false) +FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / second", "FLOPS_32", false) +FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / second", "FLOPS_64", false) // Events FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false) diff --git a/example/rocprofiler_example.cc b/example/rocprofiler_example.cc index 837a9dfa3d..6219099ed0 100644 --- a/example/rocprofiler_example.cc +++ b/example/rocprofiler_example.cc @@ -127,19 +127,19 @@ int run() { field_ids.push_back(RDC_FI_GPU_MEMORY_USAGE); field_ids.push_back(RDC_FI_POWER_USAGE); - field_ids.push_back(RDC_FI_PROF_CU_UTILIZATION); + // profiler metrics field_ids.push_back(RDC_FI_PROF_CU_OCCUPANCY); - field_ids.push_back(RDC_FI_PROF_FLOPS_16); - field_ids.push_back(RDC_FI_PROF_FLOPS_32); - field_ids.push_back(RDC_FI_PROF_FLOPS_64); + field_ids.push_back(RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU); + field_ids.push_back(RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU); field_ids.push_back(RDC_FI_PROF_ACTIVE_CYCLES); field_ids.push_back(RDC_FI_PROF_ACTIVE_WAVES); field_ids.push_back(RDC_FI_PROF_ELAPSED_CYCLES); - field_ids.push_back(RDC_FI_PROF_FETCH_SIZE); - field_ids.push_back(RDC_FI_PROF_WRITE_SIZE); - field_ids.push_back(RDC_FI_PROF_GRBM_COUNT); - field_ids.push_back(RDC_FI_PROF_SQ_WAVES); - field_ids.push_back(RDC_FI_PROF_TA_BUSY_AVR); + // profiler metrics divided over time + field_ids.push_back(RDC_FI_PROF_EVAL_MEM_R_BW); + field_ids.push_back(RDC_FI_PROF_EVAL_MEM_W_BW); + field_ids.push_back(RDC_FI_PROF_EVAL_FLOPS_16); + field_ids.push_back(RDC_FI_PROF_EVAL_FLOPS_32); + field_ids.push_back(RDC_FI_PROF_EVAL_FLOPS_64); result = rdc_group_field_create(rdc_handle, field_ids.size(), field_ids.data(), field_group_name, &field_group_id); if (result != RDC_ST_OK) { diff --git a/include/rdc/rdc.h b/include/rdc/rdc.h index d31708576e..99c7c16d74 100644 --- a/include/rdc/rdc.h +++ b/include/rdc/rdc.h @@ -150,7 +150,7 @@ typedef enum { RDC_FI_GPU_COUNT = 1, //!< GPU count in the system RDC_FI_DEV_NAME, //!< Name of the device - /* + /** * @brief Frequency related fields */ RDC_FI_GPU_CLOCK = 100, //!< The current clock for the GPU @@ -163,7 +163,7 @@ typedef enum { RDC_FI_GPU_TEMP, //!< Current temperature for the device RDC_FI_POWER_USAGE = 300, //!< Power usage for the device - /* + /** * @brief PCIe related fields */ RDC_FI_PCIE_TX = 400, //!< PCIe Tx utilization information @@ -172,7 +172,7 @@ typedef enum { // The RDC_FI_PCIE_BANDWIDTH should be used RDC_FI_PCIE_BANDWIDTH, //!< PCIe bandwidth in GB/sec - /* + /** * @brief GPU usage related fields */ RDC_FI_GPU_UTIL = 500, //!< GPU Utilization @@ -250,21 +250,21 @@ typedef enum { /** * @brief ROC-profiler related fields */ - RDC_FI_PROF_CU_UTILIZATION = 800, - RDC_FI_PROF_CU_OCCUPANCY, - RDC_FI_PROF_FLOPS_16, - RDC_FI_PROF_FLOPS_32, - RDC_FI_PROF_FLOPS_64, + RDC_FI_PROF_CU_OCCUPANCY = 800, + RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, + RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, RDC_FI_PROF_ACTIVE_CYCLES, RDC_FI_PROF_ACTIVE_WAVES, RDC_FI_PROF_ELAPSED_CYCLES, - RDC_FI_PROF_FETCH_SIZE, - RDC_FI_PROF_WRITE_SIZE, - RDC_FI_PROF_GRBM_COUNT, - RDC_FI_PROF_SQ_WAVES, - RDC_FI_PROF_TA_BUSY_AVR, - /* + // metrics below are divided by time passed + RDC_FI_PROF_EVAL_MEM_R_BW, + RDC_FI_PROF_EVAL_MEM_W_BW, + RDC_FI_PROF_EVAL_FLOPS_16, + RDC_FI_PROF_EVAL_FLOPS_32, + RDC_FI_PROF_EVAL_FLOPS_64, + + /** * @brief Raw XGMI counter events */ RDC_EVNT_XGMI_0_NOP_TX = 1000, //!< NOPs sent to neighbor 0 diff --git a/include/rdc_modules/rdc_rocp/RdcRocpBase.h b/include/rdc_modules/rdc_rocp/RdcRocpBase.h index 8c87031d73..25d1908d91 100644 --- a/include/rdc_modules/rdc_rocp/RdcRocpBase.h +++ b/include/rdc_modules/rdc_rocp/RdcRocpBase.h @@ -24,8 +24,10 @@ THE SOFTWARE. #define RDC_MODULES_RDC_ROCP_RDCROCPBASE_H_ #include +#include #include #include +#include #include #include "rdc/rdc.h" @@ -69,12 +71,17 @@ class RdcRocpBase { std::map metric_to_value = {}; // array of features for each device std::map feature; - // rocprofiler_feature_t features[dev_count][features_count] = {}; - void read_feature(rocprofiler_t* context, const unsigned feature_count); + void read_feature(rocprofiler_t* context, const unsigned feature_count, uint32_t gpu_index); int run_profiler(uint32_t gpu_index, rdc_field_t field); std::vector queues; hsa_agent_arr_t agent_arr = {}; std::map field_to_metric = {}; + // these fields must be divided by time passed + std::unordered_set eval_fields = { + RDC_FI_PROF_EVAL_MEM_R_BW, RDC_FI_PROF_EVAL_MEM_W_BW, RDC_FI_PROF_EVAL_FLOPS_16, + RDC_FI_PROF_EVAL_FLOPS_32, RDC_FI_PROF_EVAL_FLOPS_64, + }; + std::chrono::time_point start_time; /** * @brief Convert from rocmtools status into RDC status diff --git a/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc b/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc index 2a76a199e3..a304640854 100644 --- a/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc +++ b/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc @@ -28,6 +28,7 @@ THE SOFTWARE. #include #include +#include #include #include #include @@ -71,25 +72,26 @@ static hsa_status_t get_agent_handle_cb(hsa_agent_t agent, void* agent_arr) { return HSA_STATUS_SUCCESS; } -void RdcRocpBase::read_feature(rocprofiler_t* context, const unsigned feature_count) { +void RdcRocpBase::read_feature(rocprofiler_t* context, const unsigned feature_count, + uint32_t gpu_index) { hsa_status_t status = rocprofiler_read(context, 0); assert(status == HSA_STATUS_SUCCESS); status = rocprofiler_get_data(context, 0); assert(status == HSA_STATUS_SUCCESS); status = rocprofiler_get_metrics(context); assert(status == HSA_STATUS_SUCCESS); - switch (feature[0].data.kind) { + switch (feature[gpu_index].data.kind) { case ROCPROFILER_DATA_KIND_DOUBLE: - metric_to_value[feature[0].name] = feature[0].data.result_double; + metric_to_value[feature[gpu_index].name] = feature[gpu_index].data.result_double; break; case ROCPROFILER_DATA_KIND_INT32: - metric_to_value[feature[0].name] = feature[0].data.result_int32; + metric_to_value[feature[gpu_index].name] = feature[gpu_index].data.result_int32; break; case ROCPROFILER_DATA_KIND_INT64: - metric_to_value[feature[0].name] = feature[0].data.result_int64; + metric_to_value[feature[gpu_index].name] = feature[gpu_index].data.result_int64; break; default: - RDC_LOG(RDC_ERROR, "ERROR: Unexpected feature kind: " << feature[0].data.kind); + RDC_LOG(RDC_ERROR, "ERROR: Unexpected feature kind: " << feature[gpu_index].data.kind); } } @@ -168,7 +170,7 @@ int RdcRocpBase::run_profiler(uint32_t gpu_index, rdc_field_t field) { status = rocprofiler_stop(contexts[gpu_index], 0); assert(status == HSA_STATUS_SUCCESS); - read_feature(contexts[gpu_index], 1); + read_feature(contexts[gpu_index], 1, gpu_index); usleep(100); @@ -211,9 +213,14 @@ void check_metrics_supported(uint32_t node_id, std::vector& metrics payload_t payload = {&metrics_all, &metrics_good, node_id}; hsa_status_t status = rocprofiler_iterate_info(NULL, ROCPROFILER_INFO_KIND_METRIC, info_callback, &payload); - - for (auto& iter : *(payload.metrics_good_)) { - RDC_LOG(RDC_DEBUG, iter << " : exists"); + if (status != HSA_STATUS_SUCCESS) { + const char* errstr = nullptr; + hsa_status_string(status, &errstr); + RDC_LOG(RDC_ERROR, "hsa error: " << std::to_string(status) << " " << errstr); + } else { + for (auto& iter : *(payload.metrics_good_)) { + RDC_LOG(RDC_DEBUG, iter << " : exists"); + } } } @@ -226,6 +233,7 @@ const std::vector RdcRocpBase::get_field_ids() { } RdcRocpBase::RdcRocpBase() { + start_time = std::chrono::high_resolution_clock::now(); hsa_status_t status = hsa_init(); if (status != HSA_STATUS_SUCCESS) { const char* errstr = nullptr; @@ -235,19 +243,18 @@ RdcRocpBase::RdcRocpBase() { // all fields static const std::map temp_field_map_k = { - {RDC_FI_PROF_CU_UTILIZATION, "CU_UTILIZATION"}, {RDC_FI_PROF_CU_OCCUPANCY, "CU_OCCUPANCY"}, - {RDC_FI_PROF_FLOPS_16, "FLOPS_16"}, - {RDC_FI_PROF_FLOPS_32, "FLOPS_32"}, - {RDC_FI_PROF_FLOPS_64, "FLOPS_64"}, + {RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "MEAN_OCCUPANCY_PER_CU"}, + {RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "MEAN_OCCUPANCY_PER_ACTIVE_CU"}, {RDC_FI_PROF_ACTIVE_CYCLES, "ACTIVE_CYCLES"}, {RDC_FI_PROF_ACTIVE_WAVES, "ACTIVE_WAVES"}, {RDC_FI_PROF_ELAPSED_CYCLES, "ELAPSED_CYCLES"}, - {RDC_FI_PROF_FETCH_SIZE, "FETCH_SIZE"}, - {RDC_FI_PROF_WRITE_SIZE, "WRITE_SIZE"}, - {RDC_FI_PROF_GRBM_COUNT, "GRBM_COUNT"}, - {RDC_FI_PROF_SQ_WAVES, "SQ_WAVES"}, - {RDC_FI_PROF_TA_BUSY_AVR, "TA_BUSY_avr"}, + // metrics below are divided by time passed + {RDC_FI_PROF_EVAL_MEM_R_BW, "FETCH_SIZE"}, + {RDC_FI_PROF_EVAL_MEM_W_BW, "WRITE_SIZE"}, + {RDC_FI_PROF_EVAL_FLOPS_16, "TOTAL_16_OPS"}, + {RDC_FI_PROF_EVAL_FLOPS_32, "TOTAL_32_OPS"}, + {RDC_FI_PROF_EVAL_FLOPS_64, "TOTAL_64_OPS"}, }; std::vector unchecked_fields; @@ -333,8 +340,17 @@ rdc_status_t RdcRocpBase::rocp_lookup(uint32_t gpu_index, rdc_field_t field, dou } switch (field) { default: + const auto stop_time = std::chrono::high_resolution_clock::now(); run_profiler(gpu_index, field); *value = metric_to_value[field_to_metric[field]]; + // extra processing required + if (eval_fields.find(field) != eval_fields.end()) { + const auto elapsed = + std::chrono::duration_cast(stop_time - start_time).count(); + RDC_LOG(RDC_DEBUG, "INDEX: " << gpu_index << " before[" << *value << "] after[" + << (*value / elapsed) << "]"); + *value = *value / elapsed; + } break; } return Rocp2RdcError(status);