From 29b86095ede4c98d23caeabfb692ede5208ccbcd Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Tue, 4 Jun 2024 19:37:42 -0500 Subject: [PATCH] Fix rocprofiler plugin - Replace non-working fields with working ones - remove CU_OCCUPANCY completely as it isn't well supported - Fix rocprofiler initialization with unique_ptr and rdc_module_init - Replace env var ROCPROFILER_METRICS_PATH with ROCP_METRICS - ROCPROFILER_METRICS_PATH is only relevant for rocprofv2 - ROCP_METRICS is only relevant for rocprofv1 (which we are using) Change-Id: I21e6fa3f0e1694c38f44ca0e5659d672559f7380 Signed-off-by: Galantsev, Dmitrii [ROCm/rdc commit: 20ca2ce574bf6c8d17dfc656f53f840f775852fe] --- projects/rdc/common/rdc_field.data | 21 +++++----- projects/rdc/example/rocprofiler_example.cc | 1 - projects/rdc/include/rdc/rdc.h | 4 +- .../rdc/include/rdc_lib/impl/RdcRocpLib.h | 6 ++- projects/rdc/rdc_libs/rdc/src/RdcRocpLib.cc | 41 +++++++++++++------ .../rdc_modules/rdc_rocp/RdcRocpBase.cc | 1 - .../rdc_modules/rdc_rocp/RdcTelemetryLib.cc | 23 ++++++++--- .../tests/rdc_tests/functional/rdci_group.cc | 2 +- 8 files changed, 62 insertions(+), 37 deletions(-) diff --git a/projects/rdc/common/rdc_field.data b/projects/rdc/common/rdc_field.data index e10d57b266..a74e582353 100644 --- a/projects/rdc/common/rdc_field.data +++ b/projects/rdc/common/rdc_field.data @@ -102,18 +102,17 @@ FLD_DESC_ENT(RDC_FI_XGMI_7_WRITE_KB, "XGMI7 accumulated data write size (KB) // This doesn't map to rocprofiler counters directly // See counter_map in rdc/include/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.h // See metrics.xml in rocprofiler -FLD_DESC_ENT(RDC_FI_PROF_CU_OCCUPANCY, "Active Waves / maximum Active Waves per CU", "CU_OCCUPANCY", false) -FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "Mean occupancy per CU", "MEAN_OCCUPANCY_PER_CU", false) -FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "Mean occupancy per active CU", "MEAN_OCCUPANCY_PER_ACTIVE_CU", false)
-FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false) -FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false) -FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false) +FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "Mean occupancy per CU", "MN_OCC_PER_CU", false) +FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "Mean occupancy per active CU", "MN_OCC_PER_ACT_CU", false) +FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false) +FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false) +FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false) // metrics below are divided by time passed -FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "kbps fetched from video memory", "MEM_R_BW", false) -FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "kbps written to video memory", "MEM_W_BW", false) -FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / second", "FLOPS_16", false) -FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / second", "FLOPS_32", false) -FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / second", "FLOPS_64", false) +FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "Fetched from video memory kb / ms", "MEM_R_BW", false) +FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "Written to video memory kb / ms", "MEM_W_BW", false) +FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / ms", "FLOPS_16", false) +FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / ms", "FLOPS_32", false) +FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / ms", "FLOPS_64", false) // Events FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false) diff --git a/projects/rdc/example/rocprofiler_example.cc b/projects/rdc/example/rocprofiler_example.cc index 6219099ed0..48c69e94c5 100644 --- 
a/projects/rdc/example/rocprofiler_example.cc +++ b/projects/rdc/example/rocprofiler_example.cc @@ -128,7 +128,6 @@ int run() { field_ids.push_back(RDC_FI_GPU_MEMORY_USAGE); field_ids.push_back(RDC_FI_POWER_USAGE); // profiler metrics - field_ids.push_back(RDC_FI_PROF_CU_OCCUPANCY); field_ids.push_back(RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU); field_ids.push_back(RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU); field_ids.push_back(RDC_FI_PROF_ACTIVE_CYCLES); diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h index 99c7c16d74..d6ee5fc12b 100644 --- a/projects/rdc/include/rdc/rdc.h +++ b/projects/rdc/include/rdc/rdc.h @@ -250,13 +250,11 @@ typedef enum { /** * @brief ROC-profiler related fields */ - RDC_FI_PROF_CU_OCCUPANCY = 800, - RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, + RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU = 800, RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, RDC_FI_PROF_ACTIVE_CYCLES, RDC_FI_PROF_ACTIVE_WAVES, RDC_FI_PROF_ELAPSED_CYCLES, - // metrics below are divided by time passed RDC_FI_PROF_EVAL_MEM_R_BW, RDC_FI_PROF_EVAL_MEM_W_BW, diff --git a/projects/rdc/include/rdc_lib/impl/RdcRocpLib.h b/projects/rdc/include/rdc_lib/impl/RdcRocpLib.h index 95897ce2cf..8501e3fe76 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcRocpLib.h +++ b/projects/rdc/include/rdc_lib/impl/RdcRocpLib.h @@ -56,13 +56,15 @@ class RdcRocpLib : public RdcTelemetry { rdc_field_value_f callback, void* user_data); rdc_status_t (*telemetry_fields_watch_)(rdc_gpu_field_t* fields, uint32_t fields_count); rdc_status_t (*telemetry_fields_unwatch_)(rdc_gpu_field_t* fields, uint32_t fields_count); + + rdc_status_t (*rdc_module_init_)(uint64_t); + rdc_status_t (*rdc_module_destroy_)(); /** * @brief Extract current ROCM_PATH from library or the environment */ std::string get_rocm_path(); /** - * @brief Set ROCMTOOLS_METRICS_PATH environment variable needed by - * librocprofiler + * @brief Set ROCP_METRICS environment variable needed by rocprofiler */ rdc_status_t set_rocprofiler_path(); }; diff 
--git a/projects/rdc/rdc_libs/rdc/src/RdcRocpLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcRocpLib.cc index 347e587101..de91e5e25f 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcRocpLib.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcRocpLib.cc @@ -28,6 +28,7 @@ THE SOFTWARE. #include #include "rdc_lib/RdcException.h" +#include "rdc_lib/RdcTelemetryLibInterface.h" namespace amd { namespace rdc { @@ -37,7 +38,9 @@ RdcRocpLib::RdcRocpLib() : telemetry_fields_query_(nullptr), telemetry_fields_value_get_(nullptr), telemetry_fields_watch_(nullptr), - telemetry_fields_unwatch_(nullptr) { + telemetry_fields_unwatch_(nullptr), + rdc_module_init_(nullptr), + rdc_module_destroy_(nullptr) { rdc_status_t status = set_rocprofiler_path(); if (status != RDC_ST_OK) { RDC_LOG(RDC_ERROR, "Rocp related function will not work."); @@ -51,6 +54,24 @@ RdcRocpLib::RdcRocpLib() return; } + status = lib_loader_.load_symbol(&rdc_module_init_, "rdc_module_init"); + if (status != RDC_ST_OK) { + rdc_module_init_ = nullptr; + return; + } + + status = rdc_module_init_(0); + if (status != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, "Fail to init librdc_rocp.so:" << rdc_status_string(status) + << ". 
ROCP related function will not work."); + return; + } + + status = lib_loader_.load_symbol(&rdc_module_destroy_, "rdc_module_destroy"); + if (status != RDC_ST_OK) { + rdc_module_destroy_ = nullptr; + } + status = lib_loader_.load_symbol(&telemetry_fields_query_, "rdc_telemetry_fields_query"); if (status != RDC_ST_OK) { telemetry_fields_query_ = nullptr; @@ -163,37 +184,31 @@ std::string RdcRocpLib::get_rocm_path() { } rdc_status_t RdcRocpLib::set_rocprofiler_path() { - // librocprofiler64 requires ROCPROFILER_METRICS_PATH to be set + // rocprofiler requires ROCP_METRICS to be set std::string rocprofiler_metrics_path = get_rocm_path() + "/libexec/rocprofiler/counters/derived_counters.xml"; // set rocm prefix - int result = setenv("ROCPROFILER_METRICS_PATH", rocprofiler_metrics_path.c_str(), 0); + int result = setenv("ROCP_METRICS", rocprofiler_metrics_path.c_str(), 0); if (result != 0) { - RDC_LOG(RDC_ERROR, "setenv ROCPROFILER_METRICS_PATH failed! " << result); + RDC_LOG(RDC_ERROR, "setenv ROCP_METRICS failed! " << result); return RDC_ST_PERM_ERROR; } // check that env exists - const char* rocprofiler_metrics_env = getenv("ROCPROFILER_METRICS_PATH"); + const char* rocprofiler_metrics_env = getenv("ROCP_METRICS"); if (rocprofiler_metrics_env == nullptr) { - RDC_LOG(RDC_ERROR, "ROCPROFILER_METRICS_PATH is not set!"); + RDC_LOG(RDC_ERROR, "ROCP_METRICS is not set!"); return RDC_ST_NO_DATA; } // check that file can be accessed std::ifstream test_file(rocprofiler_metrics_env); if (!test_file.good()) { - RDC_LOG(RDC_ERROR, "failed to open ROCPROFILER_METRICS_PATH: " << rocprofiler_metrics_env); + RDC_LOG(RDC_ERROR, "failed to open ROCP_METRICS: " << rocprofiler_metrics_env); return RDC_ST_FILE_ERROR; } - result = setenv("ROCP_METRICS", rocprofiler_metrics_path.c_str(), 0); - if (result != 0) { - RDC_LOG(RDC_ERROR, "setenv ROCP_METRICS failed! 
" << result); - return RDC_ST_PERM_ERROR; - } - return RDC_ST_OK; } diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc index a304640854..299beb8301 100644 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc @@ -243,7 +243,6 @@ RdcRocpBase::RdcRocpBase() { // all fields static const std::map temp_field_map_k = { - {RDC_FI_PROF_CU_OCCUPANCY, "CU_OCCUPANCY"}, {RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "MEAN_OCCUPANCY_PER_CU"}, {RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "MEAN_OCCUPANCY_PER_ACTIVE_CU"}, {RDC_FI_PROF_ACTIVE_CYCLES, "ACTIVE_CYCLES"}, diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcTelemetryLib.cc b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcTelemetryLib.cc index efbf665d70..bb24ca0c1a 100644 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcTelemetryLib.cc +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcTelemetryLib.cc @@ -36,14 +36,24 @@ THE SOFTWARE. 
#include "rdc_lib/rdc_common.h" #include "rdc_modules/rdc_rocp/RdcRocpBase.h" -amd::rdc::RdcRocpBase rocp; +std::unique_ptr rocp_p; -rdc_status_t rdc_module_init(uint64_t flags) { return RDC_ST_OK; } +rdc_status_t rdc_module_init(uint64_t flags) { + rocp_p = std::unique_ptr(new amd::rdc::RdcRocpBase); + return RDC_ST_OK; +} +rdc_status_t rdc_module_destroy() { + rocp_p.reset(); + return RDC_ST_OK; +} // get supported field ids rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], uint32_t* field_count) { // extract all keys from counter_map - std::vector fields = rocp.get_field_ids(); + if (rocp_p == nullptr) { + return RDC_ST_FAIL_LOAD_MODULE; + } + std::vector fields = rocp_p->get_field_ids(); std::vector counter_keys(fields.begin(), fields.end()); *field_count = counter_keys.size(); @@ -56,7 +66,10 @@ rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], uint // Fetch rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, const uint32_t fields_count, rdc_field_value_f callback, void* user_data) { - // + if (rocp_p == nullptr) { + return RDC_ST_FAIL_LOAD_MODULE; + } + // Bulk fetch fields std::vector bulk_results; @@ -81,7 +94,7 @@ rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, const uint3 bulk_count = 0; } - status = rocp.rocp_lookup(fields[i].gpu_index, fields[i].field_id, &data); + status = rocp_p->rocp_lookup(fields[i].gpu_index, fields[i].field_id, &data); // get value values[bulk_count].gpu_index = fields[i].gpu_index; values[bulk_count].field_value.type = DOUBLE; diff --git a/projects/rdc/tests/rdc_tests/functional/rdci_group.cc b/projects/rdc/tests/rdc_tests/functional/rdci_group.cc index 6a858aca38..f9341755fa 100644 --- a/projects/rdc/tests/rdc_tests/functional/rdci_group.cc +++ b/projects/rdc/tests/rdc_tests/functional/rdci_group.cc @@ -83,7 +83,7 @@ void TestRdciGroup::Run(void) { ASSERT_EQ(result, RDC_ST_OK); } - rdc_gpu_group_t group_id; + rdc_gpu_group_t group_id = 0; 
uint32_t count = 0; rdc_group_info_t group_info; uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES];