Fix rocprofiler plugin

- Replace non-working fields with working ones
    - remove CU_OCCUPANCY completely as it isn't well supported
- Fix rocprofiler initialization with unique_ptr and rdc_module_init
- Replace env var ROCPROFILER_METRICS_PATH with ROCP_METRICS
    - ROCPROFILER_METRICS_PATH is only relevant for rocprofv2
    - ROCP_METRICS is only relevant for rocprofv1 (which we are using)

Change-Id: I21e6fa3f0e1694c38f44ca0e5659d672559f7380
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>


[ROCm/rdc commit: 20ca2ce574]
This commit is contained in:
Galantsev, Dmitrii
2024-06-04 19:37:42 -05:00
parent c2a75bbe4c
commit 29b86095ed
8 changed files with 62 additions and 37 deletions
+10 -11
View File
@@ -102,18 +102,17 @@ FLD_DESC_ENT(RDC_FI_XGMI_7_WRITE_KB, "XGMI7 accumulated data write size (KB)
// This doesn't map to rocprofiler counters directly
// See counter_map in rdc/include/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.h
// See metrics.xml in rocprofiler
FLD_DESC_ENT(RDC_FI_PROF_CU_OCCUPANCY, "Active Waves / maximum Active Waves per CU", "CU_OCCUPANCY", false)
FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "Mean occupancy per CU", "MEAN_OCCUPANCY_PER_CU", false)
FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "Mean occupancy per active CU", "MEAN_OCCUPANCY_PER_ACTIVE_CU", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false)
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false)
FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "Mean occupancy per CU", "MN_OCC_PER_CU", false)
FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "Mean occupancy per active CU", "MN_OCC_PER_ACT_CU", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false)
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false)
// metrics below are divided by time passed
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "kbps fetched from video memory", "MEM_R_BW", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "kbps written to video memory", "MEM_W_BW", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / second", "FLOPS_16", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / second", "FLOPS_32", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / second", "FLOPS_64", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "Fetched from video memory kb / ms", "MEM_R_BW", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "Written to video memory kb / ms", "MEM_W_BW", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / ms", "FLOPS_16", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / ms", "FLOPS_32", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / ms", "FLOPS_64", false)
// Events
FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false)
@@ -128,7 +128,6 @@ int run() {
field_ids.push_back(RDC_FI_GPU_MEMORY_USAGE);
field_ids.push_back(RDC_FI_POWER_USAGE);
// profiler metrics
field_ids.push_back(RDC_FI_PROF_CU_OCCUPANCY);
field_ids.push_back(RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU);
field_ids.push_back(RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU);
field_ids.push_back(RDC_FI_PROF_ACTIVE_CYCLES);
+1 -3
View File
@@ -250,13 +250,11 @@ typedef enum {
/**
* @brief ROC-profiler related fields
*/
RDC_FI_PROF_CU_OCCUPANCY = 800,
RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU,
RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU = 800,
RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU,
RDC_FI_PROF_ACTIVE_CYCLES,
RDC_FI_PROF_ACTIVE_WAVES,
RDC_FI_PROF_ELAPSED_CYCLES,
// metrics below are divided by time passed
RDC_FI_PROF_EVAL_MEM_R_BW,
RDC_FI_PROF_EVAL_MEM_W_BW,
@@ -56,13 +56,15 @@ class RdcRocpLib : public RdcTelemetry {
rdc_field_value_f callback, void* user_data);
rdc_status_t (*telemetry_fields_watch_)(rdc_gpu_field_t* fields, uint32_t fields_count);
rdc_status_t (*telemetry_fields_unwatch_)(rdc_gpu_field_t* fields, uint32_t fields_count);
rdc_status_t (*rdc_module_init_)(uint64_t);
rdc_status_t (*rdc_module_destroy_)();
/**
* @brief Extract current ROCM_PATH from library or the environment
*/
std::string get_rocm_path();
/**
* @brief Set ROCMTOOLS_METRICS_PATH environment variable needed by
* librocprofiler
* @brief Set ROCP_METRICS environment variable needed by rocprofiler
*/
rdc_status_t set_rocprofiler_path();
};
+28 -13
View File
@@ -28,6 +28,7 @@ THE SOFTWARE.
#include <string>
#include "rdc_lib/RdcException.h"
#include "rdc_lib/RdcTelemetryLibInterface.h"
namespace amd {
namespace rdc {
@@ -37,7 +38,9 @@ RdcRocpLib::RdcRocpLib()
: telemetry_fields_query_(nullptr),
telemetry_fields_value_get_(nullptr),
telemetry_fields_watch_(nullptr),
telemetry_fields_unwatch_(nullptr) {
telemetry_fields_unwatch_(nullptr),
rdc_module_init_(nullptr),
rdc_module_destroy_(nullptr) {
rdc_status_t status = set_rocprofiler_path();
if (status != RDC_ST_OK) {
RDC_LOG(RDC_ERROR, "Rocp related function will not work.");
@@ -51,6 +54,24 @@ RdcRocpLib::RdcRocpLib()
return;
}
status = lib_loader_.load_symbol(&rdc_module_init_, "rdc_module_init");
if (status != RDC_ST_OK) {
rdc_module_init_ = nullptr;
return;
}
status = rdc_module_init_(0);
if (status != RDC_ST_OK) {
RDC_LOG(RDC_ERROR, "Fail to init librdc_rocp.so:" << rdc_status_string(status)
<< ". ROCP related function will not work.");
return;
}
status = lib_loader_.load_symbol(&rdc_module_destroy_, "rdc_module_destroy");
if (status != RDC_ST_OK) {
rdc_module_destroy_ = nullptr;
}
status = lib_loader_.load_symbol(&telemetry_fields_query_, "rdc_telemetry_fields_query");
if (status != RDC_ST_OK) {
telemetry_fields_query_ = nullptr;
@@ -163,37 +184,31 @@ std::string RdcRocpLib::get_rocm_path() {
}
/**
 * @brief Point rocprofiler at its metrics definition file via the environment.
 *
 * Builds "<rocm path>/libexec/rocprofiler/counters/derived_counters.xml" from
 * get_rocm_path(), exports it as a default value, and then verifies that the
 * value actually present in the environment names a file that can be opened.
 *
 * @return RDC_ST_OK on success, RDC_ST_PERM_ERROR if setenv() fails,
 *         RDC_ST_NO_DATA if the variable is still unset afterwards,
 *         RDC_ST_FILE_ERROR if the file cannot be opened.
 *
 * NOTE(review): this listing appears to show each superseded
 * ROCPROFILER_METRICS_PATH line directly followed by its ROCP_METRICS
 * replacement - confirm against the checked-in file before editing.
 */
rdc_status_t RdcRocpLib::set_rocprofiler_path() {
// librocprofiler64 requires ROCPROFILER_METRICS_PATH to be set
// rocprofiler requires ROCP_METRICS to be set
std::string rocprofiler_metrics_path =
get_rocm_path() + "/libexec/rocprofiler/counters/derived_counters.xml";
// export the computed path as a default only: the overwrite argument is 0,
// so a value already present in the environment is left untouched
int result = setenv("ROCPROFILER_METRICS_PATH", rocprofiler_metrics_path.c_str(), 0);
int result = setenv("ROCP_METRICS", rocprofiler_metrics_path.c_str(), 0);
if (result != 0) {
RDC_LOG(RDC_ERROR, "setenv ROCPROFILER_METRICS_PATH failed! " << result);
RDC_LOG(RDC_ERROR, "setenv ROCP_METRICS failed! " << result);
return RDC_ST_PERM_ERROR;
}
// check that env exists
const char* rocprofiler_metrics_env = getenv("ROCPROFILER_METRICS_PATH");
const char* rocprofiler_metrics_env = getenv("ROCP_METRICS");
if (rocprofiler_metrics_env == nullptr) {
RDC_LOG(RDC_ERROR, "ROCPROFILER_METRICS_PATH is not set!");
RDC_LOG(RDC_ERROR, "ROCP_METRICS is not set!");
return RDC_ST_NO_DATA;
}
// check that file can be accessed; the path tested is the one read back from
// the environment, which may be a pre-existing value rather than the path
// computed above
std::ifstream test_file(rocprofiler_metrics_env);
if (!test_file.good()) {
RDC_LOG(RDC_ERROR, "failed to open ROCPROFILER_METRICS_PATH: " << rocprofiler_metrics_env);
RDC_LOG(RDC_ERROR, "failed to open ROCP_METRICS: " << rocprofiler_metrics_env);
return RDC_ST_FILE_ERROR;
}
result = setenv("ROCP_METRICS", rocprofiler_metrics_path.c_str(), 0);
if (result != 0) {
RDC_LOG(RDC_ERROR, "setenv ROCP_METRICS failed! " << result);
return RDC_ST_PERM_ERROR;
}
return RDC_ST_OK;
}
@@ -243,7 +243,6 @@ RdcRocpBase::RdcRocpBase() {
// all fields
static const std::map<rdc_field_t, const char*> temp_field_map_k = {
{RDC_FI_PROF_CU_OCCUPANCY, "CU_OCCUPANCY"},
{RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "MEAN_OCCUPANCY_PER_CU"},
{RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "MEAN_OCCUPANCY_PER_ACTIVE_CU"},
{RDC_FI_PROF_ACTIVE_CYCLES, "ACTIVE_CYCLES"},
@@ -36,14 +36,24 @@ THE SOFTWARE.
#include "rdc_lib/rdc_common.h"
#include "rdc_modules/rdc_rocp/RdcRocpBase.h"
// NOTE(review): the by-value `rocp` object and the one-line rdc_module_init
// in this group look like the superseded side of the change (the pointer
// `rocp_p` and a multi-statement rdc_module_init appear alongside them) -
// confirm against the checked-in file.
amd::rdc::RdcRocpBase rocp;
// Module-wide profiler instance held by pointer; it is default-constructed
// here, i.e. empty, so nothing is created at load time.
std::unique_ptr<amd::rdc::RdcRocpBase> rocp_p;
// No-op initializer: ignores `flags` and reports success.
rdc_status_t rdc_module_init(uint64_t flags) { return RDC_ST_OK; }
/**
 * @brief Create the module-wide RdcRocpBase instance.
 *
 * A fresh RdcRocpBase is default-constructed on the heap and ownership is
 * handed to rocp_p, replacing whatever it held before.
 *
 * @param flags not read by this function.
 * @return always RDC_ST_OK.
 */
rdc_status_t rdc_module_init(uint64_t flags) {
  // The new object is fully constructed before rocp_p lets go of any
  // previous one, so a failing constructor leaves rocp_p unchanged.
  rocp_p.reset(new amd::rdc::RdcRocpBase);
  return RDC_ST_OK;
}
/**
 * @brief Tear down the module-wide RdcRocpBase instance.
 *
 * Destroys the object currently owned by rocp_p, if any, and leaves the
 * pointer empty.
 *
 * @return always RDC_ST_OK.
 */
rdc_status_t rdc_module_destroy() {
  rocp_p = nullptr;
  return RDC_ST_OK;
}
// get supported field ids
rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], uint32_t* field_count) {
// extract all keys from counter_map
std::vector<rdc_field_t> fields = rocp.get_field_ids();
if (rocp_p == nullptr) {
return RDC_ST_FAIL_LOAD_MODULE;
}
std::vector<rdc_field_t> fields = rocp_p->get_field_ids();
std::vector<uint32_t> counter_keys(fields.begin(), fields.end());
*field_count = counter_keys.size();
@@ -56,7 +66,10 @@ rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], uint
// Fetch
rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, const uint32_t fields_count,
rdc_field_value_f callback, void* user_data) {
//
if (rocp_p == nullptr) {
return RDC_ST_FAIL_LOAD_MODULE;
}
// Bulk fetch fields
std::vector<rdc_gpu_field_value_t> bulk_results;
@@ -81,7 +94,7 @@ rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, const uint3
bulk_count = 0;
}
status = rocp.rocp_lookup(fields[i].gpu_index, fields[i].field_id, &data);
status = rocp_p->rocp_lookup(fields[i].gpu_index, fields[i].field_id, &data);
// get value
values[bulk_count].gpu_index = fields[i].gpu_index;
values[bulk_count].field_value.type = DOUBLE;
@@ -83,7 +83,7 @@ void TestRdciGroup::Run(void) {
ASSERT_EQ(result, RDC_ST_OK);
}
rdc_gpu_group_t group_id;
rdc_gpu_group_t group_id = 0;
uint32_t count = 0;
rdc_group_info_t group_info;
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES];