Fix rocprofiler plugin
- Replace non-working fields with working ones
- Remove CU_OCCUPANCY completely as it isn't well supported
- Fix rocprofiler initialization with unique_ptr and rdc_module_init
- Replace env var ROCPROFILER_METRICS_PATH with ROCP_METRICS
  - ROCPROFILER_METRICS_PATH is only relevant for rocprofv2
  - ROCP_METRICS is only relevant for rocprofv1 (which we are using)
Change-Id: I21e6fa3f0e1694c38f44ca0e5659d672559f7380
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
[ROCm/rdc commit: 20ca2ce574]
This commit is contained in:
@@ -102,18 +102,17 @@ FLD_DESC_ENT(RDC_FI_XGMI_7_WRITE_KB, "XGMI7 accumulated data write size (KB)
|
||||
// This doesn't map to rocprofiler counters directly
|
||||
// See counter_map in rdc/include/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.h
|
||||
// See metrics.xml in rocprofiler
|
||||
FLD_DESC_ENT(RDC_FI_PROF_CU_OCCUPANCY, "Active Waves / maximum Active Waves per CU", "CU_OCCUPANCY", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "Mean occupancy per CU", "MEAN_OCCUPANCY_PER_CU", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "Mean occupancy per active CU", "MEAN_OCCUPANCY_PER_ACTIVE_CU", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "Mean occupancy per CU", "MN_OCC_PER_CU", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "Mean occupancy per active CU", "MN_OCC_PER_ACT_CU", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false)
|
||||
// metrics below are divided by time passed
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "kbps fetched from video memory", "MEM_R_BW", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "kbps written to video memory", "MEM_W_BW", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / second", "FLOPS_16", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / second", "FLOPS_32", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / second", "FLOPS_64", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "Fetched from video memory kb / ms", "MEM_R_BW", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "Written to video memory kb / ms", "MEM_W_BW", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / ms", "FLOPS_16", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / ms", "FLOPS_32", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / ms", "FLOPS_64", false)
|
||||
|
||||
// Events
|
||||
FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false)
|
||||
|
||||
@@ -128,7 +128,6 @@ int run() {
|
||||
field_ids.push_back(RDC_FI_GPU_MEMORY_USAGE);
|
||||
field_ids.push_back(RDC_FI_POWER_USAGE);
|
||||
// profiler metrics
|
||||
field_ids.push_back(RDC_FI_PROF_CU_OCCUPANCY);
|
||||
field_ids.push_back(RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU);
|
||||
field_ids.push_back(RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU);
|
||||
field_ids.push_back(RDC_FI_PROF_ACTIVE_CYCLES);
|
||||
|
||||
@@ -250,13 +250,11 @@ typedef enum {
|
||||
/**
|
||||
* @brief ROC-profiler related fields
|
||||
*/
|
||||
RDC_FI_PROF_CU_OCCUPANCY = 800,
|
||||
RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU,
|
||||
RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU = 800,
|
||||
RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU,
|
||||
RDC_FI_PROF_ACTIVE_CYCLES,
|
||||
RDC_FI_PROF_ACTIVE_WAVES,
|
||||
RDC_FI_PROF_ELAPSED_CYCLES,
|
||||
|
||||
// metrics below are divided by time passed
|
||||
RDC_FI_PROF_EVAL_MEM_R_BW,
|
||||
RDC_FI_PROF_EVAL_MEM_W_BW,
|
||||
|
||||
@@ -56,13 +56,15 @@ class RdcRocpLib : public RdcTelemetry {
|
||||
rdc_field_value_f callback, void* user_data);
|
||||
rdc_status_t (*telemetry_fields_watch_)(rdc_gpu_field_t* fields, uint32_t fields_count);
|
||||
rdc_status_t (*telemetry_fields_unwatch_)(rdc_gpu_field_t* fields, uint32_t fields_count);
|
||||
|
||||
rdc_status_t (*rdc_module_init_)(uint64_t);
|
||||
rdc_status_t (*rdc_module_destroy_)();
|
||||
/**
|
||||
* @brief Extract current ROCM_PATH from library or the environment
|
||||
*/
|
||||
std::string get_rocm_path();
|
||||
/**
|
||||
* @brief Set ROCMTOOLS_METRICS_PATH environment variable needed by
|
||||
* librocprofiler
|
||||
* @brief Set ROCP_METRICS environment variable needed by rocprofiler
|
||||
*/
|
||||
rdc_status_t set_rocprofiler_path();
|
||||
};
|
||||
|
||||
@@ -28,6 +28,7 @@ THE SOFTWARE.
|
||||
#include <string>
|
||||
|
||||
#include "rdc_lib/RdcException.h"
|
||||
#include "rdc_lib/RdcTelemetryLibInterface.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
@@ -37,7 +38,9 @@ RdcRocpLib::RdcRocpLib()
|
||||
: telemetry_fields_query_(nullptr),
|
||||
telemetry_fields_value_get_(nullptr),
|
||||
telemetry_fields_watch_(nullptr),
|
||||
telemetry_fields_unwatch_(nullptr) {
|
||||
telemetry_fields_unwatch_(nullptr),
|
||||
rdc_module_init_(nullptr),
|
||||
rdc_module_destroy_(nullptr) {
|
||||
rdc_status_t status = set_rocprofiler_path();
|
||||
if (status != RDC_ST_OK) {
|
||||
RDC_LOG(RDC_ERROR, "Rocp related function will not work.");
|
||||
@@ -51,6 +54,24 @@ RdcRocpLib::RdcRocpLib()
|
||||
return;
|
||||
}
|
||||
|
||||
status = lib_loader_.load_symbol(&rdc_module_init_, "rdc_module_init");
|
||||
if (status != RDC_ST_OK) {
|
||||
rdc_module_init_ = nullptr;
|
||||
return;
|
||||
}
|
||||
|
||||
status = rdc_module_init_(0);
|
||||
if (status != RDC_ST_OK) {
|
||||
RDC_LOG(RDC_ERROR, "Fail to init librdc_rocp.so:" << rdc_status_string(status)
|
||||
<< ". ROCP related function will not work.");
|
||||
return;
|
||||
}
|
||||
|
||||
status = lib_loader_.load_symbol(&rdc_module_destroy_, "rdc_module_destroy");
|
||||
if (status != RDC_ST_OK) {
|
||||
rdc_module_destroy_ = nullptr;
|
||||
}
|
||||
|
||||
status = lib_loader_.load_symbol(&telemetry_fields_query_, "rdc_telemetry_fields_query");
|
||||
if (status != RDC_ST_OK) {
|
||||
telemetry_fields_query_ = nullptr;
|
||||
@@ -163,37 +184,31 @@ std::string RdcRocpLib::get_rocm_path() {
|
||||
}
|
||||
|
||||
rdc_status_t RdcRocpLib::set_rocprofiler_path() {
|
||||
// librocprofiler64 requires ROCPROFILER_METRICS_PATH to be set
|
||||
// rocprofiler requires ROCP_METRICS to be set
|
||||
std::string rocprofiler_metrics_path =
|
||||
get_rocm_path() + "/libexec/rocprofiler/counters/derived_counters.xml";
|
||||
|
||||
// set rocm prefix
|
||||
int result = setenv("ROCPROFILER_METRICS_PATH", rocprofiler_metrics_path.c_str(), 0);
|
||||
int result = setenv("ROCP_METRICS", rocprofiler_metrics_path.c_str(), 0);
|
||||
if (result != 0) {
|
||||
RDC_LOG(RDC_ERROR, "setenv ROCPROFILER_METRICS_PATH failed! " << result);
|
||||
RDC_LOG(RDC_ERROR, "setenv ROCP_METRICS failed! " << result);
|
||||
return RDC_ST_PERM_ERROR;
|
||||
}
|
||||
|
||||
// check that env exists
|
||||
const char* rocprofiler_metrics_env = getenv("ROCPROFILER_METRICS_PATH");
|
||||
const char* rocprofiler_metrics_env = getenv("ROCP_METRICS");
|
||||
if (rocprofiler_metrics_env == nullptr) {
|
||||
RDC_LOG(RDC_ERROR, "ROCPROFILER_METRICS_PATH is not set!");
|
||||
RDC_LOG(RDC_ERROR, "ROCP_METRICS is not set!");
|
||||
return RDC_ST_NO_DATA;
|
||||
}
|
||||
|
||||
// check that file can be accessed
|
||||
std::ifstream test_file(rocprofiler_metrics_env);
|
||||
if (!test_file.good()) {
|
||||
RDC_LOG(RDC_ERROR, "failed to open ROCPROFILER_METRICS_PATH: " << rocprofiler_metrics_env);
|
||||
RDC_LOG(RDC_ERROR, "failed to open ROCP_METRICS: " << rocprofiler_metrics_env);
|
||||
return RDC_ST_FILE_ERROR;
|
||||
}
|
||||
|
||||
result = setenv("ROCP_METRICS", rocprofiler_metrics_path.c_str(), 0);
|
||||
if (result != 0) {
|
||||
RDC_LOG(RDC_ERROR, "setenv ROCP_METRICS failed! " << result);
|
||||
return RDC_ST_PERM_ERROR;
|
||||
}
|
||||
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
|
||||
@@ -243,7 +243,6 @@ RdcRocpBase::RdcRocpBase() {
|
||||
|
||||
// all fields
|
||||
static const std::map<rdc_field_t, const char*> temp_field_map_k = {
|
||||
{RDC_FI_PROF_CU_OCCUPANCY, "CU_OCCUPANCY"},
|
||||
{RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "MEAN_OCCUPANCY_PER_CU"},
|
||||
{RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "MEAN_OCCUPANCY_PER_ACTIVE_CU"},
|
||||
{RDC_FI_PROF_ACTIVE_CYCLES, "ACTIVE_CYCLES"},
|
||||
|
||||
@@ -36,14 +36,24 @@ THE SOFTWARE.
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
#include "rdc_modules/rdc_rocp/RdcRocpBase.h"
|
||||
|
||||
amd::rdc::RdcRocpBase rocp;
|
||||
std::unique_ptr<amd::rdc::RdcRocpBase> rocp_p;
|
||||
|
||||
rdc_status_t rdc_module_init(uint64_t flags) { return RDC_ST_OK; }
|
||||
// Module entry point: allocates a new amd::rdc::RdcRocpBase and stores it in
// the file-level rocp_p, replacing whatever rocp_p owned before.
// `flags` is not read in this body.
// Returns RDC_ST_OK once the assignment has completed.
rdc_status_t rdc_module_init(uint64_t flags) {
|
||||
rocp_p = std::unique_ptr<amd::rdc::RdcRocpBase>(new amd::rdc::RdcRocpBase);
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
// Module teardown: resets rocp_p, releasing the RdcRocpBase it owns (if any)
// and leaving rocp_p null. Functions in this file that check
// `rocp_p == nullptr` return RDC_ST_FAIL_LOAD_MODULE afterwards.
// Returns RDC_ST_OK.
rdc_status_t rdc_module_destroy() {
|
||||
rocp_p.reset();
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
// get supported field ids
|
||||
rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], uint32_t* field_count) {
|
||||
// extract all keys from counter_map
|
||||
std::vector<rdc_field_t> fields = rocp.get_field_ids();
|
||||
if (rocp_p == nullptr) {
|
||||
return RDC_ST_FAIL_LOAD_MODULE;
|
||||
}
|
||||
std::vector<rdc_field_t> fields = rocp_p->get_field_ids();
|
||||
std::vector<uint32_t> counter_keys(fields.begin(), fields.end());
|
||||
|
||||
*field_count = counter_keys.size();
|
||||
@@ -56,7 +66,10 @@ rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], uint
|
||||
// Fetch
|
||||
rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, const uint32_t fields_count,
|
||||
rdc_field_value_f callback, void* user_data) {
|
||||
//
|
||||
if (rocp_p == nullptr) {
|
||||
return RDC_ST_FAIL_LOAD_MODULE;
|
||||
}
|
||||
|
||||
// Bulk fetch fields
|
||||
std::vector<rdc_gpu_field_value_t> bulk_results;
|
||||
|
||||
@@ -81,7 +94,7 @@ rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, const uint3
|
||||
bulk_count = 0;
|
||||
}
|
||||
|
||||
status = rocp.rocp_lookup(fields[i].gpu_index, fields[i].field_id, &data);
|
||||
status = rocp_p->rocp_lookup(fields[i].gpu_index, fields[i].field_id, &data);
|
||||
// get value
|
||||
values[bulk_count].gpu_index = fields[i].gpu_index;
|
||||
values[bulk_count].field_value.type = DOUBLE;
|
||||
|
||||
@@ -83,7 +83,7 @@ void TestRdciGroup::Run(void) {
|
||||
ASSERT_EQ(result, RDC_ST_OK);
|
||||
}
|
||||
|
||||
rdc_gpu_group_t group_id;
|
||||
rdc_gpu_group_t group_id = 0;
|
||||
uint32_t count = 0;
|
||||
rdc_group_info_t group_info;
|
||||
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES];
|
||||
|
||||
Reference in New Issue
Block a user