Remove rocmtools environment variable

- Set ROCMTOOLS_METRICS_PATH inside rdcd
- Add nullptr checks for rocmtools library functions

Change-Id: Ibbe4fed90df20e68b1a7971533765d831860c16f
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
此提交包含在:
Galantsev, Dmitrii
2023-01-16 15:23:01 -06:00
父節點 6687239cff
當前提交 35edaa2322
共有 4 個檔案被更改,包括 106 行新增和 14 行删除
+3 -7
查看文件
@@ -113,10 +113,7 @@ For an RDC client application to monitor and/or control a remote system, the RDC
When *rdcd* is started from the command line, the *capabilities* are determined by the privileges of the *user* starting *rdcd*.
```bash
## Note that this environment variable is automatically set in rdc.service
## If RDC_FI_PROF_* metrics are required - you MUST export ROCMTOOLS_METRICS_PATH before starting rdcd
## NOTE: Replace /opt/rocm with specific rocm version if needed
export ROCMTOOLS_METRICS_PATH=/opt/rocm/libexec/rocmtools/counters/derived_counters.xml
## To run with authentication. Ensure SSL keys are setup properly
## version will be the version number (ex: 3.10.0) of the ROCm release that RDC was packaged with
@@ -188,9 +185,8 @@ ERROR, INFO, DEBUG logging levels are supported
- Reading `RDC_FI_PROF_*` crashes rdcd
- All `RDC_FI_PROF_*` metrics return N/A
1. Is `ROCMTOOLS_METRICS_PATH` set?
2. Does your GPU support selected fields?
1. Does your GPU support selected fields?
Field 700 (`RDC_FI_PROF_ELAPSED_CYCLES`) is supposed to be accessible on most GPUs.
Others are mostly intended for MI series.
3. Set `RDC_LOG=DEBUG` as stated above
4. Is rocmtools installed? Can you find `librocmtools.so`?
2. Set `RDC_LOG=DEBUG` as stated above
3. Is rocmtools installed? Can you find `librocmtools.so`?
+11 -2
查看文件
@@ -56,8 +56,6 @@ class RdcRocpLib : public RdcTelemetry {
rdc_gpu_field_t* fields,
uint32_t fields_count) override;
uint64_t get_profiler_version();
explicit RdcRocpLib(const char* lib_name);
~RdcRocpLib();
@@ -82,6 +80,17 @@ class RdcRocpLib : public RdcTelemetry {
rdc_status_t (*telemetry_fields_unwatch_)(
rdc_gpu_field_t* fields,
uint32_t fields_count);
/**
* @brief Extract current ROCM_PATH from library or the environment
*/
std::string get_rocm_path();
/**
* @brief Set ROCMTOOLS_METRICS_PATH environment variable needed by
* librocmtools
*/
rdc_status_t set_rocmtools_path();
};
using RdcRocpLibPtr = std::shared_ptr<RdcRocpLib>;
+92
查看文件
@@ -22,7 +22,11 @@ THE SOFTWARE.
#include "rdc_lib/impl/RdcRocpLib.h"
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <functional>
#include <string>
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
@@ -42,6 +46,12 @@ RdcRocpLib::RdcRocpLib(const char* lib_name)
return;
}
status = set_rocmtools_path();
if (status != RDC_ST_OK) {
RDC_LOG(RDC_ERROR, "Rocp related function will not work.");
return;
}
status = lib_loader_.load_symbol(
&telemetry_fields_query_, "rdc_telemetry_fields_query");
if (status != RDC_ST_OK) {
@@ -76,6 +86,9 @@ rdc_status_t RdcRocpLib::rdc_telemetry_fields_query(
if (field_count == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
if (telemetry_fields_query_ == nullptr) {
return RDC_ST_FAIL_LOAD_MODULE;
}
return telemetry_fields_query_(field_ids, field_count);
}
@@ -89,6 +102,9 @@ rdc_status_t RdcRocpLib::rdc_telemetry_fields_value_get(
if (fields == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
if (telemetry_fields_value_get_ == nullptr) {
return RDC_ST_FAIL_LOAD_MODULE;
}
RDC_LOG(RDC_DEBUG, "Fetch " << fields_count << " fields from rocp_lib.");
@@ -102,6 +118,10 @@ rdc_status_t RdcRocpLib::rdc_telemetry_fields_watch(
if (fields == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
if (telemetry_fields_watch_ == nullptr) {
return RDC_ST_FAIL_LOAD_MODULE;
}
return telemetry_fields_watch_(fields, fields_count);
}
@@ -111,8 +131,80 @@ rdc_status_t RdcRocpLib::rdc_telemetry_fields_unwatch(
if (fields == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
if (telemetry_fields_unwatch_ == nullptr) {
return RDC_ST_FAIL_LOAD_MODULE;
}
return telemetry_fields_unwatch_(fields, fields_count);
}
/**
 * Determine the ROCm prefix to use for locating rocmtools data files.
 *
 * The fallback is the ROCM_PATH environment variable when it is set and
 * "/opt/rocm" otherwise. If /proc/self/maps can be opened and one of its
 * lines mentions "librocmtools.so", the directory portion of that entry
 * (everything after the last space before the library name) with ".."
 * appended is returned instead of the fallback.
 */
std::string RdcRocpLib::get_rocm_path() {
  // Fallback used whenever the mapped library cannot be located.
  const char* env_prefix = getenv("ROCM_PATH");
  const std::string fallback = (env_prefix != nullptr) ? env_prefix : "/opt/rocm";

  std::ifstream maps("/proc/self/maps");
  if (!maps.is_open()) {
    return fallback;
  }

  for (std::string entry; std::getline(maps, entry);) {
    const std::string::size_type name_pos = entry.find("librocmtools.so");
    if (name_pos == std::string::npos) {
      // this mapping does not belong to the library
      continue;
    }

    // The path begins right after the last space preceding the library
    // name, or at the start of the line when there is no such space.
    const std::string::size_type last_space =
        (name_pos == 0) ? std::string::npos : entry.rfind(' ', name_pos - 1);
    const std::string::size_type dir_pos =
        (last_space == std::string::npos) ? 0 : last_space + 1;

    // Keep the directory (trailing separator included), drop the library
    // name; appending ".." yields "/opt/rocm/lib/.." or similar.
    return entry.substr(dir_pos, name_pos - dir_pos) + "..";
  }

  return fallback;
}
/**
 * Make sure ROCMTOOLS_METRICS_PATH is present in the environment and names
 * a file that can be opened.
 *
 * The default value is built from get_rocm_path(); setenv is called with
 * overwrite disabled, so a value already present in the environment is
 * kept as is.
 *
 * @return RDC_ST_OK on success,
 *         RDC_ST_PERM_ERROR when setenv fails,
 *         RDC_ST_NO_DATA when the variable is still unset afterwards,
 *         RDC_ST_FILE_ERROR when the referenced file cannot be opened.
 */
rdc_status_t RdcRocpLib::set_rocmtools_path() {
  // librocmtools requires ROCMTOOLS_METRICS_PATH to be set
  const std::string default_metrics_file =
      get_rocm_path() + "/libexec/rocmtools/counters/derived_counters.xml";

  // last argument 0: do not replace an existing value
  const int rc =
      setenv("ROCMTOOLS_METRICS_PATH", default_metrics_file.c_str(), 0);
  if (rc != 0) {
    RDC_LOG(RDC_ERROR, "setenv ROCMTOOLS_METRICS_PATH failed! " << rc);
    return RDC_ST_PERM_ERROR;
  }

  // read back whichever value is now in effect
  const char* effective_path = getenv("ROCMTOOLS_METRICS_PATH");
  if (effective_path == nullptr) {
    RDC_LOG(RDC_ERROR, "ROCMTOOLS_METRICS_PATH is not set!");
    return RDC_ST_NO_DATA;
  }

  // probe the file to confirm it is accessible
  std::ifstream probe(effective_path);
  if (probe.good()) {
    return RDC_ST_OK;
  }

  RDC_LOG(
      RDC_ERROR,
      "failed to open ROCMTOOLS_METRICS_PATH: " << effective_path);
  return RDC_ST_FILE_ERROR;
}
} // namespace rdc
} // namespace amd
-5
查看文件
@@ -18,11 +18,6 @@ Group=rdc
Type=simple
# Needed for accessing fields 700-799 provided by ROCMTools
# At the time of writing - rdcd will crash if these metrics are accessed
# without the following environment variable
Environment=ROCMTOOLS_METRICS_PATH=/@CPACK_PACKAGING_INSTALL_PREFIX@/libexec/rocmtools/counters/derived_counters.xml
CapabilityBoundingSet=CAP_DAC_OVERRIDE
AmbientCapabilities=CAP_DAC_OVERRIDE