Remove rocmtools environment variable
- Set ROCMTOOLS_METRICS_PATH inside rdcd - Add nullptr checks for rocmtools library functions Change-Id: Ibbe4fed90df20e68b1a7971533765d831860c16f Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
此提交包含在:
@@ -113,10 +113,7 @@ For an RDC client application to monitor and/or control a remote system, the RDC
|
||||
When *rdcd* is started from a command-line the *capabilities* are determined by privilege of the *user* starting *rdcd*
|
||||
|
||||
```bash
|
||||
## Note that this environment variable is automatically set in rdc.service
|
||||
## If RDC_FI_PROF_* metrics are required - you MUST export ROCMTOOLS_METRICS_PATH before starting rdcd
|
||||
## NOTE: Replace /opt/rocm with specific rocm version if needed
|
||||
export ROCMTOOLS_METRICS_PATH=/opt/rocm/libexec/rocmtools/counters/derived_counters.xml
|
||||
|
||||
## To run with authentication. Ensure SSL keys are setup properly
|
||||
## version will be the version number(ex:3.10.0) of ROCm where RDC was packaged with
|
||||
@@ -188,9 +185,8 @@ ERROR, INFO, DEBUG logging levels are supported
|
||||
- Reading `RDC_FI_PROF_*` crashes rdcd
|
||||
- All `RDC_FI_PROF_*` metrics return N/A
|
||||
|
||||
1. Is `ROCMTOOLS_METRICS_PATH` set?
|
||||
2. Does your GPU support selected fields?
|
||||
1. Does your GPU support selected fields?
|
||||
Field 700 (`RDC_FI_PROF_ELAPSED_CYCLES`) is supposed to be accessible on most GPUs.
|
||||
Others are mostly intended for MI series.
|
||||
3. Set `RDC_LOG=DEBUG` as stated above
|
||||
4. Is rocmtools installed? Can you find `librocmtools.so`?
|
||||
2. Set `RDC_LOG=DEBUG` as stated above
|
||||
3. Is rocmtools installed? Can you find `librocmtools.so`?
|
||||
|
||||
@@ -56,8 +56,6 @@ class RdcRocpLib : public RdcTelemetry {
|
||||
rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count) override;
|
||||
|
||||
uint64_t get_profiler_version();
|
||||
|
||||
explicit RdcRocpLib(const char* lib_name);
|
||||
|
||||
~RdcRocpLib();
|
||||
@@ -82,6 +80,17 @@ class RdcRocpLib : public RdcTelemetry {
|
||||
rdc_status_t (*telemetry_fields_unwatch_)(
|
||||
rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count);
|
||||
|
||||
/**
|
||||
* @brief Determine the ROCm install prefix from the path of the loaded librocmtools.so, falling back to the ROCM_PATH environment variable or /opt/rocm
|
||||
*/
|
||||
std::string get_rocm_path();
|
||||
|
||||
/**
|
||||
* @brief Set ROCMTOOLS_METRICS_PATH environment variable needed by
|
||||
* librocmtools
|
||||
*/
|
||||
rdc_status_t set_rocmtools_path();
|
||||
};
|
||||
|
||||
using RdcRocpLibPtr = std::shared_ptr<RdcRocpLib>;
|
||||
|
||||
@@ -22,7 +22,11 @@ THE SOFTWARE.
|
||||
#include "rdc_lib/impl/RdcRocpLib.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <string>
|
||||
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
@@ -42,6 +46,12 @@ RdcRocpLib::RdcRocpLib(const char* lib_name)
|
||||
return;
|
||||
}
|
||||
|
||||
status = set_rocmtools_path();
|
||||
if (status != RDC_ST_OK) {
|
||||
RDC_LOG(RDC_ERROR, "Rocp related function will not work.");
|
||||
return;
|
||||
}
|
||||
|
||||
status = lib_loader_.load_symbol(
|
||||
&telemetry_fields_query_, "rdc_telemetry_fields_query");
|
||||
if (status != RDC_ST_OK) {
|
||||
@@ -76,6 +86,9 @@ rdc_status_t RdcRocpLib::rdc_telemetry_fields_query(
|
||||
if (field_count == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
if (telemetry_fields_query_ == nullptr) {
|
||||
return RDC_ST_FAIL_LOAD_MODULE;
|
||||
}
|
||||
|
||||
return telemetry_fields_query_(field_ids, field_count);
|
||||
}
|
||||
@@ -89,6 +102,9 @@ rdc_status_t RdcRocpLib::rdc_telemetry_fields_value_get(
|
||||
if (fields == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
if (telemetry_fields_value_get_ == nullptr) {
|
||||
return RDC_ST_FAIL_LOAD_MODULE;
|
||||
}
|
||||
|
||||
RDC_LOG(RDC_DEBUG, "Fetch " << fields_count << " fields from rocp_lib.");
|
||||
|
||||
@@ -102,6 +118,10 @@ rdc_status_t RdcRocpLib::rdc_telemetry_fields_watch(
|
||||
if (fields == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
if (telemetry_fields_watch_ == nullptr) {
|
||||
return RDC_ST_FAIL_LOAD_MODULE;
|
||||
}
|
||||
|
||||
return telemetry_fields_watch_(fields, fields_count);
|
||||
}
|
||||
|
||||
@@ -111,8 +131,80 @@ rdc_status_t RdcRocpLib::rdc_telemetry_fields_unwatch(
|
||||
if (fields == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
if (telemetry_fields_unwatch_ == nullptr) {
|
||||
return RDC_ST_FAIL_LOAD_MODULE;
|
||||
}
|
||||
|
||||
return telemetry_fields_unwatch_(fields, fields_count);
|
||||
}
|
||||
|
||||
/**
 * @brief Determine the ROCm installation prefix.
 *
 * Lookup order:
 *   1. The directory of the librocmtools.so mapping found in
 *      /proc/self/maps, with ".." appended (e.g. "/opt/rocm/lib/..").
 *   2. The ROCM_PATH environment variable, if set.
 *   3. The hard-coded default "/opt/rocm".
 *
 * @return Path to the ROCm prefix. The path is not canonicalized and is not
 *         checked for existence.
 */
std::string RdcRocpLib::get_rocm_path() {
  // fallback used when the library cannot be located in the process maps
  std::string rocm_path("/opt/rocm");
  const char* rocm_path_env = std::getenv("ROCM_PATH");
  if (rocm_path_env != nullptr) {
    rocm_path = rocm_path_env;
  }

  std::ifstream maps_file("/proc/self/maps");
  if (!maps_file.is_open()) {
    return rocm_path;
  }

  std::string line;
  while (std::getline(maps_file, line)) {
    const size_t lib_pos = line.find("librocmtools.so");
    if (lib_pos == std::string::npos) {
      // no library on this line
      continue;
    }

    // The pathname is the last column of a maps entry. Per proc(5) the
    // preceding columns (address range, perms, offset, dev, inode) do not
    // contain '/', so the first '/' marks the start of the path. Searching
    // backwards for a space instead would truncate prefixes that contain
    // spaces.
    const size_t path_start = line.find('/');
    if (path_start == std::string::npos || path_start >= lib_pos) {
      // no directory component in front of the library name; a bare ".."
      // would be a meaningless relative path, so keep looking
      continue;
    }

    // keep the directory (including its trailing '/'), drop the library
    // name, then step one level up: "/opt/rocm/lib/" -> "/opt/rocm/lib/.."
    return line.substr(path_start, lib_pos - path_start) + "..";
  }

  return rocm_path;
}
|
||||
|
||||
/**
 * @brief Make sure ROCMTOOLS_METRICS_PATH is exported and readable.
 *
 * Builds a default counters-file location under the prefix returned by
 * get_rocm_path() and exports it without overwriting a value that is
 * already present in the environment. The value that ends up in the
 * environment is then opened once to confirm it can be read.
 *
 * @return RDC_ST_OK on success,
 *         RDC_ST_PERM_ERROR if setenv() fails,
 *         RDC_ST_NO_DATA if the variable is still unset afterwards,
 *         RDC_ST_FILE_ERROR if the referenced file cannot be opened.
 */
rdc_status_t RdcRocpLib::set_rocmtools_path() {
  const char* const env_name = "ROCMTOOLS_METRICS_PATH";

  // default location of the derived counters file below the ROCm prefix
  std::string default_metrics_file = get_rocm_path();
  default_metrics_file.append(
      "/libexec/rocmtools/counters/derived_counters.xml");

  // overwrite flag is 0: a value the user already exported is kept as-is
  const int setenv_rc = setenv(env_name, default_metrics_file.c_str(), 0);
  if (setenv_rc != 0) {
    RDC_LOG(RDC_ERROR, "setenv ROCMTOOLS_METRICS_PATH failed! " << setenv_rc);
    return RDC_ST_PERM_ERROR;
  }

  // read back whatever is effective now (pre-existing value or our default)
  const char* const metrics_env = getenv(env_name);
  if (metrics_env == nullptr) {
    RDC_LOG(RDC_ERROR, "ROCMTOOLS_METRICS_PATH is not set!");
    return RDC_ST_NO_DATA;
  }

  // probe the file so a bad path is reported here rather than by the library
  std::ifstream probe(metrics_env);
  if (probe.good()) {
    return RDC_ST_OK;
  }

  RDC_LOG(RDC_ERROR,
          "failed to open ROCMTOOLS_METRICS_PATH: " << metrics_env);
  return RDC_ST_FILE_ERROR;
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
@@ -18,11 +18,6 @@ Group=rdc
|
||||
|
||||
Type=simple
|
||||
|
||||
# Needed for accessing fields 700-799 provided by ROCMTools
|
||||
# At the time of writing - rdcd will crash if these metrics are accessed
|
||||
# without the following environment variable
|
||||
Environment=ROCMTOOLS_METRICS_PATH=/@CPACK_PACKAGING_INSTALL_PREFIX@/libexec/rocmtools/counters/derived_counters.xml
|
||||
|
||||
CapabilityBoundingSet=CAP_DAC_OVERRIDE
|
||||
AmbientCapabilities=CAP_DAC_OVERRIDE
|
||||
|
||||
|
||||
新增問題並參考
封鎖使用者