diff --git a/README.md b/README.md index 883d17d2e6..c1991980e1 100644 --- a/README.md +++ b/README.md @@ -113,10 +113,7 @@ For an RDC client application to monitor and/or control a remote system, the RDC When *rdcd* is started from a command-line the *capabilities* are determined by privilege of the *user* starting *rdcd* ```bash -## Note that this environment variable is automatically set in rdc.service -## If RDC_FI_PROF_* metrics are required - you MUST export ROCMTOOLS_METRICS_PATH before starting rdcd ## NOTE: Replace /opt/rocm with specific rocm version if needed -export ROCMTOOLS_METRICS_PATH=/opt/rocm/libexec/rocmtools/counters/derived_counters.xml ## To run with authentication. Ensure SSL keys are setup properly ## version will be the version number(ex:3.10.0) of ROCm where RDC was packaged with @@ -188,9 +185,8 @@ ERROR, INFO, DEBUG logging levels are supported - Reading `RDC_FI_PROF_*` crashes rdcd - All `RDC_FI_PROF_*` metrics return N/A - 1. Is `ROCMTOOLS_METRICS_PATH` set? - 2. Does your GPU support selected fields? + 1. Does your GPU support selected fields? Field 700 (`RDC_FI_PROF_ELAPSED_CYCLES`) is supposed to be accessible on most GPUs. Others are mostly intended for MI series. - 3. Set `RDC_LOG=DEBUG` as stated above - 4. Is rocmtools installed? Can you find `librocmtools.so`? + 2. Set `RDC_LOG=DEBUG` as stated above + 3. Is rocmtools installed? Can you find `librocmtools.so`? 
diff --git a/include/rdc_lib/impl/RdcRocpLib.h b/include/rdc_lib/impl/RdcRocpLib.h index df1dde48ea..f793c46069 100644 --- a/include/rdc_lib/impl/RdcRocpLib.h +++ b/include/rdc_lib/impl/RdcRocpLib.h @@ -56,8 +56,6 @@ class RdcRocpLib : public RdcTelemetry { rdc_gpu_field_t* fields, uint32_t fields_count) override; - uint64_t get_profiler_version(); - explicit RdcRocpLib(const char* lib_name); ~RdcRocpLib(); @@ -82,6 +80,17 @@ class RdcRocpLib : public RdcTelemetry { rdc_status_t (*telemetry_fields_unwatch_)( rdc_gpu_field_t* fields, uint32_t fields_count); + + /** + * @brief Extract current ROCM_PATH from library or the environment + */ + std::string get_rocm_path(); + + /** + * @brief Set ROCMTOOLS_METRICS_PATH environment variable needed by + * librocmtools + */ + rdc_status_t set_rocmtools_path(); }; using RdcRocpLibPtr = std::shared_ptr; diff --git a/rdc_libs/rdc/src/RdcRocpLib.cc b/rdc_libs/rdc/src/RdcRocpLib.cc index bc293de0b3..23b4ffe451 100644 --- a/rdc_libs/rdc/src/RdcRocpLib.cc +++ b/rdc_libs/rdc/src/RdcRocpLib.cc @@ -22,7 +22,11 @@ THE SOFTWARE. 
#include "rdc_lib/impl/RdcRocpLib.h" #include +#include +#include +#include #include +#include #include "rdc_lib/RdcLogger.h" #include "rdc_lib/rdc_common.h"
@@ -42,6 +46,12 @@ RdcRocpLib::RdcRocpLib(const char* lib_name)
     return;
   }
 
+  status = set_rocmtools_path();
+  if (status != RDC_ST_OK) {
+    RDC_LOG(RDC_ERROR, "Rocp related function will not work.");
+    return;
+  }
+
   status = lib_loader_.load_symbol(
       &telemetry_fields_query_, "rdc_telemetry_fields_query");
   if (status != RDC_ST_OK) {
@@ -76,6 +86,9 @@ rdc_status_t RdcRocpLib::rdc_telemetry_fields_query(
   if (field_count == nullptr) {
     return RDC_ST_BAD_PARAMETER;
   }
+  if (telemetry_fields_query_ == nullptr) {
+    return RDC_ST_FAIL_LOAD_MODULE;
+  }
   return telemetry_fields_query_(field_ids, field_count);
 }
 
@@ -89,6 +102,9 @@ rdc_status_t RdcRocpLib::rdc_telemetry_fields_value_get(
   if (fields == nullptr) {
     return RDC_ST_BAD_PARAMETER;
   }
+  if (telemetry_fields_value_get_ == nullptr) {
+    return RDC_ST_FAIL_LOAD_MODULE;
+  }
 
   RDC_LOG(RDC_DEBUG, "Fetch " << fields_count << " fields from rocp_lib.");
 
@@ -102,6 +118,10 @@ rdc_status_t RdcRocpLib::rdc_telemetry_fields_watch(
   if (fields == nullptr) {
     return RDC_ST_BAD_PARAMETER;
   }
+  if (telemetry_fields_watch_ == nullptr) {
+    return RDC_ST_FAIL_LOAD_MODULE;
+  }
+
   return telemetry_fields_watch_(fields, fields_count);
 }
 
@@ -111,8 +131,88 @@ rdc_status_t RdcRocpLib::rdc_telemetry_fields_unwatch(
   if (fields == nullptr) {
     return RDC_ST_BAD_PARAMETER;
   }
+  if (telemetry_fields_unwatch_ == nullptr) {
+    return RDC_ST_FAIL_LOAD_MODULE;
+  }
+
   return telemetry_fields_unwatch_(fields, fields_count);
 }
 
+// Locate the ROCm installation root. Preference order:
+//   1. the directory librocmtools.so was actually loaded from (read from
+//      /proc/self/maps), with ".." appended, e.g. "/opt/rocm/lib/..";
+//   2. the ROCM_PATH environment variable, when set and non-empty;
+//   3. "/opt/rocm".
+std::string RdcRocpLib::get_rocm_path() {
+  // fallback used when the library cannot be found among the mappings
+  std::string rocm_path("/opt/rocm");
+  const char* rocm_path_env = getenv("ROCM_PATH");
+  if (rocm_path_env != nullptr && rocm_path_env[0] != '\0') {
+    rocm_path = rocm_path_env;
+  }
+
+  std::ifstream maps("/proc/self/maps");
+  if (!maps.is_open()) {
+    return rocm_path;
+  }
+
+  const std::string lib_name("librocmtools.so");
+  std::string line;
+  while (std::getline(maps, line)) {
+    // The pathname is the last field of a maps entry and the only one that
+    // can contain '/', so the first '/' is where it starts. Searching
+    // backwards for a space instead would truncate directories that
+    // contain spaces.
+    const size_t path_start = line.find('/');
+    if (path_start == std::string::npos) {
+      continue;
+    }
+    // Compare against the file name only, so that a directory which merely
+    // contains "librocmtools.so" in its name is not mistaken for the library.
+    const size_t name_start = line.rfind('/') + 1;
+    if (line.compare(name_start, lib_name.size(), lib_name) != 0) {
+      continue;
+    }
+    // keep the directory with its trailing '/', drop the file name
+    return line.substr(path_start, name_start - path_start) + "..";
+  }
+
+  return rocm_path;
+}
+
+// Make sure ROCMTOOLS_METRICS_PATH is set and points at a readable file.
+// librocmtools requires this variable; a value already present in the
+// environment is kept (setenv is called with overwrite == 0).
+rdc_status_t RdcRocpLib::set_rocmtools_path() {
+  const std::string rocmtools_metrics_path =
+      get_rocm_path() + "/libexec/rocmtools/counters/derived_counters.xml";
+
+  // only provides a default: never replaces a user-supplied value
+  const int result =
+      setenv("ROCMTOOLS_METRICS_PATH", rocmtools_metrics_path.c_str(), 0);
+  if (result != 0) {
+    RDC_LOG(RDC_ERROR, "setenv ROCMTOOLS_METRICS_PATH failed! " << result);
+    return RDC_ST_PERM_ERROR;
+  }
+
+  // read the effective value back, it may differ from the default above
+  const char* rocmtools_metrics_env = getenv("ROCMTOOLS_METRICS_PATH");
+  if (rocmtools_metrics_env == nullptr) {
+    RDC_LOG(RDC_ERROR, "ROCMTOOLS_METRICS_PATH is not set!");
+    return RDC_ST_NO_DATA;
+  }
+
+  // check that file can be accessed
+  std::ifstream test_file(rocmtools_metrics_env);
+  if (!test_file.good()) {
+    RDC_LOG(
+        RDC_ERROR,
+        "failed to open ROCMTOOLS_METRICS_PATH: " << rocmtools_metrics_env);
+    return RDC_ST_FILE_ERROR;
+  }
+
+  return RDC_ST_OK;
+}
+
 }  // namespace rdc
 }  // namespace amd
diff --git a/server/rdc.service.in b/server/rdc.service.in index bd031de4a2..5ae7666903 100755 --- a/server/rdc.service.in +++ b/server/rdc.service.in @@ -18,11 +18,6 @@ Group=rdc Type=simple -# Needed for accessing fields 700-799 provided by ROCMTools -# At the time of writing - rdcd will crash if these metrics are accessed -# without the following environment variable
-Environment=ROCMTOOLS_METRICS_PATH=/@CPACK_PACKAGING_INSTALL_PREFIX@/libexec/rocmtools/counters/derived_counters.xml - CapabilityBoundingSet=CAP_DAC_OVERRIDE AmbientCapabilities=CAP_DAC_OVERRIDE