diff --git a/README.md b/README.md index 65c4e60493..de75a06efc 100644 --- a/README.md +++ b/README.md @@ -578,34 +578,6 @@ The RAS plugin enables monitoring and counting of ECC (Error-Correcting Code) er > >#### 🐍 dmon RocProfiler Fields Return Zeros > ->**Solution:** -> ->Set the `HSA_TOOLS_LIB` environment variable **before** running a compute job. -> ->```bash ->export HSA_TOOLS_LIB=/opt/rocm/lib/librocprofiler64.so.1 ->``` -> ->**Example:** -> ->```bash -># Terminal 1 ->rdcd -u -> -># Terminal 2 ->export HSA_TOOLS_LIB=/opt/rocm/lib/librocprofiler64.so.1 ->gpu-burn -> -># Terminal 3 ->rdci dmon -u -e 800,801 -i 0 -c 1 -> -># Output: ->GPU OCCUPANCY_PERCENT ACTIVE_WAVES ->0 001.000 32640.000 ->``` -> ->#### ⚠️ `HSA_STATUS_ERROR_OUT_OF_RESOURCES` -> >**Error Message:** > >``` diff --git a/docs/how-to/using_RDC.rst b/docs/how-to/using_RDC.rst index c4d74e30d1..ef8c283565 100644 --- a/docs/how-to/using_RDC.rst +++ b/docs/how-to/using_RDC.rst @@ -309,34 +309,6 @@ Known issues - Limited metrics on MI200. - Consumer GPUs such as RX6800 have fewer supported metrics. -- dmon RocProfiler fields return zeros - - **Solution:** - - Set the ``HSA_TOOLS_LIB`` environment variable before running a compute job. - - .. code-block:: shell - - export HSA_TOOLS_LIB=/opt/rocm/lib/librocprofiler64.so.1 - - **Example:** - - .. code-block:: shell - - # Terminal 1 - rdcd -u - - # Terminal 2 - export HSA_TOOLS_LIB=/opt/rocm/lib/librocprofiler64.so.1 - gpu-burn - - # Terminal 3 - rdci dmon -u -e 800,801 -i 0 -c 1 - - # Output: - GPU OCCUPANCY_PERCENT ACTIVE_WAVES - 0 001.000 32640.000 - - HSA_STATUS_ERROR_OUT_OF_RESOURCES **Error message:** diff --git a/include/rdc_lib/impl/RdcRocpLib.h b/include/rdc_lib/impl/RdcRocpLib.h index 8501e3fe76..ed43c3c339 100644 --- a/include/rdc_lib/impl/RdcRocpLib.h +++ b/include/rdc_lib/impl/RdcRocpLib.h @@ -60,13 +60,12 @@ class RdcRocpLib : public RdcTelemetry { rdc_status_t (*rdc_module_init_)(uint64_t); rdc_status_t (*rdc_module_destroy_)(); /** - * @brief Extract current ROCM_PATH from library or the environment + * @brief Make sure HSA_TOOLS_LIB is not set as it breaks rocprofiler-sdk + * @details + * Rocprofilerv1 needed HSA_TOOLS_LIB set to librocprofiler64.so.1. + * That breaks rocprofiler-sdk because it tries to load both v1 and sdk libraries. */ - std::string get_rocm_path(); - /** - * @brief Set ROCP_METRICS environment variable needed by rocprofiler - */ - rdc_status_t set_rocprofiler_path(); + void rdc_unset_hsa_tools_lib(); }; using RdcRocpLibPtr = std::shared_ptr; diff --git a/rdc_libs/rdc/src/RdcRocpLib.cc b/rdc_libs/rdc/src/RdcRocpLib.cc index c26e527c8d..bb25b25450 100644 --- a/rdc_libs/rdc/src/RdcRocpLib.cc +++ b/rdc_libs/rdc/src/RdcRocpLib.cc @@ -27,6 +27,7 @@ THE SOFTWARE. #include #include +#include "rdc/rdc.h" #include "rdc_lib/RdcException.h" #include "rdc_lib/RdcTelemetryLibInterface.h" @@ -41,14 +42,10 @@ RdcRocpLib::RdcRocpLib() telemetry_fields_unwatch_(nullptr), rdc_module_init_(nullptr), rdc_module_destroy_(nullptr) { - rdc_status_t status = set_rocprofiler_path(); - if (status != RDC_ST_OK) { - RDC_LOG(RDC_ERROR, "Rocp related function will not work."); - throw RdcException(RDC_ST_FAIL_LOAD_MODULE, "rocprofiler path could not be set"); - return; - } + // must happen before library is loaded + rdc_unset_hsa_tools_lib(); - status = lib_loader_.load("librdc_rocp.so"); + rdc_status_t status = lib_loader_.load("librdc_rocp.so"); if (status != RDC_ST_OK) { RDC_LOG(RDC_ERROR, "Rocp related function will not work."); return; @@ -152,68 +149,11 @@ rdc_status_t RdcRocpLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, return telemetry_fields_unwatch_(fields, fields_count); } -std::string RdcRocpLib::get_rocm_path() { - // set default rocm path in case lookup fails - std::string rocm_path(ROCM_DIR); - const char* rocm_path_env = getenv("ROCM_PATH"); - if (rocm_path_env != nullptr) { - rocm_path = rocm_path_env; +void RdcRocpLib::rdc_unset_hsa_tools_lib() { + int status = unsetenv("HSA_TOOLS_LIB"); + if (status != 0) { + RDC_LOG(RDC_ERROR, "Failed to unset HSA_TOOLS_LIB environment variable."); } - - std::ifstream file("/proc/self/maps"); - - if (!file.is_open()) { - return rocm_path; - } - - std::string line; - while (getline(file, line)) { - size_t index_end = line.find("librocprofiler-register.so"); - size_t index_start = index_end; - if (index_end == std::string::npos) { - // no library on this line - continue; - } - // walk index backwards until it reaches a space - while ((index_start > 0) && (line[index_start - 1] != ' ')) { - index_start--; - } - // extract library path, drop library name - rocm_path = line.substr(index_start, index_end - index_start); - // appending "../" should result in "/opt/rocm/lib/.." or similar - rocm_path += ".."; - return rocm_path; - } - - return rocm_path; -} - -rdc_status_t RdcRocpLib::set_rocprofiler_path() { - // rocprofiler requires ROCPROFILER_METRICS_PATH to be set - std::string rocprofiler_metrics_path = get_rocm_path() + "/share/rocprofiler-sdk/"; - - // set rocm prefix - int result = setenv("ROCPROFILER_METRICS_PATH", rocprofiler_metrics_path.c_str(), 0); - if (result != 0) { - RDC_LOG(RDC_ERROR, "setenv ROCPROFILER_METRICS_PATH failed! " << result); - return RDC_ST_PERM_ERROR; - } - - // check that env exists - const char* rocprofiler_metrics_env = getenv("ROCPROFILER_METRICS_PATH"); - if (rocprofiler_metrics_env == nullptr) { - RDC_LOG(RDC_ERROR, "ROCPROFILER_METRICS_PATH is not set!"); - return RDC_ST_NO_DATA; - } - - // check that file can be accessed - std::ifstream test_file(rocprofiler_metrics_env); - if (!test_file.good()) { - RDC_LOG(RDC_ERROR, "failed to open ROCPROFILER_METRICS_PATH: " << rocprofiler_metrics_env); - return RDC_ST_FILE_ERROR; - } - - return RDC_ST_OK; } } // namespace rdc