From 99bc3fb5024c5574e57248cba2c036cd49ef5894 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Mon, 1 May 2023 19:08:47 -0500 Subject: [PATCH] [SWDEV-398070] Adding logging to ROCm SMI (by default off) Updates: * [rocm-smi] Provide a thread-safe logging feature * [rocm-smi] Adding logrotation into install/upgrade/remove scripts * [rocm-smi] Updated cmake lists to include rocm_smi_logger * [rocm-smi] Updated DEB/RPM install/remove logging file & folder with all users having r/w privileges for /var/log/rocm_smi_lib/ROCm-SMI-lib.log * [rocm-smi] Added ability to do a glob search for multiple files (globFilesExist), which assists with file searches using * wildcard strings * [rocm-smi] Added ability to log system details when RSMI_LOGGING is turned on (getSystemDetails()) * [rocm-smi] Added logging to report which ROCm API is being called when RSMI_LOGGING is on * [rocm-smi] Added logging to provide SYSFS path and read value, when RSMI_LOGGING is on. Provides error response on failure. * [rocm-smi] Added logging to provide SYSFS path and read value, when RSMI_LOGGING is on. Provides error response on failure. * [rocm-smi] Added environment variable RSMI_LOGGING to control when logging is enabled or disabled. By default (when this env. variable is not set), logging is turned off. When setting RSMI_LOGGING=, logging is enabled and the output is written to the /var/log/rocm_smi_lib/ROCm-SMI-lib.log file. Setting RSMI_LOGGING is allowed in both debug and release builds. * [rocm-smi] Removed an initialize procedure which keeps debug_inf_loop. It seems this feature is not being used. 
Change-Id: I79b48387609c6233c6f05b04fb8bba66b68c2399 Signed-off-by: Charis Poag [ROCm/rocm_smi_lib commit: c3a095a180ea48f5bb8f815a5a2716918fbe308d] --- projects/rocm-smi-lib/CMakeLists.txt | 2 + projects/rocm-smi-lib/DEBIAN/postinst.in | 97 +++- projects/rocm-smi-lib/DEBIAN/prerm.in | 20 + projects/rocm-smi-lib/RPM/post.in | 102 ++++ projects/rocm-smi-lib/RPM/postun.in | 2 + projects/rocm-smi-lib/RPM/preun.in | 23 + .../include/rocm_smi/rocm_smi_common.h | 4 + .../include/rocm_smi/rocm_smi_logger.h | 224 ++++++++ .../include/rocm_smi/rocm_smi_main.h | 1 + .../include/rocm_smi/rocm_smi_utils.h | 5 + projects/rocm-smi-lib/src/rocm_smi.cc | 284 +++++++++- projects/rocm-smi-lib/src/rocm_smi_device.cc | 101 +++- projects/rocm-smi-lib/src/rocm_smi_logger.cc | 487 ++++++++++++++++++ projects/rocm-smi-lib/src/rocm_smi_main.cc | 38 +- projects/rocm-smi-lib/src/rocm_smi_monitor.cc | 6 +- projects/rocm-smi-lib/src/rocm_smi_utils.cc | 208 +++++++- 16 files changed, 1591 insertions(+), 13 deletions(-) create mode 100644 projects/rocm-smi-lib/include/rocm_smi/rocm_smi_logger.h create mode 100644 projects/rocm-smi-lib/src/rocm_smi_logger.cc diff --git a/projects/rocm-smi-lib/CMakeLists.txt b/projects/rocm-smi-lib/CMakeLists.txt index 82306ad3b4..60e7f3ccf6 100755 --- a/projects/rocm-smi-lib/CMakeLists.txt +++ b/projects/rocm-smi-lib/CMakeLists.txt @@ -128,6 +128,7 @@ set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_kfd.cc") set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_io_link.cc") set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_gpu_metrics.cc") set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi.cc") +set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_logger.cc") set(CMN_SRC_LIST ${CMN_SRC_LIST} "${SHR_MUTEX_DIR}/shared_mutex.cc") set(CMN_INC_LIST "${COMMON_INC_DIR}/rocm_smi_device.h") @@ -141,6 +142,7 @@ set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_counters.h") set(CMN_INC_LIST ${CMN_INC_LIST} 
"${COMMON_INC_DIR}/rocm_smi_kfd.h") set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_io_link.h") set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi.h") +set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_logger.h") set(CMN_INC_LIST ${CMN_INC_LIST} "${SHR_MUTEX_DIR}/shared_mutex.h") ## set components diff --git a/projects/rocm-smi-lib/DEBIAN/postinst.in b/projects/rocm-smi-lib/DEBIAN/postinst.in index d916aa7b85..c3176bb6c6 100755 --- a/projects/rocm-smi-lib/DEBIAN/postinst.in +++ b/projects/rocm-smi-lib/DEBIAN/postinst.in @@ -1,6 +1,99 @@ #!/bin/bash -set -e +#set -x + +do_addLogFolder() { + sudo mkdir -p /var/log/rocm_smi_lib + sudo touch /var/log/rocm_smi_lib/ROCm-SMI-lib.log + sudo chmod -R a+rw /var/log/rocm_smi_lib + sudo chmod a+rw /var/log/rocm_smi_lib/ROCm-SMI-lib.log +} + +do_configureLogrotate() { + logrotate --version &>/dev/null + if [ $? -ne 0 ]; then + echo "[WARNING] Detected logrotate is not installed."\ + "ROCm-smi logs (when turned on) will not rotate properly." + return + fi + + if [ ! -f /etc/logrotate.d/rocm_smi.conf ]; then + sudo touch /etc/logrotate.d/rocm_smi.conf + sudo chmod 644 /etc/logrotate.d/rocm_smi.conf # root r/w, all others read + # ROCm SMI logging rotation, rotates files using root user/group + # Hourly logrotation check + # Only rotates if size grew larger than 1MB + # Max of 4 rotation files, oldest will be removed + # Rotated files use date extention of ex. ROCm-SMI-lib.log.2023-05-09_16:51:42 + cat <<'EOF' | sudo tee /etc/logrotate.d/rocm_smi.conf >/dev/null +/var/log/rocm_smi_lib/ROCm-SMI-lib.log { + su root root + hourly + missingok + notifempty + rotate 4 + size 1M + copytruncate + dateext + dateformat .%Y-%m-%d_%H:%M:%S +} +EOF + # workaround: remove extra 'OURCE' text + # from rocm_smi.conf. 
Unsure if CMAKE, + # bash, or here document + # issue (only seen on RHEL 8.7) + sudo sed -i s/OURCE//g /etc/logrotate.d/rocm_smi.conf + fi + # check if logrotate uses system timers, Ubuntu/modern OS's do + # Several older OS's like RHEL 8.7, do not. Instead defaults + # to use daily cron jobs - see https://stackoverflow.com/a/69465677 + sudo systemctl list-timers|grep -iq logrotate + if [ $? -ne 0 ]; then + # confirm logrotate file exists in daily + if [ -f /etc/cron.daily/logrotate ]; then + # move logrotate daily to hourly + if [ -f /etc/cron.hourly/logrotate ]; then + sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate + else + echo "[WARNING] Could find and configure hourly cron for ROCm-smi's"\ + " logrotate. ROCm-smi logs (when turned on) will not rotate properly." + return + fi + else + # confirm that it's already been moved to hourly + sudo find /etc/cron.* -iname logrotate -print -quit |grep -iq hourly + if [ $? -ne 0 ]; then + echo "[WARNING] Could not configure an hourly cron for ROCm-smi's logrotate."\ + "ROCm-smi logs (when turned on) may not rotate properly." + fi + fi + else + # Configure systemd timers - the typical setup for modern Linux logrotation setups + if [ -f /lib/systemd/system/logrotate.timer ]; then + if [ ! -f /lib/systemd/system/logrotate.timer.backup ]; then + sudo cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup + fi + cat <<'EOF' | sudo tee /lib/systemd/system/logrotate.timer >/dev/null +[Unit] +Description=Hourly rotation of log files +Documentation=man:logrotate(8) man:logrotate.conf(5) + +[Timer] +OnCalendar= +OnCalendar=hourly +AccuracySec=1m +Persistent=true + +[Install] +WantedBy=timers.target +EOF + sudo systemctl reenable --now logrotate.timer + else + echo "[WARNING] Could not configure systemd timer for ROCm's logrotate."\ + "ROCm-smi logs (when turned on) will not rotate properly." 
+ fi + fi +} do_ldconfig() { # left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build @@ -13,6 +106,8 @@ do_ldconfig() { case "$1" in ( configure ) do_ldconfig + do_addLogFolder + do_configureLogrotate ;; ( abort-upgrade | abort-remove | abort-deconfigure ) echo "$1" diff --git a/projects/rocm-smi-lib/DEBIAN/prerm.in b/projects/rocm-smi-lib/DEBIAN/prerm.in index bc9cc2ef33..7c23e8e990 100755 --- a/projects/rocm-smi-lib/DEBIAN/prerm.in +++ b/projects/rocm-smi-lib/DEBIAN/prerm.in @@ -2,6 +2,24 @@ set -e +rm_logFolder() { + sudo rm -rf /var/log/rocm_smi_lib +} + +return_logrotateToOrigConfig() { + if [ -f /etc/logrotate.d/rocm_smi.conf ]; then + sudo rm -rf /etc/logrotate.d/rocm_smi.conf + fi + if [ -f /etc/cron.hourly/logrotate ]; then + sudo mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate + fi + if [ -f /lib/systemd/system/logrotate.timer.backup ]; then + sudo cp /lib/systemd/system/logrotate.timer.backup /lib/systemd/system/logrotate.timer + sudo rm -rf /lib/systemd/system/logrotate.timer.backup + sudo systemctl reenable --now logrotate.timer + fi +} + rm_ldconfig() { # left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build if [ "@ENABLE_LDCONFIG@" == "ON" ]; then @@ -19,6 +37,8 @@ case "$1" in ( remove | upgrade) rm_ldconfig rm_pyc + rm_logFolder + return_logrotateToOrigConfig ;; ( purge ) ;; diff --git a/projects/rocm-smi-lib/RPM/post.in b/projects/rocm-smi-lib/RPM/post.in index a7f518b81d..d1992de5ba 100755 --- a/projects/rocm-smi-lib/RPM/post.in +++ b/projects/rocm-smi-lib/RPM/post.in @@ -1,5 +1,107 @@ +#!/bin/bash +#set -x + +do_addLogFolder() { + sudo mkdir -p /var/log/rocm_smi_lib + sudo touch /var/log/rocm_smi_lib/ROCm-SMI-lib.log + sudo chmod -R a+rw /var/log/rocm_smi_lib + sudo chmod a+rw /var/log/rocm_smi_lib/ROCm-SMI-lib.log +} + +do_configureLogrotate() { + logrotate --version &>/dev/null + if [ $? 
-ne 0 ]; then + echo "[WARNING] Detected logrotate is not installed."\ + "ROCm-smi logs (when turned on) will not rotate properly." + return + fi + + if [ ! -f /etc/logrotate.d/rocm_smi.conf ]; then + sudo touch /etc/logrotate.d/rocm_smi.conf + sudo chmod 644 /etc/logrotate.d/rocm_smi.conf # root r/w, all others read + # ROCm SMI logging rotation, rotates files using root user/group + # Hourly logrotation check + # Only rotates if size grew larger than 1MB + # Max of 4 rotation files, oldest will be removed + # Rotated files use date extention of ex. ROCm-SMI-lib.log.2023-05-09_16:51:42 + cat <<'EOF' | sudo tee /etc/logrotate.d/rocm_smi.conf >/dev/null +/var/log/rocm_smi_lib/ROCm-SMI-lib.log { + su root root + hourly + missingok + notifempty + rotate 4 + size 1M + copytruncate + dateext + dateformat .%Y-%m-%d_%H:%M:%S +} +EOF + # workaround: remove extra 'OURCE' text + # from rocm_smi.conf. Unsure if CMAKE, + # bash, or here document + # issue (only seen on RHEL 8.7) + sudo sed -i s/OURCE//g /etc/logrotate.d/rocm_smi.conf + fi + # check if logrotate uses system timers, Ubuntu/modern OS's do + # Several older OS's like RHEL 8.7, do not. Instead defaults + # to use daily cron jobs - see https://stackoverflow.com/a/69465677 + sudo systemctl list-timers|grep -iq logrotate + if [ $? -ne 0 ]; then + # confirm logrotate file exists in daily + if [ -f /etc/cron.daily/logrotate ]; then + # move logrotate daily to hourly + if [ -f /etc/cron.hourly/logrotate ]; then + sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate + else + echo "[WARNING] Could find and configure hourly cron for ROCm-smi's"\ + " logrotate. ROCm-smi logs (when turned on) will not rotate properly." + return + fi + else + # confirm that it's already been moved to hourly + sudo find /etc/cron.* -iname logrotate -print -quit |grep -iq hourly + if [ $? 
-ne 0 ]; then + echo "[WARNING] Could not configure an hourly cron for ROCm-smi's logrotate."\ + "ROCm-smi logs (when turned on) may not rotate properly." + fi + fi + else + # Configure systemd timers - the typical setup for modern Linux logrotation setups + if [ -f /lib/systemd/system/logrotate.timer ]; then + if [ ! -f /lib/systemd/system/logrotate.timer.backup ]; then + sudo cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup + fi + cat <<'EOF' | sudo tee /lib/systemd/system/logrotate.timer >/dev/null +[Unit] +Description=Hourly rotation of log files +Documentation=man:logrotate(8) man:logrotate.conf(5) + +[Timer] +OnCalendar= +OnCalendar=hourly +AccuracySec=1m +Persistent=true + +[Install] +WantedBy=timers.target +EOF + sudo systemctl reenable --now logrotate.timer + else + echo "[WARNING] Could not configure systemd timer for ROCm's logrotate."\ + "ROCm-smi logs (when turned on) will not rotate properly." + fi + fi +} + # left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build if [ "@ENABLE_LDCONFIG@" == "ON" ]; then echo -e "@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@" > /etc/ld.so.conf.d/x86_64-librocm_smi_lib.conf ldconfig fi + +# post install or upgrade, $i is 1 or 2 -> do these actions +if [ $1 -ge 1 ]; then + do_addLogFolder + do_configureLogrotate +fi \ No newline at end of file diff --git a/projects/rocm-smi-lib/RPM/postun.in b/projects/rocm-smi-lib/RPM/postun.in index d54cd9b2ea..0dd41d82d8 100755 --- a/projects/rocm-smi-lib/RPM/postun.in +++ b/projects/rocm-smi-lib/RPM/postun.in @@ -1,3 +1,5 @@ +#!/bin/bash + # second term originates from ENABLE_LDCONFIG = ON/OFF at package build if [ $1 -le 1 ] && [ "@ENABLE_LDCONFIG@" == "ON" ]; then # perform the below actions for rpm remove($1=0) or upgrade($1=1) operations diff --git a/projects/rocm-smi-lib/RPM/preun.in b/projects/rocm-smi-lib/RPM/preun.in index 33acd7f401..612504c3f2 100755 --- a/projects/rocm-smi-lib/RPM/preun.in +++ 
b/projects/rocm-smi-lib/RPM/preun.in @@ -1,5 +1,28 @@ +#!/bin/bash +#set -x + +rm_logFolder() { + sudo rm -rf /var/log/rocm_smi_lib +} + +return_logrotateToOrigConfig() { + if [ -f /etc/logrotate.d/rocm_smi.conf ]; then + sudo rm -rf /etc/logrotate.d/rocm_smi.conf + fi + if [ -f /etc/cron.hourly/logrotate ]; then + sudo mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate + fi + if [ -f /lib/systemd/system/logrotate.timer.backup ]; then + sudo cp /lib/systemd/system/logrotate.timer.backup /lib/systemd/system/logrotate.timer + sudo rm -rf /lib/systemd/system/logrotate.timer.backup + sudo systemctl reenable --now logrotate.timer + fi +} + if [ $1 -le 1 ]; then # perform the below actions for rpm remove($1=0) or upgrade($1=1) operations # remove pyc file generated by python rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/rocm_smi/__pycache__ + rm_logFolder + return_logrotateToOrigConfig fi diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_common.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_common.h index fd124d3530..bff8c8edc5 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_common.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_common.h @@ -170,6 +170,10 @@ struct RocmSMI_env_vars { // comma delimited values. std::unordered_set enum_overrides; + // If RSMI_LOGGING is set, enables logging. + // Otherwise unset values, signify logging is turned off. + uint32_t logging_on; + // Sysfs path overrides // Env. var. 
RSMI_DEBUG_DRM_ROOT_OVERRIDE diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_logger.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_logger.h new file mode 100644 index 0000000000..5997a6621d --- /dev/null +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_logger.h @@ -0,0 +1,224 @@ +/* + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + * Detail Description: + * Implements a complete logging mechanism supporting multiple logging types, + * such as file-based logging, console-based logging, etc. It also supports + * different log types. + * + * Thread-safe logging mechanism. Compatible with G++ (Linux platform) + * + * Supported Log Type: ERROR, ALARM, ALWAYS, INFO, BUFFER, TRACE, DEBUG + * No control for ERROR, ALARM and ALWAYS messages. These types of messages + * should always be captured -- IF logging is enabled. + * + * WARNING: Logging is controlled by the user's environment variable - RSMI_LOGGING. + * Enabling RSMI_LOGGING, by export RSMI_LOGGING=. No logs will + * be printed, unless RSMI_LOGGING is enabled. + * + * BUFFER log type should be used while logging raw buffers or raw messages. + * Provides a direct interface as well as a C++ Singleton interface. Use + * whichever interface fits your needs. 
+ */ + +#ifndef _ROCM_SMI_LOGGER_H_ +#define _ROCM_SMI_LOGGER_H_ + +// C++ Header File(s) +#include +#include +#include +#include +#include + +// POSIX Socket Header File(s) +#include + +// Code Specific Header Files(s) + + +namespace ROCmLogging { +// Direct Interface for logging into log file or console using MACRO(s) +#define LOG_ERROR(x) (ROCmLogging::Logger::getInstance()->error(x)) +#define LOG_ALARM(x) (ROCmLogging::Logger::getInstance()->alarm(x)) +#define LOG_ALWAYS(x) (ROCmLogging::Logger::getInstance()->always(x)) +#define LOG_INFO(x) (ROCmLogging::Logger::getInstance()->info(x)) +#define LOG_BUFFER(x) (ROCmLogging::Logger::getInstance()->buffer(x)) +#define LOG_TRACE(x) (ROCmLogging::Logger::getInstance()->trace(x)) +#define LOG_DEBUG(x) (ROCmLogging::Logger::getInstance()->debug(x)) + +// enum for LOG_LEVEL +typedef enum LOG_LEVEL { + DISABLE_LOG = 1, + LOG_LEVEL_INFO = 2, + LOG_LEVEL_BUFFER = 3, + LOG_LEVEL_TRACE = 4, + LOG_LEVEL_DEBUG = 5, + ENABLE_LOG = 6, +} LogLevel; + +// enum for LOG_TYPE +typedef enum LOG_TYPE { + NO_LOG = 1, + CONSOLE = 2, + FILE_LOG = 3, +} LogType; + +class Logger { + public: + static Logger* getInstance() throw(); + + Logger& operator<<(std::string &s) { + switch (this->m_LogLevel) { + case DISABLE_LOG: + break; + case LOG_LEVEL_INFO: + info(s); + break; + case LOG_LEVEL_BUFFER: + buffer(s); + break; + case LOG_LEVEL_TRACE: + trace(s); + break; + case LOG_LEVEL_DEBUG: + debug(s); + break; + case ENABLE_LOG: + always(s); + break; + default: + break; + } + return *getInstance(); + }; + + Logger &operator<<(const char* s) { + return operator<<(std::string(s)); + }; + + template Logger &operator<<(const T &v) { + std::ostringstream s; + s << v; + std::string str = s.str(); + return operator<<(str); + }; + + // Interface for Error Log + void error(const char* text) throw(); + void error(std::string& text) throw(); + void error(std::ostringstream& stream) throw(); + + // Interface for Alarm Log + void alarm(const char* text) 
throw(); + void alarm(std::string& text) throw(); + void alarm(std::ostringstream& stream) throw(); + + // Interface for Always Log + void always(const char* text) throw(); + void always(std::string& text) throw(); + void always(std::ostringstream& stream) throw(); + + // Interface for Buffer Log + void buffer(const char* text) throw(); + void buffer(std::string& text) throw(); + void buffer(std::ostringstream& stream) throw(); + + // Interface for Info Log + void info(const char* text) throw(); + void info(std::string& text) throw(); + void info(std::ostringstream& stream) throw(); + + // Interface for Trace log + void trace(const char* text) throw(); + void trace(std::string& text) throw(); + void trace(std::ostringstream& stream) throw(); + + // Interface for Debug log + void debug(const char* text) throw(); + void debug(std::string& text) throw(); + void debug(std::ostringstream& stream) throw(); + + // Error and Alarm log must be always enable + // Hence, there is no interfce to control error and alarm logs + + // Interfaces to control log levels + void updateLogLevel(LogLevel logLevel); + void enableAllLogLevels(); // Enable all log levels + void disableLog(); // Disable all log levels, except error and alarm + + // Interfaces to control log Types + void updateLogType(LogType logType); + void enableConsoleLogging(); + void enableFileLogging(); + std::string getLogSettings(); + bool isLoggerEnabled(); + + protected: + Logger(); + ~Logger(); + + // Wrapper function for lock/unlock + // For Extensible feature, lock and unlock should be in protected + void lock(); + void unlock(); + + std::string getCurrentTime(); + + private: + static Logger* m_Instance; + std::ofstream m_File; + bool m_loggingIsOn = false; + LogLevel m_LogLevel; + LogType m_LogType; + std::mutex m_Mutex; + std::unique_lock m_Lock{m_Mutex, std::defer_lock}; + + void logIntoFile(std::string& data); + void logOnConsole(std::string& data); + void operator=(const Logger& obj) {} + void 
initialize_resources(); + void destroy_resources(); +}; + +} // namespace ROCmLogging + +#endif // End of _ROCM_SMI_LOGGER_H_ diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h index 74aad3668b..a64adddcc5 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h @@ -114,6 +114,7 @@ class RocmSMI { int get_node_index(uint32_t dv_ind, uint32_t *node_ind); const RocmSMI_env_vars& getEnv(void); void printEnvVarInfo(void); + bool isLoggingOn(void); static const std::map devInfoTypesStrings; private: diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h index f2f8d6ab8f..087ca5faa9 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h @@ -68,6 +68,7 @@ namespace smi { pthread_mutex_t *GetMutex(uint32_t dv_ind); int SameFile(const std::string fileA, const std::string fileB); bool FileExists(char const *filename); +std::vector globFilesExist(const std::string& filePattern); int isRegularFile(std::string fname, bool *is_reg); int ReadSysfsStr(std::string path, std::string *retStr); int WriteSysfsStr(std::string path, std::string val); @@ -91,6 +92,10 @@ rsmi_status_t GetDevBinaryBlob(amd::smi::DevInfoTypes type, uint32_t dv_ind, std::size_t b_size, void* p_binary_data); rsmi_status_t ErrnoToRsmiStatus(int err); +std::string getRSMIStatusString(rsmi_status_t ret); +std::tuple getSystemDetails(void); +void logSystemDetails(void); struct pthread_wrap { public: diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index c699701ddf..0eea621c88 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -73,8 +74,10 @@ #include 
"rocm_smi/rocm_smi_counters.h" #include "rocm_smi/rocm_smi_kfd.h" #include "rocm_smi/rocm_smi_io_link.h" - #include "rocm_smi/rocm_smi64Config.h" +#include "rocm_smi/rocm_smi_logger.h" + +using namespace ROCmLogging; static const uint32_t kMaxOverdriveLevel = 20; static const float kEnergyCounterResolution = 15.3f; @@ -521,6 +524,9 @@ rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind, rsmi_status_t ret; std::string feature_line; std::string tmp_str; + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(enabled_blks) @@ -528,6 +534,10 @@ rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind, ret = get_dev_value_line(amd::smi::kDevErrCntFeatures, dv_ind, &feature_line); if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", returning get_dev_value_line() response = " + << amd::smi::getRSMIStatusString(ret); + LOG_ERROR(ss); return ret; } @@ -543,6 +553,11 @@ rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind, *enabled_blks = strtoul(tmp_str.c_str(), nullptr, 16); assert(errno == 0); + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", returning strtoul() response = " + << amd::smi::getRSMIStatusString(amd::smi::ErrnoToRsmiStatus(errno)); + LOG_TRACE(ss); + return amd::smi::ErrnoToRsmiStatus(errno); CATCH } @@ -564,10 +579,17 @@ static_assert(RSMI_RAS_ERR_STATE_LAST == RSMI_RAS_ERR_STATE_ENABLED, rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_ras_err_state_t *state) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(state) if (!is_power_of_2(block)) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", ret was not power of 2 " + << "-> reporting RSMI_STATUS_INVALID_ARGS"; + LOG_ERROR(ss); return RSMI_STATUS_INVALID_ARGS; } rsmi_status_t ret; @@ -578,15 +600,26 @@ rsmi_status_t rsmi_dev_ecc_status_get(uint32_t 
dv_ind, rsmi_gpu_block_t block, ret = rsmi_dev_ecc_enabled_get(dv_ind, &features_mask); if (ret == RSMI_STATUS_FILE_ERROR) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", rsmi_dev_ecc_enabled_get() ret was RSMI_STATUS_FILE_ERROR " + << "-> reporting RSMI_STATUS_NOT_SUPPORTED"; + LOG_ERROR(ss); return RSMI_STATUS_NOT_SUPPORTED; } if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", returning rsmi_dev_ecc_enabled_get() response = " + << amd::smi::getRSMIStatusString(ret); + LOG_ERROR(ss); return ret; } *state = (features_mask & block) ? RSMI_RAS_ERR_STATE_ENABLED : RSMI_RAS_ERR_STATE_DISABLED; + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", reporting RSMI_STATUS_SUCCESS"; + LOG_TRACE(ss); return RSMI_STATUS_SUCCESS; CATCH } @@ -596,8 +629,11 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_error_count_t *ec) { std::vector val_vec; rsmi_status_t ret; + std::ostringstream ss; TRY + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_VAR(ec, block) @@ -632,6 +668,10 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, break; default: + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", default case -> reporting RSMI_STATUS_NOT_SUPPORTED" + << amd::smi::getRSMIStatusString(ret); + LOG_ERROR(ss); return RSMI_STATUS_NOT_SUPPORTED; } @@ -640,9 +680,17 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, ret = GetDevValueVec(type, dv_ind, &val_vec); if (ret == RSMI_STATUS_FILE_ERROR) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", GetDevValueVec() ret was RSMI_STATUS_FILE_ERROR " + << "-> reporting RSMI_STATUS_NOT_SUPPORTED"; + LOG_ERROR(ss); return RSMI_STATUS_NOT_SUPPORTED; } if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", GetDevValueVec() ret was not RSMI_STATUS_SUCCESS" + << " -> reporting " << amd::smi::getRSMIStatusString(ret); + 
LOG_ERROR(ss); return ret; } @@ -661,6 +709,9 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, assert(junk == "ce:"); fs2 >> ec->correctable_err; + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", reporting " << amd::smi::getRSMIStatusString(ret);; + LOG_TRACE(ss); return ret; CATCH } @@ -668,6 +719,9 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_status_t rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); GET_DEV_AND_KFDNODE_FROM_INDX CHK_API_SUPPORT_ONLY(bdfid, RSMI_DEFAULT_VARIANT, RSMI_DEFAULT_VARIANT) @@ -690,6 +744,9 @@ rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid) { (*bdfid) &= 0xFFFF; // Clear out the old 16 bit domain *bdfid |= (domain & 0xFFFFFFFF) << 32; + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", reporting RSMI_STATUS_SUCCESS"; + LOG_TRACE(ss); return RSMI_STATUS_SUCCESS; CATCH } @@ -746,32 +803,58 @@ get_id(uint32_t dv_ind, amd::smi::DevInfoTypes typ, uint16_t *id) { rsmi_status_t rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) { + std::ostringstream ss; + rsmi_status_t ret; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(id) - return get_id(dv_ind, amd::smi::kDevDevID, id); + + ret = get_id(dv_ind, amd::smi::kDevDevID, id); + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", reporting " << amd::smi::getRSMIStatusString(ret); + LOG_TRACE(ss); + return ret; } rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, uint16_t *id) { TRY + std::ostringstream ss; + rsmi_status_t ret; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(id) - return get_id(dv_ind, amd::smi::kDevDevProdNum, id); + ret = get_id(dv_ind, amd::smi::kDevDevProdNum, id); + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", reporting " << amd::smi::getRSMIStatusString(ret); + 
LOG_TRACE(ss); + return ret; CATCH } rsmi_status_t rsmi_dev_subsystem_id_get(uint32_t dv_ind, uint16_t *id) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(id) return get_id(dv_ind, amd::smi::kDevSubSysDevID, id); } rsmi_status_t rsmi_dev_vendor_id_get(uint32_t dv_ind, uint16_t *id) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(id) return get_id(dv_ind, amd::smi::kDevVendorID, id); } rsmi_status_t rsmi_dev_subsystem_vendor_id_get(uint32_t dv_ind, uint16_t *id) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(id) return get_id(dv_ind, amd::smi::kDevSubSysVendorID, id); } @@ -780,6 +863,9 @@ rsmi_status_t rsmi_dev_perf_level_get(uint32_t dv_ind, rsmi_dev_perf_level_t *perf) { TRY std::string val_str; + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(perf) DEVICE_MUTEX @@ -842,6 +928,9 @@ rsmi_status_t rsmi_dev_overdrive_level_get(uint32_t dv_ind, uint32_t *od) { TRY std::string val_str; + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(od) DEVICE_MUTEX @@ -869,6 +958,9 @@ rsmi_status_t rsmi_dev_mem_overdrive_level_get(uint32_t dv_ind, uint32_t *od) { TRY std::string val_str; + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(od) DEVICE_MUTEX @@ -894,6 +986,9 @@ rsmi_dev_mem_overdrive_level_get(uint32_t dv_ind, uint32_t *od) { rsmi_status_t rsmi_dev_overdrive_level_set(int32_t dv_ind, uint32_t od) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); if (dv_ind < 0) { return RSMI_STATUS_INVALID_ARGS; } @@ -903,6 +998,9 @@ rsmi_dev_overdrive_level_set(int32_t dv_ind, uint32_t od) { 
rsmi_status_t rsmi_dev_overdrive_level_set_v1(uint32_t dv_ind, uint32_t od) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); REQUIRE_ROOT_ACCESS if (od > kMaxOverdriveLevel) { @@ -915,12 +1013,18 @@ rsmi_dev_overdrive_level_set_v1(uint32_t dv_ind, uint32_t od) { rsmi_status_t rsmi_dev_perf_level_set(int32_t dv_ind, rsmi_dev_perf_level_t perf_level) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); return rsmi_dev_perf_level_set_v1(static_cast(dv_ind), perf_level); } rsmi_status_t rsmi_dev_perf_level_set_v1(uint32_t dv_ind, rsmi_dev_perf_level_t perf_level) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); REQUIRE_ROOT_ACCESS if (perf_level > RSMI_DEV_PERF_LEVEL_LAST) { @@ -1194,6 +1298,9 @@ rsmi_status_t rsmi_dev_clk_range_set(uint32_t dv_ind, uint64_t minclkvalue, rsmi_clk_type_t clkType) { TRY rsmi_status_t ret; + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); assert(minclkvalue < maxclkvalue); std::string min_sysvalue, max_sysvalue; @@ -1246,6 +1353,9 @@ rsmi_status_t rsmi_dev_od_clk_info_set(uint32_t dv_ind, rsmi_freq_ind_t level, rsmi_clk_type_t clkType) { TRY rsmi_status_t ret; + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); std::string sysvalue; std::map ClkStateMap = { @@ -1301,6 +1411,9 @@ rsmi_status_t rsmi_dev_od_volt_info_set(uint32_t dv_ind, uint32_t vpoint, uint64_t clkvalue, uint64_t voltvalue) { TRY rsmi_status_t ret; + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); DEVICE_MUTEX @@ -1472,6 +1585,9 @@ rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind, rsmi_clk_type_t clk_type, rsmi_frequencies_t *f) { TRY amd::smi::DevInfoTypes dev_type; + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); 
CHK_SUPPORT_VAR(f, clk_type) @@ -1506,6 +1622,9 @@ rsmi_status_t rsmi_dev_firmware_version_get(uint32_t dv_ind, rsmi_fw_block_t block, uint64_t *fw_version) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_VAR(fw_version, block) std::string val_str; @@ -1609,6 +1728,9 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, rsmi_frequencies_t freqs; TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX @@ -1953,6 +2075,9 @@ rsmi_dev_name_get(uint32_t dv_ind, char *name, size_t len) { rsmi_status_t ret; TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(name) if (len == 0) { @@ -1974,6 +2099,9 @@ rsmi_dev_name_get(uint32_t dv_ind, char *name, size_t len) { rsmi_status_t rsmi_dev_brand_get(uint32_t dv_ind, char *brand, uint32_t len) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(brand) if (len == 0) { return RSMI_STATUS_INVALID_ARGS; @@ -2020,6 +2148,9 @@ rsmi_dev_brand_get(uint32_t dv_ind, char *brand, uint32_t len) { rsmi_status_t rsmi_dev_vram_vendor_get(uint32_t dv_ind, char *brand, uint32_t len) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(brand) if (len == 0) { @@ -2050,6 +2181,9 @@ rsmi_dev_subsystem_name_get(uint32_t dv_ind, char *name, size_t len) { rsmi_status_t ret; TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(name) if (len == 0) { @@ -2068,6 +2202,9 @@ rsmi_dev_drm_render_minor_get(uint32_t dv_ind, uint32_t *minor) { rsmi_status_t ret; TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(minor) DEVICE_MUTEX @@ -2081,6 +2218,9 @@ 
rsmi_dev_vendor_name_get(uint32_t dv_ind, char *name, size_t len) { rsmi_status_t ret; TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(name) assert(len > 0); @@ -2098,6 +2238,9 @@ rsmi_dev_vendor_name_get(uint32_t dv_ind, char *name, size_t len) { rsmi_status_t rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(b) DEVICE_MUTEX @@ -2114,6 +2257,9 @@ rsmi_dev_pci_bandwidth_set(uint32_t dv_ind, uint64_t bw_bitmask) { rsmi_pcie_bandwidth_t bws; TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX ret = rsmi_dev_pci_bandwidth_get(dv_ind, &bws); @@ -2153,6 +2299,9 @@ rsmi_status_t rsmi_dev_pci_throughput_get(uint32_t dv_ind, uint64_t *sent, uint64_t *received, uint64_t *max_pkt_sz) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); rsmi_status_t ret; std::string val_str; @@ -2189,6 +2338,9 @@ rsmi_status_t rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, rsmi_temperature_metric_t metric, int64_t *temperature) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); rsmi_status_t ret; amd::smi::MonitorTypes mon_type; @@ -2309,6 +2461,9 @@ rsmi_status_t rsmi_dev_volt_metric_get(uint32_t dv_ind, rsmi_voltage_type_t sensor_type, rsmi_voltage_metric_t metric, int64_t *voltage) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); rsmi_status_t ret; amd::smi::MonitorTypes mon_type; @@ -2369,6 +2524,9 @@ rsmi_dev_volt_metric_get(uint32_t dv_ind, rsmi_voltage_type_t sensor_type, rsmi_status_t rsmi_dev_fan_speed_get(uint32_t dv_ind, uint32_t sensor_ind, int64_t *speed) { TRY + std::ostringstream ss; + ss << 
__PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); rsmi_status_t ret; @@ -2387,6 +2545,9 @@ rsmi_dev_fan_speed_get(uint32_t dv_ind, uint32_t sensor_ind, int64_t *speed) { rsmi_status_t rsmi_dev_fan_rpms_get(uint32_t dv_ind, uint32_t sensor_ind, int64_t *speed) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); ++sensor_ind; // fan sysfs files have 1-based indices @@ -2406,6 +2567,9 @@ rsmi_status_t rsmi_dev_fan_reset(uint32_t dv_ind, uint32_t sensor_ind) { TRY rsmi_status_t ret; + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); ++sensor_ind; // fan sysfs files have 1-based indices @@ -2424,6 +2588,9 @@ rsmi_dev_fan_speed_set(uint32_t dv_ind, uint32_t sensor_ind, uint64_t speed) { rsmi_status_t ret; uint64_t max_speed; + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX @@ -2461,6 +2628,9 @@ rsmi_dev_fan_speed_max_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *max_speed) { TRY rsmi_status_t ret; + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); ++sensor_ind; // fan sysfs files have 1-based indices CHK_SUPPORT_SUBVAR_ONLY(max_speed, sensor_ind) DEVICE_MUTEX @@ -2475,6 +2645,9 @@ rsmi_dev_fan_speed_max_get(uint32_t dv_ind, uint32_t sensor_ind, rsmi_status_t rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); DEVICE_MUTEX CHK_SUPPORT_NAME_ONLY(odv) rsmi_status_t ret = get_od_clk_volt_info(dv_ind, odv); @@ -2486,6 +2659,9 @@ rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv) { rsmi_status_t rsmi_dev_gpu_reset(int32_t dv_ind) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX @@ 
-2502,6 +2678,9 @@ rsmi_dev_gpu_reset(int32_t dv_ind) { rsmi_status_t rsmi_dev_od_volt_curve_regions_get(uint32_t dv_ind, uint32_t *num_regions, rsmi_freq_volt_region_t *buffer) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY((num_regions == nullptr || buffer == nullptr) ? nullptr : num_regions) @@ -2519,6 +2698,9 @@ rsmi_status_t rsmi_dev_od_volt_curve_regions_get(uint32_t dv_ind, rsmi_status_t rsmi_dev_power_max_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *power) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); (void)sensor_ind; // Not used yet // ++sensor_ind; // power sysfs files have 1-based indices @@ -2536,6 +2718,9 @@ rsmi_dev_power_max_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *power) { rsmi_status_t rsmi_dev_power_ave_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *power) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); ++sensor_ind; // power sysfs files have 1-based indices @@ -2553,6 +2738,9 @@ rsmi_status_t rsmi_dev_energy_count_get(uint32_t dv_ind, uint64_t *power, float *counter_resolution, uint64_t *timestamp) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); rsmi_status_t ret; rsmi_gpu_metrics_t gpu_metrics; @@ -2580,6 +2768,9 @@ rsmi_dev_energy_count_get(uint32_t dv_ind, uint64_t *power, rsmi_status_t rsmi_dev_power_cap_default_get(uint32_t dv_ind, uint64_t *default_cap) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); uint32_t sensor_ind = 1; // power sysfs files have 1-based indices CHK_SUPPORT_SUBVAR_ONLY(default_cap, sensor_ind) @@ -2596,6 +2787,9 @@ rsmi_dev_power_cap_default_get(uint32_t dv_ind, uint64_t *default_cap) { rsmi_status_t rsmi_dev_power_cap_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *cap) { TRY + 
std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); ++sensor_ind; // power sysfs files have 1-based indices CHK_SUPPORT_SUBVAR_ONLY(cap, sensor_ind) @@ -2613,6 +2807,9 @@ rsmi_status_t rsmi_dev_power_cap_range_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *max, uint64_t *min) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); ++sensor_ind; // power sysfs files have 1-based indices CHK_SUPPORT_SUBVAR_ONLY((min == nullptr || max == nullptr ?nullptr : min), @@ -2635,6 +2832,9 @@ rsmi_dev_power_cap_set(uint32_t dv_ind, uint32_t sensor_ind, uint64_t cap) { TRY rsmi_status_t ret; uint64_t min, max; + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX @@ -2663,6 +2863,9 @@ rsmi_status_t rsmi_dev_power_profile_presets_get(uint32_t dv_ind, uint32_t reserved, rsmi_power_profile_status_t *status) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); (void)reserved; CHK_SUPPORT_NAME_ONLY(status) @@ -2677,6 +2880,9 @@ rsmi_status_t rsmi_dev_power_profile_set(uint32_t dv_ind, uint32_t dummy, rsmi_power_profile_preset_masks_t profile) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); REQUIRE_ROOT_ACCESS (void)dummy; @@ -2692,6 +2898,9 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, TRY rsmi_status_t ret; amd::smi::DevInfoTypes mem_type_file; + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_VAR(total, mem_type) @@ -2725,6 +2934,9 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, TRY rsmi_status_t ret; amd::smi::DevInfoTypes mem_type_file; + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_VAR(used, mem_type) @@ -2757,6 
+2969,9 @@ rsmi_status_t rsmi_dev_memory_busy_percent_get(uint32_t dv_ind, uint32_t *busy_percent) { TRY rsmi_status_t ret; + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(busy_percent) @@ -2905,6 +3120,9 @@ rsmi_status_t rsmi_dev_busy_percent_get(uint32_t dv_ind, uint32_t *busy_percent) { TRY std::string val_str; + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(busy_percent) @@ -2975,6 +3193,9 @@ rsmi_utilization_count_get(uint32_t dv_ind, rsmi_status_t rsmi_dev_vbios_version_get(uint32_t dv_ind, char *vbios, uint32_t len) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(vbios) if (len == 0) { @@ -3071,6 +3292,9 @@ rsmi_version_str_get(rsmi_sw_component_t component, char *ver_str, rsmi_status_t rsmi_dev_serial_number_get(uint32_t dv_ind, char *serial_num, uint32_t len) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(serial_num) if (len == 0) { return RSMI_STATUS_INVALID_ARGS; @@ -3102,6 +3326,9 @@ rsmi_status_t rsmi_dev_serial_number_get(uint32_t dv_ind, rsmi_status_t rsmi_dev_pci_replay_counter_get(uint32_t dv_ind, uint64_t *counter) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(counter) rsmi_status_t ret; @@ -3117,6 +3344,9 @@ rsmi_status_t rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *unique_id) { TRY rsmi_status_t ret; + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(unique_id) @@ -3130,6 +3360,9 @@ rsmi_status_t rsmi_dev_counter_create(uint32_t dv_ind, rsmi_event_type_t type, rsmi_event_handle_t *evnt_handle) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + 
LOG_TRACE(ss); REQUIRE_ROOT_ACCESS // Note we don't need to pass in the variant to CHK_SUPPORT_VAR because @@ -3150,6 +3383,9 @@ rsmi_dev_counter_create(uint32_t dv_ind, rsmi_event_type_t type, rsmi_status_t rsmi_dev_counter_destroy(rsmi_event_handle_t evnt_handle) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); if (evnt_handle == 0) { return RSMI_STATUS_INVALID_ARGS; @@ -3267,6 +3503,9 @@ rsmi_counter_available_counters_get(uint32_t dv_ind, rsmi_status_t rsmi_dev_counter_group_supported(uint32_t dv_ind, rsmi_event_group_t group) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); DEVICE_MUTEX GET_DEV_FROM_INDX @@ -3368,6 +3607,9 @@ rsmi_status_t rsmi_dev_memory_reserved_pages_get(uint32_t dv_ind, uint32_t *num_pages, rsmi_retired_page_record_t *records) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); rsmi_status_t ret; CHK_SUPPORT_NAME_ONLY(num_pages) @@ -3466,6 +3708,9 @@ rsmi_compute_process_info_by_pid_get(uint32_t pid, rsmi_status_t rsmi_dev_xgmi_error_status(uint32_t dv_ind, rsmi_xgmi_status_t *status) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(status) rsmi_status_t ret; @@ -3503,6 +3748,9 @@ rsmi_dev_xgmi_error_status(uint32_t dv_ind, rsmi_xgmi_status_t *status) { rsmi_status_t rsmi_dev_xgmi_error_reset(uint32_t dv_ind) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); DEVICE_MUTEX rsmi_status_t ret; @@ -3518,6 +3766,9 @@ rsmi_dev_xgmi_error_reset(uint32_t dv_ind) { rsmi_status_t rsmi_dev_xgmi_hive_id_get(uint32_t dv_ind, uint64_t *hive_id) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); if (hive_id == nullptr) { return RSMI_STATUS_INVALID_ARGS; @@ -3817,6 +4068,9 @@ rsmi_status_t 
rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition, uint32_t len) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); if ((len == 0) || (compute_partition == nullptr)) { return RSMI_STATUS_INVALID_ARGS; } @@ -3863,6 +4117,9 @@ rsmi_status_t rsmi_dev_compute_partition_set(uint32_t dv_ind, rsmi_compute_partition_type_t compute_partition) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX std::string newComputePartitionStr @@ -3952,6 +4209,9 @@ static rsmi_status_t get_nps_mode(uint32_t dv_ind, std::string &nps_mode) { rsmi_status_t rsmi_dev_nps_mode_set(uint32_t dv_ind, rsmi_nps_mode_type_t nps_mode) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX bool isCorrectDevice = false; @@ -4022,6 +4282,9 @@ rsmi_status_t rsmi_dev_nps_mode_get(uint32_t dv_ind, char *nps_mode, uint32_t len) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); if ((len == 0) || (nps_mode == nullptr)) { return RSMI_STATUS_INVALID_ARGS; } @@ -4045,6 +4308,9 @@ rsmi_dev_nps_mode_get(uint32_t dv_ind, char *nps_mode, rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX GET_DEV_FROM_INDX @@ -4066,6 +4332,9 @@ rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind) { rsmi_status_t rsmi_dev_nps_mode_reset(uint32_t dv_ind) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX GET_DEV_FROM_INDX @@ -4094,6 +4363,9 @@ rsmi_status_t rsmi_dev_supported_func_iterator_open(uint32_t dv_ind, rsmi_func_id_iter_handle_t *handle) { TRY + std::ostringstream ss; + ss << 
__PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); GET_DEV_FROM_INDX if (handle == nullptr) { @@ -4136,6 +4408,9 @@ rsmi_dev_supported_variant_iterator_open( rsmi_func_id_iter_handle_t parent_iter, rsmi_func_id_iter_handle_t *var_iter) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); if (var_iter == nullptr || parent_iter->id_type == SUBVARIANT_ITER) { return RSMI_STATUS_INVALID_ARGS; @@ -4206,6 +4481,9 @@ rsmi_dev_supported_variant_iterator_open( rsmi_status_t rsmi_dev_supported_func_iterator_close(rsmi_func_id_iter_handle_t *handle) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); if (handle == nullptr) { return RSMI_STATUS_INVALID_ARGS; diff --git a/projects/rocm-smi-lib/src/rocm_smi_device.cc b/projects/rocm-smi-lib/src/rocm_smi_device.cc index 175fba12d3..311aee3edd 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_device.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_device.cc @@ -41,7 +41,6 @@ * */ - #include #include #include @@ -59,6 +58,7 @@ #include #include #include +#include #include "rocm_smi/rocm_smi_main.h" #include "rocm_smi/rocm_smi_device.h" @@ -66,8 +66,11 @@ #include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_kfd.h" +#include "rocm_smi/rocm_smi_logger.h" #include "shared_mutex.h" // NOLINT +using namespace ROCmLogging; + namespace amd { namespace smi { @@ -570,6 +573,7 @@ int Device::openDebugFileStream(DevInfoTypes type, T *fs, const char *str) { template int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { auto sysfs_path = path_; + std::ostringstream ss; #ifdef DEBUG if (env_->path_DRM_root_override @@ -587,18 +591,35 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { int ret = isRegularFile(sysfs_path, ®_file); if (ret != 0) { + ss << "File did not exist - SYSFS file (" << sysfs_path + << ") for DevInfoInfoType 
(" << RocmSMI::devInfoTypesStrings.at(type) + << "), returning " << std::to_string(ret); + LOG_ERROR(ss); return ret; } if (!reg_file) { + ss << "File is not a regular file - SYSFS file (" << sysfs_path << ") for " + << "DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << ")," + << " returning ENOENT (" << std::strerror(ENOENT) << ")"; + LOG_ERROR(ss); return ENOENT; } fs->open(sysfs_path); if (!fs->is_open()) { + ss << "Could not open - SYSFS file (" << sysfs_path << ") for " + << "DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << "), " + << ", returning " << std::to_string(errno) << " (" + << std::strerror(errno) << ")"; + LOG_ERROR(ss); return errno; } + ss << "Successfully opened SYSFS file (" << sysfs_path + << ") for DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type) + << ")"; + LOG_INFO(ss); return 0; } @@ -606,11 +627,16 @@ int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) { std::ifstream fs; std::string line; int ret = 0; + std::ostringstream ss; assert(retStr != nullptr); ret = openDebugFileStream(type, &fs); if (ret != 0) { + ss << "Could not read debugInfoStr for DevInfoType (" + << RocmSMI::devInfoTypesStrings.at(type)<< "), returning " + << std::to_string(ret); + LOG_ERROR(ss); return ret; } @@ -621,21 +647,34 @@ int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) { fs.close(); + ss << "Successfully read debugInfoStr for DevInfoType (" + << RocmSMI::devInfoTypesStrings.at(type)<< "), retString= " << *retStr; + LOG_INFO(ss); + return 0; } int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) { std::ifstream fs; int ret = 0; + std::ostringstream ss; assert(retStr != nullptr); ret = openSysfsFileStream(type, &fs); if (ret != 0) { + ss << "Could not read device info string for DevInfoType (" + << RocmSMI::devInfoTypesStrings.at(type)<< "), returning " + << std::to_string(ret); + LOG_ERROR(ss); return ret; } fs >> *retStr; + std::string info = "Successfully read device info 
string for DevInfoType (" + + RocmSMI::devInfoTypesStrings.at(type) + "): " + + *retStr; + LOG_INFO(info); fs.close(); return 0; @@ -645,17 +684,30 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr) { auto tempPath = path_; std::ofstream fs; int ret; + std::ostringstream ss; fs.rdbuf()->pubsetbuf(0,0); ret = openSysfsFileStream(type, &fs, valStr.c_str()); if (ret != 0) { + ss << "Could not write device info string (" << valStr + << ") for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) + << "), returning " << std::to_string(ret); + LOG_ERROR(ss); return ret; } // We'll catch any exceptions in rocm_smi.cc code. if (fs << valStr) { + ss << "Successfully wrote device info string (" << valStr + << ") for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) + << "), returning RSMI_STATUS_SUCCESS"; + LOG_INFO(ss); ret = RSMI_STATUS_SUCCESS; } else { + ss << "Could not write device info string (" << valStr + << ") for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) + << "), returning RSMI_STATUS_NOT_SUPPORTED"; + LOG_ERROR(ss); ret = RSMI_STATUS_NOT_SUPPORTED; } fs.close(); @@ -719,15 +771,23 @@ int Device::writeDevInfo(DevInfoTypes type, std::string val) { int Device::readDevInfoLine(DevInfoTypes type, std::string *line) { int ret; std::ifstream fs; + std::ostringstream ss; assert(line != nullptr); ret = openSysfsFileStream(type, &fs); if (ret != 0) { + ss << "Could not read DevInfoLine for DevInfoType (" + << RocmSMI::devInfoTypesStrings.at(type) << ")"; + LOG_ERROR(ss); return ret; } std::getline(fs, *line); + ss << "Successfully read DevInfoLine for DevInfoType (" + << RocmSMI::devInfoTypesStrings.at(type) << "), returning *line = " + << *line; + LOG_INFO(ss); return 0; } @@ -735,20 +795,36 @@ int Device::readDevInfoLine(DevInfoTypes type, std::string *line) { int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, void *p_binary_data) { auto sysfs_path = path_; + std::ostringstream ss; FILE *ptr; sysfs_path += 
"/device/"; sysfs_path += kDevAttribNameMap.at(type); ptr = fopen(sysfs_path.c_str(), "rb"); if (!ptr) { + ss << "Could not read DevInfoBinary for DevInfoType (" + << RocmSMI::devInfoTypesStrings.at(type) << ")" + << " - SYSFS (" << sysfs_path << ")" + << ", returning " << std::to_string(errno) << " (" + << std::strerror(errno) << ")"; + LOG_ERROR(ss); return errno; } size_t num = fread(p_binary_data, b_size, 1, ptr); fclose(ptr); if ((num*b_size) != b_size) { + ss << "Could not read DevInfoBinary for DevInfoType (" + << RocmSMI::devInfoTypesStrings.at(type) << ") - SYSFS (" + << sysfs_path << "), binary size error, " + << ", returning ENOENT (" << std::strerror(ENOENT) << ")"; + LOG_ERROR(ss); return ENOENT; } + ss << "Successfully read DevInfoBinary for DevInfoType (" + << RocmSMI::devInfoTypesStrings.at(type) << ") - SYSFS (" + << sysfs_path << "), returning binaryData = " << p_binary_data; + LOG_INFO(ss); return 0; } @@ -757,6 +833,8 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, std::string line; int ret; std::ifstream fs; + std::string allLines; + std::ostringstream ss; assert(retVec != nullptr); @@ -770,6 +848,10 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, } if (retVec->size() == 0) { + ss << "Read devInfoMultiLineStr for DevInfoType (" + << RocmSMI::devInfoTypesStrings.at(type) << ")" + << ", but contained no string lines"; + LOG_INFO(ss); return 0; } // Remove any *trailing* empty (whitespace) lines @@ -777,6 +859,23 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, retVec->back().find_first_not_of(" \t\n\v\f\r") == std::string::npos) { retVec->pop_back(); } + + // allow logging output of multiline strings + for (auto l: *retVec) { + allLines += "\n" + l; + } + + if (!allLines.empty()) { + ss << "Successfully read devInfoMultiLineStr for DevInfoType (" + << RocmSMI::devInfoTypesStrings.at(type) << ") " + << ", returning lines read = " << allLines; + LOG_INFO(ss); + } else { + ss << "Read devInfoMultiLineStr for DevInfoType 
(" + << RocmSMI::devInfoTypesStrings.at(type) << ")" + << ", but lines were empty"; + LOG_INFO(ss); + } return 0; } diff --git a/projects/rocm-smi-lib/src/rocm_smi_logger.cc b/projects/rocm-smi-lib/src/rocm_smi_logger.cc new file mode 100644 index 0000000000..c900c613c1 --- /dev/null +++ b/projects/rocm-smi-lib/src/rocm_smi_logger.cc @@ -0,0 +1,487 @@ +/* + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + * Detail Description: + * Implements a complete logging mechanism, supporting multiple logging types + * such as file-based logging, console-based logging, etc. It also supports + * different log types. + * + * Thread Safe logging mechanism. Compatible with G++ (Linux platform) + * + * Supported Log Type: ERROR, ALARM, ALWAYS, INFO, BUFFER, TRACE, DEBUG + * No control for ERROR, ALARM and ALWAYS messages. These types of messages + * should always be captured -- IF logging is enabled. + * + * WARNING: Logging is controlled by the user's environment variable - RSMI_LOGGING. + * Enable RSMI_LOGGING with export RSMI_LOGGING=<value>. No logs will + * be printed unless RSMI_LOGGING is enabled. + * + * BUFFER log type should be used when logging raw buffers or raw messages. + * Provides a direct interface as well as a C++ Singleton interface. Use + * whichever interface fits your needs. + */ + +// C++ Header File(s) +#include +#include +#include +#include +#include +#include + +// Code Specific Header File(s) +#include "rocm_smi/rocm_smi_logger.h" +#include "rocm_smi/rocm_smi_main.h" +#include "rocm_smi/rocm_smi_utils.h" + +using namespace ROCmLogging; + +Logger* Logger::m_Instance = nullptr; + +// Log file name +// WARNING: File name should be changed here and +// pre/post install/remove/upgrade scripts. Changing +// in one place will cause a mismatch in these scripts, +// files may not have proper permissions, and logrotate +// would not function properly.
+const std::string logPath = "/var/log/rocm_smi_lib/";
+const std::string logBaseFName = "ROCm-SMI-lib";
+const std::string logExtension = ".log";
+// Full path of the log file: directory + base name + extension.
+const std::string logFileName = logPath + logBaseFName + logExtension;
+
+// Sets the logger up by delegating to initialize_resources().
+Logger::Logger() {
+  initialize_resources();
+}
+
+// Tears the logger down; resources are only released when logging was on.
+Logger::~Logger() {
+  if (m_loggingIsOn) {
+    destroy_resources();
+  }
+}
+
+// Returns the process-wide Logger, creating it on the first call.
+//
+// The instance is created inside the initializer of a function-local static.
+// C++11 runs such an initializer exactly once, even when several threads
+// make the first call at the same time, so two threads can no longer each
+// allocate a Logger. The object is deliberately never deleted, so it stays
+// usable for the whole lifetime of the process.
+Logger* Logger::getInstance() throw() {
+  static Logger* const instance = []() {
+    m_Instance = new Logger();
+    return m_Instance;
+  }();
+  return instance;
+}
+
+// Acquires the logger's lock.
+void Logger::lock() {
+  m_Lock.lock();
+}
+
+// Releases the logger's lock.
+void Logger::unlock() {
+  m_Lock.unlock();
+}
+
+// Appends "<timestamp> <data>" to the log file while holding the lock.
+// If the file is not open, one re-initialization is attempted; when that
+// also fails the message is written to the console instead.
+void Logger::logIntoFile(std::string& data) {
+  lock();
+  if (!m_File.is_open()) {
+    initialize_resources();
+    if (!m_File.is_open()) {
+      std::cout << "WARNING: re-initializing resources was unsuccessfull."
+                << " Unable to print the following message." << std::endl;
+      logOnConsole(data);
+      unlock();
+      return;
+    }
+  }
+  m_File << getCurrentTime() << " " << data << std::endl;
+  unlock();
+}
+
+// Writes "<timestamp> <data>" to standard output.
+void Logger::logOnConsole(std::string& data) {
+  std::cout << getCurrentTime() << " " << data << std::endl;
+}
+
+// Returns the current local time as a string:
+//   YYYY-MM-DD HH:MM:SS.uuuuuu   (uuuuuu = microseconds, zero padded)
+std::string Logger::getCurrentTime(void) {
+  using namespace std::chrono;
+
+  // get current time
+  auto now = system_clock::now();
+
+  // microseconds elapsed within the current second
+  // (remainder after division into seconds)
+  auto us = duration_cast<microseconds>(now.time_since_epoch()) % 1000000;
+
+  // convert to std::time_t in order to convert to std::tm (broken time)
+  auto timer = system_clock::to_time_t(now);
+
+  // Convert to broken-down time. localtime_r fills caller-owned storage;
+  // std::localtime hands back a shared static buffer, which is a data race
+  // when several threads log at once. bt stays zeroed if conversion fails.
+  std::tm bt{};
+  localtime_r(&timer, &bt);
+
+  std::ostringstream oss;
+  oss << std::put_time(&bt, "%F %T");
+  // The remainder is in [0, 999999], so it needs six zero-padded digits;
+  // a narrower field would print 42 us as ".0042".
+  oss << '.' << std::setfill('0') << std::setw(6) << us.count();
+  return oss.str();
+}
+
+// Interface for Error Log
+// ERROR messages are not filtered by log level.
+void Logger::error(const char* text) throw() {
+  // By default, logging is disabled (ie. no RSMI_LOGGING)
+  // The check below allows us to toggle logging through RSMI_LOGGING
+  // set or unset
+  if (!m_loggingIsOn) {
+    return;
+  }
+
+  std::string data("[ERROR]: ");
+  data.append(text);
+
+  // ERROR must be captured
+  if (m_LogType == FILE_LOG) {
+    logIntoFile(data);
+  } else if (m_LogType == CONSOLE) {
+    logOnConsole(data);
+  }
+}
+
+void Logger::error(std::string& text) throw() {
+  error(text.data());
+}
+
+// Logs the stream's contents, then empties the stream so it can be reused.
+void Logger::error(std::ostringstream& stream) throw() {
+  std::string text = stream.str();
+  error(text.data());
+  stream.str("");
+}
+
+// Interface for Alarm Log
+// ALARM messages are not filtered by log level.
+void Logger::alarm(const char* text) throw() {
+  // By default, logging is disabled (ie. no RSMI_LOGGING)
+  // The check below allows us to toggle logging through RSMI_LOGGING
+  // set or unset
+  if (!m_loggingIsOn) {
+    return;
+  }
+
+  std::string data("[ALARM]: ");
+  data.append(text);
+
+  // ALARM must be captured
+  if (m_LogType == FILE_LOG) {
+    logIntoFile(data);
+  } else if (m_LogType == CONSOLE) {
+    logOnConsole(data);
+  }
+}
+
+void Logger::alarm(std::string& text) throw() {
+  alarm(text.data());
+}
+
+// Logs the stream's contents, then empties the stream so it can be reused.
+void Logger::alarm(std::ostringstream& stream) throw() {
+  std::string text = stream.str();
+  alarm(text.data());
+  stream.str("");
+}
+
+// Interface for Always Log
+// ALWAYS messages are not filtered by log level.
+void Logger::always(const char* text) throw() {
+  // By default, logging is disabled (ie. no RSMI_LOGGING)
+  // The check below allows us to toggle logging through RSMI_LOGGING
+  // set or unset
+  if (!m_loggingIsOn) {
+    return;
+  }
+
+  std::string data("[ALWAYS]: ");
+  data.append(text);
+
+  // No check for ALWAYS logs
+  if (m_LogType == FILE_LOG) {
+    logIntoFile(data);
+  } else if (m_LogType == CONSOLE) {
+    logOnConsole(data);
+  }
+}
+
+void Logger::always(std::string& text) throw() {
+  always(text.data());
+}
+
+// Logs the stream's contents, then empties the stream so it can be reused.
+void Logger::always(std::ostringstream& stream) throw() {
+  std::string text = stream.str();
+  always(text.data());
+  stream.str("");
+}
+
+// Interface for Buffer Log
+void Logger::buffer(const char* text) throw() {
+  // By default, logging is disabled (ie. no RSMI_LOGGING)
+  // Same gate as every other interface: without it a BUFFER message could
+  // still try to open the log file or print while logging is switched off.
+  if (!m_loggingIsOn) {
+    return;
+  }
+
+  // Buffer is the special case. So don't add log level
+  // and timestamp in the buffer message. Just log the raw bytes.
+  if ((m_LogType == FILE_LOG) && (m_LogLevel >= LOG_LEVEL_BUFFER)) {
+    lock();
+    if (!m_File.is_open()) {
+      initialize_resources();
+      if (!m_File.is_open()) {
+        std::cout << "WARNING: re-initializing resources was unsuccessfull."
+                  << " Unable to print the following message." << std::endl;
+        std::string txtStr(text);
+        std::cout << txtStr << std::endl;
+        unlock();
+        return;
+      }
+    }
+    m_File << text << std::endl;
+    unlock();
+  } else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_BUFFER)) {
+    std::cout << text << std::endl;
+  }
+}
+
+void Logger::buffer(std::string& text) throw() {
+  buffer(text.data());
+}
+
+// Logs the stream's contents, then empties the stream so it can be reused.
+void Logger::buffer(std::ostringstream& stream) throw() {
+  std::string text = stream.str();
+  buffer(text.data());
+  stream.str("");
+}
+
+// Interface for Info Log
+// Emitted only when the log level is at least LOG_LEVEL_INFO.
+void Logger::info(const char* text) throw() {
+  // By default, logging is disabled (ie. no RSMI_LOGGING)
+  // The check below allows us to toggle logging through RSMI_LOGGING
+  // set or unset
+  if (!m_loggingIsOn) {
+    return;
+  }
+
+  std::string data("[INFO]: ");
+  data.append(text);
+
+  if ((m_LogType == FILE_LOG) && (m_LogLevel >= LOG_LEVEL_INFO)) {
+    logIntoFile(data);
+  } else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_INFO)) {
+    logOnConsole(data);
+  }
+}
+
+void Logger::info(std::string& text) throw() {
+  info(text.data());
+}
+
+// Logs the stream's contents, then empties the stream so it can be reused.
+void Logger::info(std::ostringstream& stream) throw() {
+  std::string text = stream.str();
+  info(text.data());
+  stream.str("");
+}
+
+// Interface for Trace Log
+// Emitted only when the log level is at least LOG_LEVEL_TRACE.
+void Logger::trace(const char* text) throw() {
+  // By default, logging is disabled (ie. no RSMI_LOGGING)
+  // The check below allows us to toggle logging through RSMI_LOGGING
+  // set or unset
+  if (!m_loggingIsOn) {
+    return;
+  }
+
+  std::string data("[TRACE]: ");
+  data.append(text);
+
+  if ((m_LogType == FILE_LOG) && (m_LogLevel >= LOG_LEVEL_TRACE)) {
+    logIntoFile(data);
+  } else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_TRACE)) {
+    logOnConsole(data);
+  }
+}
+
+void Logger::trace(std::string& text) throw() {
+  trace(text.data());
+}
+
+// Logs the stream's contents, then empties the stream so it can be reused.
+void Logger::trace(std::ostringstream& stream) throw() {
+  std::string text = stream.str();
+  trace(text.data());
+  stream.str("");
+}
+
+// Interface for Debug Log
+// Emitted only when the log level is at least LOG_LEVEL_DEBUG.
+void Logger::debug(const char* text) throw() {
+  // By default, logging is disabled (ie. no RSMI_LOGGING)
+  // The check below allows us to toggle logging through RSMI_LOGGING
+  // set or unset
+  if (!m_loggingIsOn) {
+    return;
+  }
+
+  std::string data("[DEBUG]: ");
+  data.append(text);
+
+  if ((m_LogType == FILE_LOG) && (m_LogLevel >= LOG_LEVEL_DEBUG)) {
+    logIntoFile(data);
+  } else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_DEBUG)) {
+    logOnConsole(data);
+  }
+}
+
+void Logger::debug(std::string& text) throw() {
+  debug(text.data());
+}
+
+// Logs the stream's contents, then empties the stream so it can be reused.
+void Logger::debug(std::ostringstream& stream) throw() {
+  std::string text = stream.str();
+  debug(text.data());
+  stream.str("");
+}
+
+// Interfaces to control log levels
+void Logger::updateLogLevel(LogLevel logLevel) {
+  m_LogLevel = logLevel;
+}
+
+void Logger::enableAllLogLevels() {
+  m_LogLevel = ENABLE_LOG;
+}
+
+// Disable all level-gated logs. ERROR, ALARM and ALWAYS do not consult the
+// log level, so they are still emitted.
+void Logger::disableLog() {
+  m_LogLevel = DISABLE_LOG;
+}
+
+// Interfaces to control log Types
+void Logger::updateLogType(LogType logType) {
+  m_LogType = logType;
+}
+
+void Logger::enableConsoleLogging() {
+  m_LogType = CONSOLE;
+}
+
+void Logger::enableFileLogging() {
+  m_LogType = FILE_LOG;
+}
+
+// Returns a string of details on current log settings
+std::string Logger::getLogSettings() {
+  std::string logSettings = "";
+
+  if (m_File.is_open()) {
+    logSettings += "OpenStatus = File (" + logFileName + ") is open";
+  } else {
+    logSettings += "OpenStatus = File (" + logFileName + ") is not open";
+  }
+  logSettings += ", ";
+
+  switch (m_LogType) {
+    case NO_LOG:
+      logSettings += "LogType = NO_LOG";
+      break;
+    case FILE_LOG:
+      logSettings += "LogType = FILE_LOG";
+      break;
+    case CONSOLE:
+      logSettings += "LogType = CONSOLE";
+      break;
+    default:
+      logSettings += "LogType = ";
+  }
+  logSettings += ", ";
+
+  switch (m_LogLevel) {
+    case DISABLE_LOG:
+      logSettings += "LogLevel = DISABLE_LOG";
+      break;
+    case LOG_LEVEL_INFO:
+      logSettings += "LogLevel = LOG_LEVEL_INFO";
+      break;
+    case LOG_LEVEL_BUFFER:
logSettings += "LogLevel = LOG_LEVEL_BUFFER"; + break; + case LOG_LEVEL_TRACE: + logSettings += "LogLevel = LOG_LEVEL_TRACE"; + break; + case LOG_LEVEL_DEBUG: + logSettings += "LogLevel = LOG_LEVEL_DEBUG"; + break; + case ENABLE_LOG: + logSettings += "LogLevel = ENABLE_LOG"; + break; + default: + logSettings += "LogLevel = "; + } + + return logSettings; +} + +// Returns current reported enabled logging state. State is controlled by +// user's environment variable RSMI_LOGGING. +bool Logger::isLoggerEnabled() { + return m_loggingIsOn; +} + +void Logger::initialize_resources() { + // By default, logging is disabled (ie. no RSMI_LOGGING) + // The check below allows us to toggle logging through RSMI_LOGGING + // set or unset + m_loggingIsOn = amd::smi::RocmSMI::getInstance().isLoggingOn(); + if (m_loggingIsOn == false) { + return; + } + m_File.open(logFileName.c_str(), std::ios::out | std::ios::app); + m_LogLevel = LOG_LEVEL_TRACE; + m_LogType = FILE_LOG; + if (!m_File.is_open()) { + std::cout << "WARNING: Issue opening log file (" << logFileName + << ") to write." << std::endl; + } + if (m_File.fail()) { + std::cout << "WARNING: Failed opening log file." 
<< std::endl; + } + chmod(logFileName.c_str(), S_IRUSR|S_IRGRP|S_IROTH|S_IWUSR|S_IWGRP|S_IWOTH); +} + +void Logger::destroy_resources() { + m_File.close(); +} diff --git a/projects/rocm-smi-lib/src/rocm_smi_main.cc b/projects/rocm-smi-lib/src/rocm_smi_main.cc index 349f6030ef..72b0d5d5f6 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_main.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_main.cc @@ -65,6 +65,9 @@ #include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_kfd.h" +#include "rocm_smi/rocm_smi_logger.h" + +using namespace ROCmLogging; static const char *kPathDRMRoot = "/sys/class/drm"; static const char *kPathHWMonRoot = "/sys/class/hwmon"; @@ -300,6 +303,16 @@ RocmSMI::Initialize(uint64_t flags) { uint32_t ret; int i_ret; + LOG_ALWAYS("=============== ROCM SMI initialize ================"); + Logger::getInstance()->enableAllLogLevels(); + // Leaving below to allow developers to check current log settings + // std::string logSettings = Logger::getInstance()->getLogSettings(); + // std::cout << "Current log settings:\n" << logSettings << std::endl; + + if (Logger::getInstance()->isLoggerEnabled()) { + logSystemDetails(); + } + assert(ref_count_ == 1); if (ref_count_ != 1) { throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR, @@ -314,8 +327,6 @@ RocmSMI::Initialize(uint64_t flags) { // To help debug env variable issues // printEnvVarInfo(); - while (env_vars_.debug_inf_loop) {} - while (std::string(kAMDMonitorTypes[i]) != "") { amd_monitor_types_.insert(kAMDMonitorTypes[i]); ++i; @@ -443,6 +454,18 @@ static uint32_t GetEnvVarUInteger(const char *ev_str) { return 0; } +// provides a way to get env variable detail in both debug & release +// helps enable full logging +static bool getRSMIEnvVar_LoggingEnabled(const char *ev_str) { + bool isLoggingEnabled = false; + ev_str = getenv(ev_str); + + if (ev_str != nullptr) { + isLoggingEnabled = true; + } + return isLoggingEnabled; +} + static std::unordered_set 
// Reports whether logging has been requested through the environment.
// GetEnvVariables() is invoked first, so env_vars_ (including logging_on,
// which is derived from RSMI_LOGGING) is re-read on every call instead of
// being served from an earlier snapshot.
bool RocmSMI::isLoggingOn(void) {
  GetEnvVariables();
  return this->env_vars_.logging_on;
}
"true" : "false") << std::endl; std::cout << __PRETTY_FUNCTION__ << " | env_vars_.enum_overrides = {"; if (env_vars_.enum_overrides.empty()) { std::cout << "}" << std::endl; diff --git a/projects/rocm-smi-lib/src/rocm_smi_monitor.cc b/projects/rocm-smi-lib/src/rocm_smi_monitor.cc index b493113a54..7d49ef0711 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_monitor.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_monitor.cc @@ -57,6 +57,9 @@ #include "rocm_smi/rocm_smi_monitor.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_exception.h" +#include "rocm_smi/rocm_smi_logger.h" + +using namespace ROCmLogging; namespace amd { namespace smi { @@ -316,7 +319,8 @@ int Monitor::readMonitor(MonitorTypes type, uint32_t sensor_id, std::string sysfs_path = MakeMonitorPath(type, sensor_id); DBG_FILE_ERROR(sysfs_path, (std::string *)nullptr) - return ReadSysfsStr(sysfs_path, val); + int ret = ReadSysfsStr(sysfs_path, val); + return ret; } int32_t diff --git a/projects/rocm-smi-lib/src/rocm_smi_utils.cc b/projects/rocm-smi-lib/src/rocm_smi_utils.cc index 1d22d5d5c7..70425d78c3 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_utils.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_utils.cc @@ -45,6 +45,8 @@ #include #include #include +#include +#include #include #include @@ -60,6 +62,9 @@ #include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_main.h" #include "rocm_smi/rocm_smi_device.h" +#include "rocm_smi/rocm_smi_logger.h" + +using namespace ROCmLogging; namespace amd { namespace smi { @@ -97,6 +102,54 @@ bool FileExists(char const *filename) { return (stat(filename, &buf) == 0); } +static void debugFilesDiscovered(std::vector files) { + std::ostringstream ss; + int numberOfFilesFound = static_cast(files.size()); + ss << "fileName.size() = " << numberOfFilesFound + << "; Files discovered = {"; + if(numberOfFilesFound > 0) { + for (auto it = begin(files); it != end(files); ++it) { + auto nextElement = std::next(it); + if (nextElement != files.end()) { + ss << 
// Expands a shell-style file pattern and returns every path that matched.
//
// Input:  filePattern - pattern handed to glob(3) with GLOB_TILDE, so `*`
//         wildcards and a leading `~` are honoured.
//         example: globFilesExist("/etc/*release")
// Return: vector of matching paths; empty when glob() reports anything other
//         than success (no match or an error).
// You can obtain if files exist by doing globFilesExist(...).size() > 0
std::vector<std::string> globFilesExist(const std::string& filePattern) {
  std::vector<std::string> fileNames;
  glob_t result_glob{};  // value-initialised, equivalent to zero-filling

  if (glob(filePattern.c_str(), GLOB_TILDE, nullptr, &result_glob) == 0) {
    fileNames.assign(result_glob.gl_pathv,
                     result_glob.gl_pathv + result_glob.gl_pathc);
  }
  // Released on every path, whether or not glob() succeeded.
  globfree(&result_glob);

  // Leaving below to help debug issues discovering future glob file searches
  // debugFilesDiscovered(fileNames);
  return fileNames;
}
"Could not read SYSFS file (" << path << ")" + << ", returning " << std::to_string(ret) << " (" + << std::strerror(ret) << ")"; + LOG_ERROR(oss); return ret; } ss << fs.rdbuf(); @@ -148,6 +213,9 @@ int ReadSysfsStr(std::string path, std::string *retStr) { retStr->erase(std::remove(retStr->begin(), retStr->end(), '\n'), retStr->end()); + oss << "Successfully read SYSFS file (" << path << ")" + << ", returning str = " << *retStr; + LOG_INFO(oss); return ret; } @@ -268,6 +336,20 @@ std::string trim(const std::string &s) { return s; } +// Given original string and string to remove (removeMe) +// Return will provide the resulting modified string with the removed string(s) +std::string removeString(const std::string origStr, + const std::string &removeMe) { + std::string modifiedStr = origStr; + std::string::size_type l = removeMe.length(); + for (std::string::size_type i = modifiedStr.find(removeMe); + i != std::string::npos; + i = modifiedStr.find(removeMe)) { + modifiedStr.erase(i, l); + } + return modifiedStr; +} + // defaults to trim stdOut std::pair executeCommand(std::string command, bool stdOut) { char buffer[128]; @@ -378,10 +460,10 @@ std::vector getListOfAppTmpFiles() { return tmpFiles; } -// Reads a temporary file in path provided +// Reads a file in path provided // If file does not exist, returns an empty string // If file exists, returns content (which could be an empty string) -std::string readTemporaryFile(std::string path) { +std::string readFile(std::string path) { std::string fileContent; std::ifstream inFileStream(path); if (inFileStream.is_open()) { @@ -390,13 +472,32 @@ std::string readTemporaryFile(std::string path) { return fileContent; } +// Reads a file in path provided +// If file does not exist, returns an empty vector +// If file exists, returns content (each line put into a vector; which +// could be an empty string) +std::vector readEntireFile(std::string path) { + std::vector fileContent; + std::ifstream inFileStream(path); + if 
// Used to debug vector string list and their content.
// Prints "Vector = {e1, e2, ...}" followed by a newline on stdout. The
// closing brace and newline are written for every input; previously they were
// only written for an empty vector, leaving non-empty output unterminated.
// NOTE(review): the element type is taken to be std::string, per the comment
// in the original - confirm against the declaration in the header.
void displayVectorContent(std::vector<std::string> v) {
  std::cout << "Vector = {";
  for (auto it = v.begin(); it != v.end(); ++it) {
    if (it != v.begin()) {
      std::cout << ", ";
    }
    std::cout << *it;
  }
  std::cout << "}" << std::endl;
}
@@ -428,7 +545,7 @@ std::tuple readTmpFile(uint32_t dv_ind, if (tmpFiles.empty() == false) { for (auto &x: tmpFiles) { if (containsString(x, tmpFileName)) { - fileContent = readTemporaryFile(x); + fileContent = readFile(x); fileExists = true; break; } @@ -436,5 +553,88 @@ std::tuple readTmpFile(uint32_t dv_ind, } return std::make_tuple(fileExists, fileContent); } + +// wrapper to return string expression of a rsmi_status_t return +std::string getRSMIStatusString(rsmi_status_t ret) { + const char *err_str; + rsmi_status_string(ret, &err_str); + return std::string(err_str); +} + +// Returns a tuple: +// boolean errorDetected = returns true, if error found retrieving system +// details +// string sysname = system name (os name) +// string nodename = name of the system's node on the network +// string release = os's release level +// string version = os's version level +// string machine = hardware type system is running on +// string domainName = domain name of the the system's node on the network +// string os_distribution = pretty name of os distribution +// (typically found in /etc/*-release file) +std::tuple getSystemDetails(void) { + struct utsname buf; + bool errorDetected = false; + std::string temp_data; + std::string sysname, nodename, release, version, machine; + std::string domainName = ""; + std::string os_distribution = ""; + + if (uname(&buf) < 0) { + errorDetected = true; + } else { + sysname = buf.sysname; + nodename = buf.nodename; + release = buf.release; + version = buf.version; + machine = buf.machine; + #ifdef _GNU_SOURCE + domainName = buf.domainname; + #endif + } + + std::string filePath = "/etc/os-release"; + bool fileExists = FileExists(filePath.c_str()); + if (fileExists == true) { + std::vector fileContent = readEntireFile(filePath); + for (auto &line: fileContent) { + if (line.find("PRETTY_NAME=") != std::string::npos) { + temp_data = removeString(line, "PRETTY_NAME="); + temp_data = removeString(temp_data, "\""); + os_distribution = 
temp_data; + break; + } + } + } + return std::make_tuple(errorDetected, sysname, nodename, release, + version, machine, domainName, os_distribution); +} + +// If logging is enabled through RSMI_LOGGING environment variable. +// We display helpful system metrics for debug purposes. +void logSystemDetails(void) { + std::ostringstream ss; + bool errorDetected; + std::string sysname, node, release, version, machine, domain, distName; + std::tie(errorDetected, sysname, node, release, version, machine, domain, + distName) = getSystemDetails(); + if (errorDetected == false) { + ss << "====== Gathered system details ============\n" + << "SYSTEM NAME: " << sysname << "\n" + << "OS DISTRIBUTION: " << distName << "\n" + << "NODE NAME: " << node << "\n" + << "RELEASE: " << release << "\n" + << "VERSION: " << version << "\n" + << "MACHINE TYPE: " << machine << "\n" + << "DOMAIN: " << domain << "\n"; + LOG_INFO(ss); + } else { + ss << "====== Gathered system details ============\n" + << "Could not retrieve system details"; + LOG_ERROR(ss); + } +} + } // namespace smi } // namespace amd