From 4e39fe3e25f60d944f65721d48b03c8626f138b3 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Tue, 1 Aug 2023 21:46:19 -0500 Subject: [PATCH] [lib] Enhance Logger: gpu_metrics + enable console out * Updates: - Env variable RSMI_LOGGING=0 or any value other than 1-3 -> all logging off - Env variable RSMI_LOGGING=1 -> logs only - Env variable RSMI_LOGGING=2 -> console only - Env variable RSMI_LOGGING=3 -> both logs + console - Metrics output includes hexdump of current file and decoded metrics (functions: logHexDump and log_gpu_metrics) - System info gathered now includes the system's perceived endianness (little or big endian), which is helpful for viewing decoded hexdump or any binary translation - Added templates for printing unsigned hex (print_int_as_hex), unsigned integers (print_unsigned_int), and printing both unsigned hex and int with an optional header (print_unsigned_hex_and_int) - Fixed some build compile warnings/errors - ex. the strncpys for sku or board names are expected and needed, so the truncation warning is suppressed; for temp file writes, if unsuccessful, we now properly return RSMI_STATUS_FILE_ERROR - Fixed logrotate not initializing properly on RHEL 8.8/9.x Change-Id: Ifa0f0218c9cafd0a8cd6aa8e7f94d61e9107200f Signed-off-by: Charis Poag [ROCm/rocm_smi_lib commit: 9c7eed7edcb773acf62a9316a8f0eb2a95d67d57] --- projects/rocm-smi-lib/DEBIAN/postinst.in | 5 +- projects/rocm-smi-lib/RPM/post.in | 5 +- .../include/rocm_smi/rocm_smi_logger.h | 1 + .../include/rocm_smi/rocm_smi_main.h | 1 + .../include/rocm_smi/rocm_smi_utils.h | 49 +++- projects/rocm-smi-lib/oam/src/amd_oam.cc | 3 + projects/rocm-smi-lib/src/rocm_smi.cc | 4 +- projects/rocm-smi-lib/src/rocm_smi_device.cc | 7 +- .../rocm-smi-lib/src/rocm_smi_gpu_metrics.cc | 218 ++++++++++++++++++ projects/rocm-smi-lib/src/rocm_smi_logger.cc | 45 +++- projects/rocm-smi-lib/src/rocm_smi_main.cc | 29 ++- projects/rocm-smi-lib/src/rocm_smi_utils.cc | 127 +++++++++- 12 files changed, 470 insertions(+), 24 deletions(-) diff --git 
a/projects/rocm-smi-lib/DEBIAN/postinst.in b/projects/rocm-smi-lib/DEBIAN/postinst.in index a62d7e9eea..ab2f640553 100755 --- a/projects/rocm-smi-lib/DEBIAN/postinst.in +++ b/projects/rocm-smi-lib/DEBIAN/postinst.in @@ -62,12 +62,11 @@ EOF # confirm logrotate file exists in daily if [ -f /etc/cron.daily/logrotate ]; then # move logrotate daily to hourly - if [ -f /etc/cron.hourly/logrotate ]; then + if [ -d /etc/cron.hourly ]; then sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate else echo "[WARNING] Could find and configure hourly cron for $packageName's"\ " logrotate. $packageName logs (when turned on) will not rotate properly." - return fi else # confirm that it's already been moved to hourly @@ -77,6 +76,7 @@ EOF "$packageName logs (when turned on) may not rotate properly." fi fi + return #done configuring for non-systemd timers else # Configure systemd timers - the typical setup for modern Linux logrotation setups if [ -f /lib/systemd/system/logrotate.timer ]; then @@ -102,6 +102,7 @@ EOF echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\ "$packageName logs (when turned on) will not rotate properly." fi + return #done configuring for systemd timers fi } diff --git a/projects/rocm-smi-lib/RPM/post.in b/projects/rocm-smi-lib/RPM/post.in index 379dcd7152..b04e31c9f9 100755 --- a/projects/rocm-smi-lib/RPM/post.in +++ b/projects/rocm-smi-lib/RPM/post.in @@ -62,12 +62,11 @@ EOF # confirm logrotate file exists in daily if [ -f /etc/cron.daily/logrotate ]; then # move logrotate daily to hourly - if [ -f /etc/cron.hourly/logrotate ]; then + if [ -d /etc/cron.hourly ]; then sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate else echo "[WARNING] Could find and configure hourly cron for $packageName's"\ " logrotate. $packageName logs (when turned on) will not rotate properly." 
- return fi else # confirm that it's already been moved to hourly @@ -77,6 +76,7 @@ EOF "$packageName logs (when turned on) may not rotate properly." fi fi + return #done configuring for non-systemd timers else # Configure systemd timers - the typical setup for modern Linux logrotation setups if [ -f /lib/systemd/system/logrotate.timer ]; then @@ -102,6 +102,7 @@ EOF echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\ "$packageName logs (when turned on) will not rotate properly." fi + return #done configuring for systemd timers fi } diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_logger.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_logger.h index 3ff1070418..bd2608db58 100644 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_logger.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_logger.h @@ -100,6 +100,7 @@ typedef enum LOG_TYPE { NO_LOG = 1, CONSOLE = 2, FILE_LOG = 3, + BOTH_FILE_AND_CONSOLE = 4 } LogType; class Logger { diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h index a64adddcc5..f276bd85bb 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h @@ -115,6 +115,7 @@ class RocmSMI { const RocmSMI_env_vars& getEnv(void); void printEnvVarInfo(void); bool isLoggingOn(void); + uint32_t getLogSetting(void); static const std::map devInfoTypesStrings; private: diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h index 087ca5faa9..a655b5b136 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h @@ -48,6 +48,9 @@ #include #include #include +#include +#include +#include #include "rocm_smi/rocm_smi_device.h" @@ -94,8 +97,52 @@ GetDevBinaryBlob(amd::smi::DevInfoTypes type, rsmi_status_t ErrnoToRsmiStatus(int err); 
std::string getRSMIStatusString(rsmi_status_t ret); std::tuple getSystemDetails(void); + std::string, std::string, std::string, std::string> + getSystemDetails(void); void logSystemDetails(void); +void logHexDump(const char *desc, const void *addr, const size_t len, + size_t perLine); +bool isSystemBigEndian(); +template +std::string print_int_as_hex(T i, bool showHexNotation=true) { + std::stringstream ss; + if (showHexNotation) { + ss << "0x" << std::setfill('0') << std::setw(sizeof(T) * 2) << std::hex; + } else { + ss << std::setfill('0') << std::setw(sizeof(T) * 2) << std::hex; + } + + if (std::is_same::value) { + ss << static_cast(i|0); + } else if (std::is_same::value) { + ss << static_cast(static_cast(i|0)); + } else if (std::is_signed::value) { + ss << static_cast(i | 0); + } else { + ss << static_cast(i | 0); + } + ss << std::dec; + return ss.str(); +}; + +template +std::string print_unsigned_int(T i) { + std::stringstream ss; + ss << static_cast(i | 0); + return ss.str(); +} + +template +std::string print_unsigned_hex_and_int(T i, std::string heading="") { + std::stringstream ss; + if (heading.empty() == false) { + ss << "\n" << heading << " = "; + } + ss << "Hex (MSB): " << print_int_as_hex(i) << ", " + << "Unsigned int: " << print_unsigned_int(i) << ", " + << "Byte Size: " << sizeof(T); + return ss.str(); +} struct pthread_wrap { public: diff --git a/projects/rocm-smi-lib/oam/src/amd_oam.cc b/projects/rocm-smi-lib/oam/src/amd_oam.cc index 8d63d94f40..62d4b28287 100755 --- a/projects/rocm-smi-lib/oam/src/amd_oam.cc +++ b/projects/rocm-smi-lib/oam/src/amd_oam.cc @@ -166,8 +166,11 @@ TRY rsmi_dev_name_get(dev_inx, dev->device_name, DEVICE_NAME_LEN); rsmi_dev_vbios_version_get(dev_inx, buf, buf_size); if (std::strlen(buf) > 0) { +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstringop-truncation" std::strncpy(dev->sku_name, &buf[4], 6); std::strncpy(dev->board_name, buf, 12); +#pragma GCC diagnostic pop } rsmi_dev_serial_number_get(dev_inx, 
dev->board_serial_number, BOARD_SERIAL_NUM_LEN); diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index f06ecfac0b..61128c03eb 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -673,8 +673,8 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, default: ss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", default case -> reporting RSMI_STATUS_NOT_SUPPORTED" - << amd::smi::getRSMIStatusString(ret); + << ", default case -> reporting " + << amd::smi::getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED); LOG_ERROR(ss); return RSMI_STATUS_NOT_SUPPORTED; } diff --git a/projects/rocm-smi-lib/src/rocm_smi_device.cc b/projects/rocm-smi-lib/src/rocm_smi_device.cc index bdc5984ce4..87077195ce 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_device.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_device.cc @@ -826,7 +826,12 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, } ss << "Successfully read DevInfoBinary for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << ") - SYSFS (" - << sysfs_path << "), returning binaryData = " << p_binary_data; + << sysfs_path << "), returning binaryData = " << p_binary_data + << "; byte_size = " << std::dec << static_cast(b_size); + + std::string metricDescription = "AMD SMI GPU METRICS (16-byte width), " + + sysfs_path; + logHexDump(metricDescription.c_str(), p_binary_data, b_size, 16); LOG_INFO(ss); return 0; } diff --git a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc index 648b18a0e2..885c36d7f6 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc @@ -60,6 +60,10 @@ #include "rocm_smi/rocm_smi_monitor.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_exception.h" +#include "rocm_smi/rocm_smi_logger.h" + +using namespace ROCmLogging; +using namespace amd::smi; #define TRY 
try { #define CATCH } catch (...) {return amd::smi::handleException();} @@ -139,6 +143,196 @@ typedef struct { } rsmi_gpu_metrics_v_1_3; + +// log current gpu_metrics file content read +// any metrics value can be a nullptr +void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header, + const rsmi_gpu_metrics_v_1_2 *rsmi_gpu_metrics_v_1_2, + const rsmi_gpu_metrics_v_1_3 *gpu_metrics_v_1_3, + const rsmi_gpu_metrics_t *rsmi_gpu_metrics) { + if (RocmSMI::getInstance().isLoggingOn() == false) { + return; + } + std::ostringstream ss; + if (gpu_metrics_table_header != nullptr) { + ss + /* Common Header */ + << print_unsigned_hex_and_int( + gpu_metrics_table_header->structure_size, + "gpu_metrics_table_header->structure_size") + << print_unsigned_hex_and_int( + gpu_metrics_table_header->format_revision, + "gpu_metrics_table_header->format_revision") + << print_unsigned_hex_and_int( + gpu_metrics_table_header->content_revision, + "gpu_metrics_table_header->content_revision"); + LOG_DEBUG(ss); + } + if (rsmi_gpu_metrics == nullptr) { + return; + } else { + // do nothing - continue + } + ss + /* Common Header */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->common_header.structure_size, + "rsmi_gpu_metrics->common_header.structure_size") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->common_header.format_revision, + "rsmi_gpu_metrics->common_header.format_revision") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->common_header.content_revision, + "rsmi_gpu_metrics->common_header.content_revision") + /* Temperature */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_edge, + "rsmi_gpu_metrics->temperature_edge") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_hotspot, + "rsmi_gpu_metrics->temperature_hotspot") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_mem, + "rsmi_gpu_metrics->temperature_mem") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_vrgfx, + 
"rsmi_gpu_metrics->temperature_vrgfx") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_vrsoc, + "rsmi_gpu_metrics->temperature_vrsoc") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_vrmem, + "rsmi_gpu_metrics->temperature_vrmem") + /* Utilization */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_gfx_activity, + "rsmi_gpu_metrics->average_gfx_activity") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_umc_activity, + "rsmi_gpu_metrics->average_umc_activity") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_mm_activity, + "rsmi_gpu_metrics->average_mm_activity") + /* Power/Energy */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_socket_power, + "rsmi_gpu_metrics->average_socket_power") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->energy_accumulator, + "rsmi_gpu_metrics->energy_accumulator") + /* Driver attached timestamp (in ns) */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->system_clock_counter, + "rsmi_gpu_metrics->system_clock_counter") + /* Average clocks */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_gfxclk_frequency, + "rsmi_gpu_metrics->average_gfxclk_frequency") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_socclk_frequency, + "rsmi_gpu_metrics->average_socclk_frequency") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_uclk_frequency, + "rsmi_gpu_metrics->average_uclk_frequency") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_vclk0_frequency, + "rsmi_gpu_metrics->average_vclk0_frequency") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_dclk0_frequency, + "rsmi_gpu_metrics->average_dclk0_frequency") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_vclk1_frequency, + "rsmi_gpu_metrics->average_vclk1_frequency") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_dclk1_frequency, + "rsmi_gpu_metrics->average_dclk1_frequency") + /* Current clocks */ + << 
print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_gfxclk, + "rsmi_gpu_metrics->current_gfxclk") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_socclk, + "rsmi_gpu_metrics->current_socclk") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_uclk, + "rsmi_gpu_metrics->current_uclk") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_vclk0, + "rsmi_gpu_metrics->current_vclk0") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_dclk0, + "rsmi_gpu_metrics->current_dclk0") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_vclk1, + "rsmi_gpu_metrics->current_vclk1") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_dclk1, + "rsmi_gpu_metrics->current_dclk1") + /* Throttle status */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->throttle_status, + "rsmi_gpu_metrics->throttle_status") + /* Fans */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_fan_speed, + "rsmi_gpu_metrics->current_fan_speed") + /* Link width/speed */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->pcie_link_width, + "rsmi_gpu_metrics->pcie_link_width") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->pcie_link_speed, + "rsmi_gpu_metrics->pcie_link_speed") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->padding, + "rsmi_gpu_metrics->padding") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->gfx_activity_acc, + "rsmi_gpu_metrics->gfx_activity_acc") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->mem_actvity_acc, + "rsmi_gpu_metrics->mem_actvity_acc"); + for (int i=0; i < RSMI_NUM_HBM_INSTANCES; i++) { + ss << print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_hbm[i], + "rsmi_gpu_metrics->temperature_hbm[" + std::to_string(i) + "]"); + } + + if (rsmi_gpu_metrics_v_1_2 != nullptr) { + /* PMFW attached timestamp (10ns resolution) */ + ss + << print_unsigned_hex_and_int( + rsmi_gpu_metrics_v_1_2->firmware_timestamp, + "rsmi_gpu_metrics_v_1_2->firmware_timestamp"); + } + + if 
(gpu_metrics_v_1_3 != nullptr) { + /* PMFW attached timestamp (10ns resolution) */ + ss + << print_unsigned_hex_and_int( + gpu_metrics_v_1_3->firmware_timestamp, + "gpu_metrics_v_1_3->firmware_timestamp") + /* Voltage (mV) */ + << print_unsigned_hex_and_int( + gpu_metrics_v_1_3->voltage_soc, + "gpu_metrics_v_1_3->voltage_soc") + << print_unsigned_hex_and_int( + gpu_metrics_v_1_3->voltage_gfx, + "gpu_metrics_v_1_3->voltage_gfx") + << print_unsigned_hex_and_int( + gpu_metrics_v_1_3->voltage_mem, + "gpu_metrics_v_1_3->voltage_mem") + << print_unsigned_hex_and_int( + gpu_metrics_v_1_3->padding1, + "gpu_metrics_v_1_3->padding1") + /* Throttle status (ASIC independent) */ + << print_unsigned_hex_and_int( + gpu_metrics_v_1_3->indep_throttle_status, + "gpu_metrics_v_1_3->indep_throttle_status"); + } + LOG_DEBUG(ss); +} + static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind, rsmi_gpu_metrics_t *data, uint8_t content_v) { assert(content_v != RSMI_GPU_METRICS_API_CONTENT_VER_1 && @@ -268,16 +462,28 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) { rsmi_gpu_metrics_v_1_3 smu_v_1_3; rsmi_status_t ret; + std::ostringstream ss; if (!dev->gpu_metrics_ver().structure_size) { ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, sizeof(struct metrics_table_header_t), &dev->gpu_metrics_ver()); + log_gpu_metrics(&dev->gpu_metrics_ver(), nullptr, nullptr, nullptr); if (ret != RSMI_STATUS_SUCCESS) { + ss << "Returning = " << getRSMIStatusString(ret) + << ",\ndev->gpu_metrics_ver().structure_size = " + << print_unsigned_int(dev->gpu_metrics_ver().structure_size) + << ", could not read common header"; + LOG_ERROR(ss); return ret; } } // only supports gpu_metrics_v1_x version if (dev->gpu_metrics_ver().format_revision != 1) { + ss << "Returning = " << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) + << ",\ndev->gpu_metrics_ver().format_revision = " + << print_unsigned_int(dev->gpu_metrics_ver().format_revision) + << " was not equal to 1"; + LOG_ERROR(ss); 
return RSMI_STATUS_NOT_SUPPORTED; } @@ -289,19 +495,31 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) { RSMI_GPU_METRICS_API_CONTENT_VER_1) { ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, sizeof(rsmi_gpu_metrics_t), smu); + ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_1"; + LOG_DEBUG(ss); + log_gpu_metrics(nullptr, nullptr, nullptr, smu); } else if (dev->gpu_metrics_ver().content_revision == RSMI_GPU_METRICS_API_CONTENT_VER_2) { ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, sizeof(rsmi_gpu_metrics_v_1_2), &smu_v_1_2); map_gpu_metrics_1_2_to_rsmi_gpu_metrics_t(&smu_v_1_2, smu); + ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_2"; + LOG_DEBUG(ss); + log_gpu_metrics(nullptr, &smu_v_1_2, nullptr, smu); } else if (dev->gpu_metrics_ver().content_revision == RSMI_GPU_METRICS_API_CONTENT_VER_3) { ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, sizeof(rsmi_gpu_metrics_v_1_3), &smu_v_1_3); map_gpu_metrics_1_3_to_rsmi_gpu_metrics_t(&smu_v_1_3, smu); + ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_3"; + LOG_DEBUG(ss); + log_gpu_metrics(nullptr, nullptr, &smu_v_1_3, smu); } else { ret = GetGPUMetricsFormat1(dv_ind, smu, dev->gpu_metrics_ver().content_revision); + ss << __PRETTY_FUNCTION__ << " | GetGPUMetricsFormat1"; + LOG_DEBUG(ss); + log_gpu_metrics(nullptr, nullptr, nullptr, smu); } if (ret != RSMI_STATUS_SUCCESS) { diff --git a/projects/rocm-smi-lib/src/rocm_smi_logger.cc b/projects/rocm-smi-lib/src/rocm_smi_logger.cc index c900c613c1..0600654ef3 100644 --- a/projects/rocm-smi-lib/src/rocm_smi_logger.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_logger.cc @@ -177,6 +177,9 @@ void Logger::error(const char* text) throw() { logIntoFile(data); } else if (m_LogType == CONSOLE) { logOnConsole(data); + } else if (m_LogType == BOTH_FILE_AND_CONSOLE) { + logOnConsole(data); + logIntoFile(data); } } @@ -208,6 +211,9 @@ void Logger::alarm(const char* text) throw() { 
logIntoFile(data); } else if (m_LogType == CONSOLE) { logOnConsole(data); + } else if (m_LogType == BOTH_FILE_AND_CONSOLE) { + logOnConsole(data); + logIntoFile(data); } } @@ -239,6 +245,9 @@ void Logger::always(const char* text) throw() { logIntoFile(data); } else if (m_LogType == CONSOLE) { logOnConsole(data); + } else if (m_LogType == BOTH_FILE_AND_CONSOLE) { + logOnConsole(data); + logIntoFile(data); } } @@ -303,6 +312,10 @@ void Logger::info(const char* text) throw() { logIntoFile(data); } else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_INFO)) { logOnConsole(data); + } else if ((m_LogType == BOTH_FILE_AND_CONSOLE) + && (m_LogLevel >= LOG_LEVEL_INFO)) { + logOnConsole(data); + logIntoFile(data); } } @@ -333,6 +346,10 @@ void Logger::trace(const char* text) throw() { logIntoFile(data); } else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_TRACE)) { logOnConsole(data); + } else if ((m_LogType == BOTH_FILE_AND_CONSOLE) + && (m_LogLevel >= LOG_LEVEL_TRACE)) { + logOnConsole(data); + logIntoFile(data); } } @@ -363,6 +380,10 @@ void Logger::debug(const char* text) throw() { logIntoFile(data); } else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_DEBUG)) { logOnConsole(data); + } else if ((m_LogType == BOTH_FILE_AND_CONSOLE) + && (m_LogLevel >= LOG_LEVEL_DEBUG)) { + logOnConsole(data); + logIntoFile(data); } } @@ -424,6 +445,9 @@ std::string Logger::getLogSettings() { case CONSOLE: logSettings += "LogType = CONSOLE"; break; + case BOTH_FILE_AND_CONSOLE: + logSettings += "LogType = BOTH_FILE_AND_CONSOLE"; + break; default: logSettings += "LogType = "; } @@ -471,7 +495,26 @@ void Logger::initialize_resources() { } m_File.open(logFileName.c_str(), std::ios::out | std::ios::app); m_LogLevel = LOG_LEVEL_TRACE; - m_LogType = FILE_LOG; + // RSMI_LOGGING = 1, output to logs only + // RSMI_LOGGING = 2, output to console only + // RSMI_LOGGING = 3, output to logs and console + switch (amd::smi::RocmSMI::getInstance().getLogSetting()) { + case 0: + 
m_LogType = NO_LOG; + break; + case 1: + m_LogType = FILE_LOG; + break; + case 2: + m_LogType = CONSOLE; + break; + case 3: + m_LogType = BOTH_FILE_AND_CONSOLE; + break; + default: + m_LogType = NO_LOG; + break; + } if (!m_File.is_open()) { std::cout << "WARNING: Issue opening log file (" << logFileName << ") to write." << std::endl; diff --git a/projects/rocm-smi-lib/src/rocm_smi_main.cc b/projects/rocm-smi-lib/src/rocm_smi_main.cc index 92ffe5af4f..0ba6d7c50e 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_main.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_main.cc @@ -458,17 +458,21 @@ static uint32_t GetEnvVarUInteger(const char *ev_str) { // provides a way to get env variable detail in both debug & release // helps enable full logging -static bool getRSMIEnvVar_LoggingEnabled(const char *ev_str) { - bool isLoggingEnabled = false; +// RSMI_LOGGING = 1, output to logs only +// RSMI_LOGGING = 2, output to console only +// RSMI_LOGGING = 3, output to logs and console +static uint32_t getRSMIEnvVar_LoggingEnabled(const char *ev_str) { + uint32_t ret = 0; ev_str = getenv(ev_str); - if (ev_str != nullptr) { - isLoggingEnabled = true; + int ev_ret = atoi(ev_str); + ret = static_cast(ev_ret); } - return isLoggingEnabled; + return ret; } -static std::unordered_set GetEnvVarUIntegerSets(const char *ev_str) { +static inline std::unordered_set GetEnvVarUIntegerSets( + const char *ev_str) { std::unordered_set returnSet; #ifndef DEBUG (void)ev_str; @@ -519,7 +523,16 @@ const RocmSMI_env_vars& RocmSMI::getEnv(void) { } bool RocmSMI::isLoggingOn(void) { + bool isLoggingOn = false; GetEnvVariables(); + if (this->env_vars_.logging_on > 0 + && this->env_vars_.logging_on <= 3) { + isLoggingOn = true; + } + return isLoggingOn; +} + +uint32_t RocmSMI::getLogSetting() { return this->env_vars_.logging_on; } @@ -544,7 +557,9 @@ void RocmSMI::printEnvVarInfo(void) { << ((env_vars_.debug_inf_loop == 0) ? 
"" : std::to_string(env_vars_.debug_inf_loop)) << std::endl; - bool isLoggingOn = (env_vars_.logging_on) ? true : false; + std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = " + << getLogSetting() << std::endl; + bool isLoggingOn = RocmSMI::isLoggingOn() ? true : false; std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = " << (isLoggingOn ? "true" : "false") << std::endl; std::cout << __PRETTY_FUNCTION__ << " | env_vars_.enum_overrides = {"; diff --git a/projects/rocm-smi-lib/src/rocm_smi_utils.cc b/projects/rocm-smi-lib/src/rocm_smi_utils.cc index 2cbb936454..3c997ccf9d 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_utils.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_utils.cc @@ -57,6 +57,8 @@ #include #include #include +#include +#include #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_utils.h" @@ -103,7 +105,7 @@ bool FileExists(char const *filename) { return (stat(filename, &buf) == 0); } -static void debugFilesDiscovered(std::vector files) { +static inline void debugFilesDiscovered(std::vector files) { std::ostringstream ss; int numberOfFilesFound = static_cast(files.size()); ss << "fileName.size() = " << numberOfFilesFound @@ -435,9 +437,13 @@ rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName, } chmod(fileName, S_IRUSR|S_IRGRP|S_IROTH); - write(fd, storageData.c_str(), storageData.size()); + ssize_t rc_write = write(fd, storageData.c_str(), storageData.size()); close(fd); - return RSMI_STATUS_SUCCESS; + if (rc_write == -1) { + return RSMI_STATUS_FILE_ERROR; + } else { + return RSMI_STATUS_SUCCESS; + } } std::vector getListOfAppTmpFiles() { @@ -573,14 +579,20 @@ std::string getRSMIStatusString(rsmi_status_t ret) { // string domainName = domain name of the the system's node on the network // string os_distribution = pretty name of os distribution // (typically found in /etc/*-release file) +// string endianness = system's endianness. +// Expressed as big endian or little endian. 
+// Big Endian (BE), multi-bit symbols encoded as big endian (MSB first) +// Little Endian (LE), multi-bit symbols encoded as little endian (LSB first) std::tuple getSystemDetails(void) { + std::string, std::string, std::string, std::string> + getSystemDetails(void) { struct utsname buf; bool errorDetected = false; std::string temp_data; std::string sysname, nodename, release, version, machine; std::string domainName = ""; std::string os_distribution = ""; + std::string endianness = ""; if (uname(&buf) < 0) { errorDetected = true; @@ -608,8 +620,16 @@ std::tuple 64) bytesPerLine = 16; + + size_t i; + unsigned char buff[bytesPerLine + 1]; + const unsigned char *pc // ptr to data (char, 1 byte sized data) + = (const unsigned char *) addr; + + // Output description if given. + // if (desc != NULL) printf("%s:\n", desc); + if (desc != NULL) ss << "\n" << desc << "\n"; + + // Length checks. + if (len == 0) { + // printf(" ZERO LENGTH\n"); + ss << " ZERO LENGTH\n"; + LOG_ERROR(ss); + return; + } + std::string endianness = ""; + if (isSystemBigEndian()) { + endianness = "** System is Big Endian, multi-bit symbols encoded as" + " big endian (MSB first) **"; + } else { + endianness = "** System is Little Endian, multi-bit symbols encoded as" + " little endian (LSB first) **"; + } + ss << "\t" << endianness << "\n"; + + // Process every byte in the data. + for (i = 0; i < len; i++) { + // Multiple of bytesPerLine means new or first line (with line offset). + if ((i % bytesPerLine) == 0) { + // Only print previous-line ASCII buffer for lines beyond first. + // if (i != 0) printf(" %s\n", buff); + if (i != 0) ss << " " << buff << "\n"; + // Output the offset of current line. + // printf(" %08lx ", i); + ss << " " << std::setw(8) << std::setfill('0') << std::hex << i << " "; + } + + // Now the hex code for the specific character. 
+ // printf(" %02x", pc[i]); + + ss << " " << std::setw(2) << std::setfill('0') << std::hex + << static_cast(pc[i]); + + // And buffer a printable ASCII character for later. + // x20 = 32 || x7e = 126 (ascii table range) + if ((pc[i] < 0x20) || (pc[i] > 0x7e)) { // isprint() may be better. + buff[i % bytesPerLine] = '.'; + } else { + buff[i % bytesPerLine] = pc[i]; + } + buff[(i % bytesPerLine) + 1] = '\0'; + } + + // Pad out last line if not exactly bytesPerLine characters. + while ((i % bytesPerLine) != 0) { + // printf(" "); + ss << " "; + i++; + } + + // And print the final ASCII buffer. + // printf(" %s\n", buff); + ss << " " << buff << "\n"; + LOG_DEBUG(ss); +} + +bool isSystemBigEndian() { + int n = 1; + bool isBigEndian = true; + if (*(char *)&n == 1) { + isBigEndian = false; + } + return isBigEndian; +} + } // namespace smi } // namespace amd