[lib] Enhance Logger: gpu_metrics + enable console out
* Updates:
- Env variable RSMI_LOGGING=0, unset, or any value other than 1-3
-> all logging off
- Env variable RSMI_LOGGING=1 -> logs only
- Env variable RSMI_LOGGING=2 -> console only
- Env variable RSMI_LOGGING=3 -> both logs + console
- Metrics output includes hexdump of current file
and decoded metrics (functions: logHexDump
and log_gpu_metrics)
- Gathered system info now includes the system's
detected endianness (little or big endian), which is
helpful when viewing the decoded hexdump or any
other binary translation
- Added templates for printing integers as hex
(print_int_as_hex), unsigned integers
(print_unsigned_int), and both the hex and unsigned
int forms with an optional heading
(print_unsigned_hex_and_int)
- Fixed some build warnings/errors:
the strncpy calls that copy the sku and board names
truncate intentionally (this is expected and needed),
so the truncation warning is suppressed for them; and
when a temp file write fails we now properly return RSMI_STATUS_FILE_ERROR
- Fixed logrotate not initializing properly
on RHEL 8.8/9.x
Change-Id: Ifa0f0218c9cafd0a8cd6aa8e7f94d61e9107200f
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
[ROCm/rocm_smi_lib commit: 9c7eed7edc]
Этот коммит содержится в:
@@ -62,12 +62,11 @@ EOF
|
||||
# confirm logrotate file exists in daily
|
||||
if [ -f /etc/cron.daily/logrotate ]; then
|
||||
# move logrotate daily to hourly
|
||||
if [ -f /etc/cron.hourly/logrotate ]; then
|
||||
if [ -d /etc/cron.hourly ]; then
|
||||
sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate
|
||||
else
|
||||
echo "[WARNING] Could find and configure hourly cron for $packageName's"\
|
||||
" logrotate. $packageName logs (when turned on) will not rotate properly."
|
||||
return
|
||||
fi
|
||||
else
|
||||
# confirm that it's already been moved to hourly
|
||||
@@ -77,6 +76,7 @@ EOF
|
||||
"$packageName logs (when turned on) may not rotate properly."
|
||||
fi
|
||||
fi
|
||||
return #done configuring for non-systemd timers
|
||||
else
|
||||
# Configure systemd timers - the typical setup for modern Linux logrotation setups
|
||||
if [ -f /lib/systemd/system/logrotate.timer ]; then
|
||||
@@ -102,6 +102,7 @@ EOF
|
||||
echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\
|
||||
"$packageName logs (when turned on) will not rotate properly."
|
||||
fi
|
||||
return #done configuring for systemd timers
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
@@ -62,12 +62,11 @@ EOF
|
||||
# confirm logrotate file exists in daily
|
||||
if [ -f /etc/cron.daily/logrotate ]; then
|
||||
# move logrotate daily to hourly
|
||||
if [ -f /etc/cron.hourly/logrotate ]; then
|
||||
if [ -d /etc/cron.hourly ]; then
|
||||
sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate
|
||||
else
|
||||
echo "[WARNING] Could find and configure hourly cron for $packageName's"\
|
||||
" logrotate. $packageName logs (when turned on) will not rotate properly."
|
||||
return
|
||||
fi
|
||||
else
|
||||
# confirm that it's already been moved to hourly
|
||||
@@ -77,6 +76,7 @@ EOF
|
||||
"$packageName logs (when turned on) may not rotate properly."
|
||||
fi
|
||||
fi
|
||||
return #done configuring for non-systemd timers
|
||||
else
|
||||
# Configure systemd timers - the typical setup for modern Linux logrotation setups
|
||||
if [ -f /lib/systemd/system/logrotate.timer ]; then
|
||||
@@ -102,6 +102,7 @@ EOF
|
||||
echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\
|
||||
"$packageName logs (when turned on) will not rotate properly."
|
||||
fi
|
||||
return #done configuring for systemd timers
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
@@ -100,6 +100,7 @@ typedef enum LOG_TYPE {
|
||||
NO_LOG = 1,
|
||||
CONSOLE = 2,
|
||||
FILE_LOG = 3,
|
||||
BOTH_FILE_AND_CONSOLE = 4
|
||||
} LogType;
|
||||
|
||||
class Logger {
|
||||
|
||||
@@ -115,6 +115,7 @@ class RocmSMI {
|
||||
const RocmSMI_env_vars& getEnv(void);
|
||||
void printEnvVarInfo(void);
|
||||
bool isLoggingOn(void);
|
||||
uint32_t getLogSetting(void);
|
||||
static const std::map<amd::smi::DevInfoTypes, std::string> devInfoTypesStrings;
|
||||
|
||||
private:
|
||||
|
||||
@@ -48,6 +48,9 @@
|
||||
#include <string>
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
#include <iomanip>
|
||||
#include <type_traits>
|
||||
|
||||
#include "rocm_smi/rocm_smi_device.h"
|
||||
|
||||
@@ -94,8 +97,52 @@ GetDevBinaryBlob(amd::smi::DevInfoTypes type,
|
||||
rsmi_status_t ErrnoToRsmiStatus(int err);
|
||||
std::string getRSMIStatusString(rsmi_status_t ret);
|
||||
std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
std::string, std::string, std::string> getSystemDetails(void);
|
||||
std::string, std::string, std::string, std::string>
|
||||
getSystemDetails(void);
|
||||
void logSystemDetails(void);
|
||||
void logHexDump(const char *desc, const void *addr, const size_t len,
|
||||
size_t perLine);
|
||||
bool isSystemBigEndian();
|
||||
// Formats an integral value as zero-padded hexadecimal text.
//
//   i:               value to format.
//   showHexNotation: when true (default) the result is prefixed with "0x".
//
// At least sizeof(T) * 2 hex digits are produced. Negative values are shown
// as their bit pattern at the width of T (e.g. int16_t(-1) -> "0xffff"),
// never sign-extended to a wider type.
template <typename T>
std::string print_int_as_hex(T i, bool showHexNotation=true) {
  // Widen to 64 bits, then mask away everything above sizeof(T) bytes so a
  // negative value does not drag its sign-extension bits into the output.
  const unsigned wideBytes = sizeof(unsigned long long);
  const unsigned surplusBits = (sizeof(T) < wideBytes)
      ? static_cast<unsigned>((wideBytes - sizeof(T)) * 8)
      : 0U;
  const unsigned long long bits =
      static_cast<unsigned long long>(i) & (~0ULL >> surplusBits);

  std::stringstream ss;
  if (showHexNotation) {
    ss << "0x";
  }
  // setw only applies to the next insertion, i.e. the value itself.
  ss << std::setfill('0') << std::setw(static_cast<int>(sizeof(T) * 2))
     << std::hex << bits << std::dec;
  return ss.str();
}
|
||||
|
||||
// Renders an integral value as decimal text after converting it to
// unsigned long long. A negative input therefore prints as the large
// unsigned value that conversion yields.
template <typename T>
std::string print_unsigned_int(T i) {
  const unsigned long long widened =
      static_cast<unsigned long long>(i | 0);
  std::ostringstream text;
  text << widened;
  return text.str();
}
|
||||
|
||||
// Builds a one-line summary of an integral value: its hex form, its unsigned
// decimal form and the byte size of its type.
//
//   i:       value to describe.
//   heading: optional label; when non-empty the summary is preceded by a
//            newline and "<heading> = ".
template <typename T>
std::string print_unsigned_hex_and_int(T i, std::string heading="") {
  const std::string hexText = print_int_as_hex(i);
  const std::string decimalText = print_unsigned_int(i);

  std::ostringstream summary;
  if (!heading.empty()) {
    summary << "\n" << heading << " = ";
  }
  summary << "Hex (MSB): " << hexText << ", ";
  summary << "Unsigned int: " << decimalText << ", ";
  summary << "Byte Size: " << sizeof(T);
  return summary.str();
}
|
||||
|
||||
struct pthread_wrap {
|
||||
public:
|
||||
|
||||
@@ -166,8 +166,11 @@ TRY
|
||||
rsmi_dev_name_get(dev_inx, dev->device_name, DEVICE_NAME_LEN);
|
||||
rsmi_dev_vbios_version_get(dev_inx, buf, buf_size);
|
||||
if (std::strlen(buf) > 0) {
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wstringop-truncation"
|
||||
std::strncpy(dev->sku_name, &buf[4], 6);
|
||||
std::strncpy(dev->board_name, buf, 12);
|
||||
#pragma GCC diagnostic pop
|
||||
}
|
||||
rsmi_dev_serial_number_get(dev_inx, dev->board_serial_number,
|
||||
BOARD_SERIAL_NUM_LEN);
|
||||
|
||||
@@ -673,8 +673,8 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block,
|
||||
|
||||
default:
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
|
||||
<< ", default case -> reporting RSMI_STATUS_NOT_SUPPORTED"
|
||||
<< amd::smi::getRSMIStatusString(ret);
|
||||
<< ", default case -> reporting "
|
||||
<< amd::smi::getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED);
|
||||
LOG_ERROR(ss);
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
@@ -826,7 +826,12 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
|
||||
}
|
||||
ss << "Successfully read DevInfoBinary for DevInfoType ("
|
||||
<< RocmSMI::devInfoTypesStrings.at(type) << ") - SYSFS ("
|
||||
<< sysfs_path << "), returning binaryData = " << p_binary_data;
|
||||
<< sysfs_path << "), returning binaryData = " << p_binary_data
|
||||
<< "; byte_size = " << std::dec << static_cast<int>(b_size);
|
||||
|
||||
std::string metricDescription = "AMD SMI GPU METRICS (16-byte width), "
|
||||
+ sysfs_path;
|
||||
logHexDump(metricDescription.c_str(), p_binary_data, b_size, 16);
|
||||
LOG_INFO(ss);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -60,6 +60,10 @@
|
||||
#include "rocm_smi/rocm_smi_monitor.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
#include "rocm_smi/rocm_smi_exception.h"
|
||||
#include "rocm_smi/rocm_smi_logger.h"
|
||||
|
||||
using namespace ROCmLogging;
|
||||
using namespace amd::smi;
|
||||
|
||||
#define TRY try {
|
||||
#define CATCH } catch (...) {return amd::smi::handleException();}
|
||||
@@ -139,6 +143,196 @@ typedef struct {
|
||||
|
||||
} rsmi_gpu_metrics_v_1_3;
|
||||
|
||||
|
||||
// log current gpu_metrics file content read
|
||||
// any metrics value can be a nullptr
|
||||
void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header,
|
||||
const rsmi_gpu_metrics_v_1_2 *rsmi_gpu_metrics_v_1_2,
|
||||
const rsmi_gpu_metrics_v_1_3 *gpu_metrics_v_1_3,
|
||||
const rsmi_gpu_metrics_t *rsmi_gpu_metrics) {
|
||||
if (RocmSMI::getInstance().isLoggingOn() == false) {
|
||||
return;
|
||||
}
|
||||
std::ostringstream ss;
|
||||
if (gpu_metrics_table_header != nullptr) {
|
||||
ss
|
||||
/* Common Header */
|
||||
<< print_unsigned_hex_and_int(
|
||||
gpu_metrics_table_header->structure_size,
|
||||
"gpu_metrics_table_header->structure_size")
|
||||
<< print_unsigned_hex_and_int(
|
||||
gpu_metrics_table_header->format_revision,
|
||||
"gpu_metrics_table_header->format_revision")
|
||||
<< print_unsigned_hex_and_int(
|
||||
gpu_metrics_table_header->content_revision,
|
||||
"gpu_metrics_table_header->content_revision");
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
if (rsmi_gpu_metrics == nullptr) {
|
||||
return;
|
||||
} else {
|
||||
// do nothing - continue
|
||||
}
|
||||
ss
|
||||
/* Common Header */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->common_header.structure_size,
|
||||
"rsmi_gpu_metrics->common_header.structure_size")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->common_header.format_revision,
|
||||
"rsmi_gpu_metrics->common_header.format_revision")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->common_header.content_revision,
|
||||
"rsmi_gpu_metrics->common_header.content_revision")
|
||||
/* Temperature */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->temperature_edge,
|
||||
"rsmi_gpu_metrics->temperature_edge")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->temperature_hotspot,
|
||||
"rsmi_gpu_metrics->temperature_hotspot")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->temperature_mem,
|
||||
"rsmi_gpu_metrics->temperature_mem")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->temperature_vrgfx,
|
||||
"rsmi_gpu_metrics->temperature_vrgfx")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->temperature_vrsoc,
|
||||
"rsmi_gpu_metrics->temperature_vrsoc")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->temperature_vrmem,
|
||||
"rsmi_gpu_metrics->temperature_vrmem")
|
||||
/* Utilization */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_gfx_activity,
|
||||
"rsmi_gpu_metrics->average_gfx_activity")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_umc_activity,
|
||||
"rsmi_gpu_metrics->average_umc_activity")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_mm_activity,
|
||||
"rsmi_gpu_metrics->average_mm_activity")
|
||||
/* Power/Energy */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_socket_power,
|
||||
"rsmi_gpu_metrics->average_socket_power")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->energy_accumulator,
|
||||
"rsmi_gpu_metrics->energy_accumulator")
|
||||
/* Driver attached timestamp (in ns) */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->system_clock_counter,
|
||||
"rsmi_gpu_metrics->system_clock_counter")
|
||||
/* Average clocks */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_gfxclk_frequency,
|
||||
"rsmi_gpu_metrics->average_gfxclk_frequency")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_socclk_frequency,
|
||||
"rsmi_gpu_metrics->average_socclk_frequency")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_uclk_frequency,
|
||||
"rsmi_gpu_metrics->average_uclk_frequency")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_vclk0_frequency,
|
||||
"rsmi_gpu_metrics->average_vclk0_frequency")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_dclk0_frequency,
|
||||
"rsmi_gpu_metrics->average_dclk0_frequency")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_vclk1_frequency,
|
||||
"rsmi_gpu_metrics->average_vclk1_frequency")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_dclk1_frequency,
|
||||
"rsmi_gpu_metrics->average_dclk1_frequency")
|
||||
/* Current clocks */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->current_gfxclk,
|
||||
"rsmi_gpu_metrics->current_gfxclk")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->current_socclk,
|
||||
"rsmi_gpu_metrics->current_socclk")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->current_uclk,
|
||||
"rsmi_gpu_metrics->current_uclk")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->current_vclk0,
|
||||
"rsmi_gpu_metrics->current_vclk0")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->current_dclk0,
|
||||
"rsmi_gpu_metrics->current_dclk0")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->current_vclk1,
|
||||
"rsmi_gpu_metrics->current_vclk1")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->current_dclk1,
|
||||
"rsmi_gpu_metrics->current_dclk1")
|
||||
/* Throttle status */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->throttle_status,
|
||||
"rsmi_gpu_metrics->throttle_status")
|
||||
/* Fans */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->current_fan_speed,
|
||||
"rsmi_gpu_metrics->current_fan_speed")
|
||||
/* Link width/speed */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->pcie_link_width,
|
||||
"rsmi_gpu_metrics->pcie_link_width")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->pcie_link_speed,
|
||||
"rsmi_gpu_metrics->pcie_link_speed")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->padding,
|
||||
"rsmi_gpu_metrics->padding")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->gfx_activity_acc,
|
||||
"rsmi_gpu_metrics->gfx_activity_acc")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->mem_actvity_acc,
|
||||
"rsmi_gpu_metrics->mem_actvity_acc");
|
||||
for (int i=0; i < RSMI_NUM_HBM_INSTANCES; i++) {
|
||||
ss << print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->temperature_hbm[i],
|
||||
"rsmi_gpu_metrics->temperature_hbm[" + std::to_string(i) + "]");
|
||||
}
|
||||
|
||||
if (rsmi_gpu_metrics_v_1_2 != nullptr) {
|
||||
/* PMFW attached timestamp (10ns resolution) */
|
||||
ss
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics_v_1_2->firmware_timestamp,
|
||||
"rsmi_gpu_metrics_v_1_2->firmware_timestamp");
|
||||
}
|
||||
|
||||
if (gpu_metrics_v_1_3 != nullptr) {
|
||||
/* PMFW attached timestamp (10ns resolution) */
|
||||
ss
|
||||
<< print_unsigned_hex_and_int(
|
||||
gpu_metrics_v_1_3->firmware_timestamp,
|
||||
"gpu_metrics_v_1_3->firmware_timestamp")
|
||||
/* Voltage (mV) */
|
||||
<< print_unsigned_hex_and_int(
|
||||
gpu_metrics_v_1_3->voltage_soc,
|
||||
"gpu_metrics_v_1_3->voltage_soc")
|
||||
<< print_unsigned_hex_and_int(
|
||||
gpu_metrics_v_1_3->voltage_gfx,
|
||||
"gpu_metrics_v_1_3->voltage_gfx")
|
||||
<< print_unsigned_hex_and_int(
|
||||
gpu_metrics_v_1_3->voltage_mem,
|
||||
"gpu_metrics_v_1_3->voltage_mem")
|
||||
<< print_unsigned_hex_and_int(
|
||||
gpu_metrics_v_1_3->padding1,
|
||||
"gpu_metrics_v_1_3->padding1")
|
||||
/* Throttle status (ASIC independent) */
|
||||
<< print_unsigned_hex_and_int(
|
||||
gpu_metrics_v_1_3->indep_throttle_status,
|
||||
"gpu_metrics_v_1_3->indep_throttle_status");
|
||||
}
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
|
||||
static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind,
|
||||
rsmi_gpu_metrics_t *data, uint8_t content_v) {
|
||||
assert(content_v != RSMI_GPU_METRICS_API_CONTENT_VER_1 &&
|
||||
@@ -268,16 +462,28 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) {
|
||||
rsmi_gpu_metrics_v_1_3 smu_v_1_3;
|
||||
rsmi_status_t ret;
|
||||
|
||||
std::ostringstream ss;
|
||||
if (!dev->gpu_metrics_ver().structure_size) {
|
||||
ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
|
||||
sizeof(struct metrics_table_header_t), &dev->gpu_metrics_ver());
|
||||
log_gpu_metrics(&dev->gpu_metrics_ver(), nullptr, nullptr, nullptr);
|
||||
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
ss << "Returning = " << getRSMIStatusString(ret)
|
||||
<< ",\ndev->gpu_metrics_ver().structure_size = "
|
||||
<< print_unsigned_int(dev->gpu_metrics_ver().structure_size)
|
||||
<< ", could not read common header";
|
||||
LOG_ERROR(ss);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
// only supports gpu_metrics_v1_x version
|
||||
if (dev->gpu_metrics_ver().format_revision != 1) {
|
||||
ss << "Returning = " << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED)
|
||||
<< ",\ndev->gpu_metrics_ver().format_revision = "
|
||||
<< print_unsigned_int(dev->gpu_metrics_ver().format_revision)
|
||||
<< " was not equal to 1";
|
||||
LOG_ERROR(ss);
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
@@ -289,19 +495,31 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) {
|
||||
RSMI_GPU_METRICS_API_CONTENT_VER_1) {
|
||||
ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
|
||||
sizeof(rsmi_gpu_metrics_t), smu);
|
||||
ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_1";
|
||||
LOG_DEBUG(ss);
|
||||
log_gpu_metrics(nullptr, nullptr, nullptr, smu);
|
||||
} else if (dev->gpu_metrics_ver().content_revision ==
|
||||
RSMI_GPU_METRICS_API_CONTENT_VER_2) {
|
||||
ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
|
||||
sizeof(rsmi_gpu_metrics_v_1_2), &smu_v_1_2);
|
||||
map_gpu_metrics_1_2_to_rsmi_gpu_metrics_t(&smu_v_1_2, smu);
|
||||
ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_2";
|
||||
LOG_DEBUG(ss);
|
||||
log_gpu_metrics(nullptr, &smu_v_1_2, nullptr, smu);
|
||||
} else if (dev->gpu_metrics_ver().content_revision ==
|
||||
RSMI_GPU_METRICS_API_CONTENT_VER_3) {
|
||||
ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
|
||||
sizeof(rsmi_gpu_metrics_v_1_3), &smu_v_1_3);
|
||||
map_gpu_metrics_1_3_to_rsmi_gpu_metrics_t(&smu_v_1_3, smu);
|
||||
ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_3";
|
||||
LOG_DEBUG(ss);
|
||||
log_gpu_metrics(nullptr, nullptr, &smu_v_1_3, smu);
|
||||
} else {
|
||||
ret = GetGPUMetricsFormat1(dv_ind, smu,
|
||||
dev->gpu_metrics_ver().content_revision);
|
||||
ss << __PRETTY_FUNCTION__ << " | GetGPUMetricsFormat1";
|
||||
LOG_DEBUG(ss);
|
||||
log_gpu_metrics(nullptr, nullptr, nullptr, smu);
|
||||
}
|
||||
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
|
||||
@@ -177,6 +177,9 @@ void Logger::error(const char* text) throw() {
|
||||
logIntoFile(data);
|
||||
} else if (m_LogType == CONSOLE) {
|
||||
logOnConsole(data);
|
||||
} else if (m_LogType == BOTH_FILE_AND_CONSOLE) {
|
||||
logOnConsole(data);
|
||||
logIntoFile(data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -208,6 +211,9 @@ void Logger::alarm(const char* text) throw() {
|
||||
logIntoFile(data);
|
||||
} else if (m_LogType == CONSOLE) {
|
||||
logOnConsole(data);
|
||||
} else if (m_LogType == BOTH_FILE_AND_CONSOLE) {
|
||||
logOnConsole(data);
|
||||
logIntoFile(data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -239,6 +245,9 @@ void Logger::always(const char* text) throw() {
|
||||
logIntoFile(data);
|
||||
} else if (m_LogType == CONSOLE) {
|
||||
logOnConsole(data);
|
||||
} else if (m_LogType == BOTH_FILE_AND_CONSOLE) {
|
||||
logOnConsole(data);
|
||||
logIntoFile(data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -303,6 +312,10 @@ void Logger::info(const char* text) throw() {
|
||||
logIntoFile(data);
|
||||
} else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_INFO)) {
|
||||
logOnConsole(data);
|
||||
} else if ((m_LogType == BOTH_FILE_AND_CONSOLE)
|
||||
&& (m_LogLevel >= LOG_LEVEL_INFO)) {
|
||||
logOnConsole(data);
|
||||
logIntoFile(data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -333,6 +346,10 @@ void Logger::trace(const char* text) throw() {
|
||||
logIntoFile(data);
|
||||
} else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_TRACE)) {
|
||||
logOnConsole(data);
|
||||
} else if ((m_LogType == BOTH_FILE_AND_CONSOLE)
|
||||
&& (m_LogLevel >= LOG_LEVEL_TRACE)) {
|
||||
logOnConsole(data);
|
||||
logIntoFile(data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -363,6 +380,10 @@ void Logger::debug(const char* text) throw() {
|
||||
logIntoFile(data);
|
||||
} else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_DEBUG)) {
|
||||
logOnConsole(data);
|
||||
} else if ((m_LogType == BOTH_FILE_AND_CONSOLE)
|
||||
&& (m_LogLevel >= LOG_LEVEL_DEBUG)) {
|
||||
logOnConsole(data);
|
||||
logIntoFile(data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -424,6 +445,9 @@ std::string Logger::getLogSettings() {
|
||||
case CONSOLE:
|
||||
logSettings += "LogType = CONSOLE";
|
||||
break;
|
||||
case BOTH_FILE_AND_CONSOLE:
|
||||
logSettings += "LogType = BOTH_FILE_AND_CONSOLE";
|
||||
break;
|
||||
default:
|
||||
logSettings += "LogType = <undefined>";
|
||||
}
|
||||
@@ -471,7 +495,26 @@ void Logger::initialize_resources() {
|
||||
}
|
||||
m_File.open(logFileName.c_str(), std::ios::out | std::ios::app);
|
||||
m_LogLevel = LOG_LEVEL_TRACE;
|
||||
m_LogType = FILE_LOG;
|
||||
// RSMI_LOGGING = 1, output to logs only
|
||||
// RSMI_LOGGING = 2, output to console only
|
||||
// RSMI_LOGGING = 3, output to logs and console
|
||||
switch (amd::smi::RocmSMI::getInstance().getLogSetting()) {
|
||||
case 0:
|
||||
m_LogType = NO_LOG;
|
||||
break;
|
||||
case 1:
|
||||
m_LogType = FILE_LOG;
|
||||
break;
|
||||
case 2:
|
||||
m_LogType = CONSOLE;
|
||||
break;
|
||||
case 3:
|
||||
m_LogType = BOTH_FILE_AND_CONSOLE;
|
||||
break;
|
||||
default:
|
||||
m_LogType = NO_LOG;
|
||||
break;
|
||||
}
|
||||
if (!m_File.is_open()) {
|
||||
std::cout << "WARNING: Issue opening log file (" << logFileName
|
||||
<< ") to write." << std::endl;
|
||||
|
||||
@@ -458,17 +458,21 @@ static uint32_t GetEnvVarUInteger(const char *ev_str) {
|
||||
|
||||
// provides a way to get env variable detail in both debug & release
|
||||
// helps enable full logging
|
||||
static bool getRSMIEnvVar_LoggingEnabled(const char *ev_str) {
|
||||
bool isLoggingEnabled = false;
|
||||
// RSMI_LOGGING = 1, output to logs only
|
||||
// RSMI_LOGGING = 2, output to console only
|
||||
// RSMI_LOGGING = 3, output to logs and console
|
||||
static uint32_t getRSMIEnvVar_LoggingEnabled(const char *ev_str) {
|
||||
uint32_t ret = 0;
|
||||
ev_str = getenv(ev_str);
|
||||
|
||||
if (ev_str != nullptr) {
|
||||
isLoggingEnabled = true;
|
||||
int ev_ret = atoi(ev_str);
|
||||
ret = static_cast<uint32_t>(ev_ret);
|
||||
}
|
||||
return isLoggingEnabled;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static std::unordered_set<uint32_t> GetEnvVarUIntegerSets(const char *ev_str) {
|
||||
static inline std::unordered_set<uint32_t> GetEnvVarUIntegerSets(
|
||||
const char *ev_str) {
|
||||
std::unordered_set<uint32_t> returnSet;
|
||||
#ifndef DEBUG
|
||||
(void)ev_str;
|
||||
@@ -519,7 +523,16 @@ const RocmSMI_env_vars& RocmSMI::getEnv(void) {
|
||||
}
|
||||
|
||||
bool RocmSMI::isLoggingOn(void) {
|
||||
bool isLoggingOn = false;
|
||||
GetEnvVariables();
|
||||
if (this->env_vars_.logging_on > 0
|
||||
&& this->env_vars_.logging_on <= 3) {
|
||||
isLoggingOn = true;
|
||||
}
|
||||
return isLoggingOn;
|
||||
}
|
||||
|
||||
uint32_t RocmSMI::getLogSetting() {
|
||||
return this->env_vars_.logging_on;
|
||||
}
|
||||
|
||||
@@ -544,7 +557,9 @@ void RocmSMI::printEnvVarInfo(void) {
|
||||
<< ((env_vars_.debug_inf_loop == 0) ? "<undefined>"
|
||||
: std::to_string(env_vars_.debug_inf_loop))
|
||||
<< std::endl;
|
||||
bool isLoggingOn = (env_vars_.logging_on) ? true : false;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = "
|
||||
<< getLogSetting() << std::endl;
|
||||
bool isLoggingOn = RocmSMI::isLoggingOn() ? true : false;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = "
|
||||
<< (isLoggingOn ? "true" : "false") << std::endl;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.enum_overrides = {";
|
||||
|
||||
@@ -57,6 +57,8 @@
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <regex>
|
||||
#include <iomanip>
|
||||
#include <type_traits>
|
||||
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
@@ -103,7 +105,7 @@ bool FileExists(char const *filename) {
|
||||
return (stat(filename, &buf) == 0);
|
||||
}
|
||||
|
||||
static void debugFilesDiscovered(std::vector<std::string> files) {
|
||||
static inline void debugFilesDiscovered(std::vector<std::string> files) {
|
||||
std::ostringstream ss;
|
||||
int numberOfFilesFound = static_cast<int>(files.size());
|
||||
ss << "fileName.size() = " << numberOfFilesFound
|
||||
@@ -435,9 +437,13 @@ rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName,
|
||||
}
|
||||
|
||||
chmod(fileName, S_IRUSR|S_IRGRP|S_IROTH);
|
||||
write(fd, storageData.c_str(), storageData.size());
|
||||
ssize_t rc_write = write(fd, storageData.c_str(), storageData.size());
|
||||
close(fd);
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
if (rc_write == -1) {
|
||||
return RSMI_STATUS_FILE_ERROR;
|
||||
} else {
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> getListOfAppTmpFiles() {
|
||||
@@ -573,14 +579,20 @@ std::string getRSMIStatusString(rsmi_status_t ret) {
|
||||
// string domainName = domain name of the the system's node on the network
|
||||
// string os_distribution = pretty name of os distribution
|
||||
// (typically found in /etc/*-release file)
|
||||
// string endianness = system's endianness.
|
||||
// Expressed as big endian or little endian.
|
||||
// Big Endian (BE), multi-bit symbols encoded as big endian (MSB first)
|
||||
// Little Endian (LE), multi-bit symbols encoded as little endian (LSB first)
|
||||
std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
std::string, std::string, std::string> getSystemDetails(void) {
|
||||
std::string, std::string, std::string, std::string>
|
||||
getSystemDetails(void) {
|
||||
struct utsname buf;
|
||||
bool errorDetected = false;
|
||||
std::string temp_data;
|
||||
std::string sysname, nodename, release, version, machine;
|
||||
std::string domainName = "<undefined>";
|
||||
std::string os_distribution = "<undefined>";
|
||||
std::string endianness = "<undefined>";
|
||||
|
||||
if (uname(&buf) < 0) {
|
||||
errorDetected = true;
|
||||
@@ -608,8 +620,16 @@ std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
}
|
||||
}
|
||||
}
|
||||
if (isSystemBigEndian()) {
|
||||
endianness = "Big Endian, multi-bit symbols encoded as"
|
||||
" big endian (MSB first)";
|
||||
} else {
|
||||
endianness = "Little Endian, multi-bit symbols encoded as"
|
||||
" little endian (LSB first)";
|
||||
}
|
||||
return std::make_tuple(errorDetected, sysname, nodename, release,
|
||||
version, machine, domainName, os_distribution);
|
||||
version, machine, domainName, os_distribution,
|
||||
endianness);
|
||||
}
|
||||
|
||||
// If logging is enabled through RSMI_LOGGING environment variable.
|
||||
@@ -617,9 +637,10 @@ std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
void logSystemDetails(void) {
|
||||
std::ostringstream ss;
|
||||
bool errorDetected;
|
||||
std::string sysname, node, release, version, machine, domain, distName;
|
||||
std::string sysname, node, release, version, machine, domain, distName,
|
||||
endianness;
|
||||
std::tie(errorDetected, sysname, node, release, version, machine, domain,
|
||||
distName) = getSystemDetails();
|
||||
distName, endianness) = getSystemDetails();
|
||||
if (errorDetected == false) {
|
||||
ss << "====== Gathered system details ============\n"
|
||||
<< "SYSTEM NAME: " << sysname << "\n"
|
||||
@@ -628,7 +649,8 @@ void logSystemDetails(void) {
|
||||
<< "RELEASE: " << release << "\n"
|
||||
<< "VERSION: " << version << "\n"
|
||||
<< "MACHINE TYPE: " << machine << "\n"
|
||||
<< "DOMAIN: " << domain << "\n";
|
||||
<< "DOMAIN: " << domain << "\n"
|
||||
<< "ENDIANNESS: " << endianness << "\n";
|
||||
LOG_INFO(ss);
|
||||
} else {
|
||||
ss << "====== Gathered system details ============\n"
|
||||
@@ -637,5 +659,94 @@ void logSystemDetails(void) {
|
||||
}
|
||||
}
|
||||
|
||||
// Usage:
|
||||
// logHexDump(desc, addr, len, bytesPerLine);
|
||||
// desc: if non-NULL, printed as a description before hex dump.
|
||||
// addr: the address to start dumping from.
|
||||
// len: the number of bytes to dump.
|
||||
// bytesPerLine: number of bytes on each output line.
|
||||
void logHexDump(
|
||||
const char *desc, const void *addr, const size_t len, size_t bytesPerLine) {
|
||||
// UNCOMMENT: printf lines if you want to see directly to stdout
|
||||
std::ostringstream ss;
|
||||
// Silently ignore per-line values.
|
||||
if (bytesPerLine < 4 || bytesPerLine > 64) bytesPerLine = 16;
|
||||
|
||||
size_t i;
|
||||
unsigned char buff[bytesPerLine + 1];
|
||||
const unsigned char *pc // ptr to data (char, 1 byte sized data)
|
||||
= (const unsigned char *) addr;
|
||||
|
||||
// Output description if given.
|
||||
// if (desc != NULL) printf("%s:\n", desc);
|
||||
if (desc != NULL) ss << "\n" << desc << "\n";
|
||||
|
||||
// Length checks.
|
||||
if (len == 0) {
|
||||
// printf(" ZERO LENGTH\n");
|
||||
ss << " ZERO LENGTH\n";
|
||||
LOG_ERROR(ss);
|
||||
return;
|
||||
}
|
||||
std::string endianness = "<undefined>";
|
||||
if (isSystemBigEndian()) {
|
||||
endianness = "** System is Big Endian, multi-bit symbols encoded as"
|
||||
" big endian (MSB first) **";
|
||||
} else {
|
||||
endianness = "** System is Little Endian, multi-bit symbols encoded as"
|
||||
" little endian (LSB first) **";
|
||||
}
|
||||
ss << "\t" << endianness << "\n";
|
||||
|
||||
// Process every byte in the data.
|
||||
for (i = 0; i < len; i++) {
|
||||
// Multiple of bytesPerLine means new or first line (with line offset).
|
||||
if ((i % bytesPerLine) == 0) {
|
||||
// Only print previous-line ASCII buffer for lines beyond first.
|
||||
// if (i != 0) printf(" %s\n", buff);
|
||||
if (i != 0) ss << " " << buff << "\n";
|
||||
// Output the offset of current line.
|
||||
// printf(" %08lx ", i);
|
||||
ss << " " << std::setw(8) << std::setfill('0') << std::hex << i << " ";
|
||||
}
|
||||
|
||||
// Now the hex code for the specific character.
|
||||
// printf(" %02x", pc[i]);
|
||||
|
||||
ss << " " << std::setw(2) << std::setfill('0') << std::hex
|
||||
<< static_cast<unsigned>(pc[i]);
|
||||
|
||||
// And buffer a printable ASCII character for later.
|
||||
// x20 = 32 || x7e = 126 (ascii table range)
|
||||
if ((pc[i] < 0x20) || (pc[i] > 0x7e)) { // isprint() may be better.
|
||||
buff[i % bytesPerLine] = '.';
|
||||
} else {
|
||||
buff[i % bytesPerLine] = pc[i];
|
||||
}
|
||||
buff[(i % bytesPerLine) + 1] = '\0';
|
||||
}
|
||||
|
||||
// Pad out last line if not exactly bytesPerLine characters.
|
||||
while ((i % bytesPerLine) != 0) {
|
||||
// printf(" ");
|
||||
ss << " ";
|
||||
i++;
|
||||
}
|
||||
|
||||
// And print the final ASCII buffer.
|
||||
// printf(" %s\n", buff);
|
||||
ss << " " << buff << "\n";
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
|
||||
// Detects the byte order of the running system by storing the integer 1 and
// inspecting the byte at its lowest address: that byte is 1 on a
// little-endian system. Returns true when it is not 1 (big endian).
bool isSystemBigEndian() {
  const int probe = 1;
  const char lowestAddressByte = *reinterpret_cast<const char *>(&probe);
  return lowestAddressByte != 1;
}
|
||||
|
||||
} // namespace smi
|
||||
} // namespace amd
|
||||
|
||||
Ссылка в новой задаче
Block a user