Add Current (Instant) Socket Power

* Updates:
    - rocm_smi_logger:
      General cleanup &
      Aligned to cpplint rules for usage
    - rocm_smi_monitor:
      Fixed MonitorTypes
      from not displaying properly in logs
      & Added socket power label + current
      socket power MonitorTypes
    - rocm_smi API:
      Added rsmi_dev_current_socket_power_get API
    - rocm_smi CLI:
      General cleanup,
      Concise info now displays device data
      in variable width (see printLogSpacer's
      new field),
      printLogSpacer now has an adjustable
      parameter that overrides appWidth,
      Added Socket Power to base rocm-smi +
      --showpower CLI calls,
      --showpower & base rocm-smi CLI defaults
      to printing socket power (if not available,
      displays average power)
    - Cleaned up temp label references
    - power_read gtests:
      Added current socket power to testing

Change-Id: Ica57e6f98ad96e2584e7c7955e188f68d2dab89d
Signed-off-by: Charis Poag <Charis.Poag@amd.com>


[ROCm/rocm_smi_lib commit: f078375350]
Tá an tiomantas seo le fáil i:
Charis Poag
2023-09-24 02:29:07 -05:00
tiomanta ag Dmitrii Galantsev
tuismitheoir 80c47e3c09
tiomantas fd5066437b
D'athraigh 13 comhad le 387 breiseanna agus 195 scriosta
@@ -1707,6 +1707,30 @@ rsmi_status_t rsmi_dev_pci_bandwidth_set(uint32_t dv_ind, uint64_t bw_bitmask);
rsmi_status_t
rsmi_dev_power_ave_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *power);
/**
* @brief Get the current socket power (also known as instant
* power) of the device index provided.
*
* @details Given a device index @p dv_ind and a pointer to a uint64_t
* @p socket_power, this function will write the current socket power
* (in microwatts) to the uint64_t pointed to by @p socket_power.
*
* @param[in] dv_ind a device index
*
* @param[inout] socket_power a pointer to uint64_t to which the current
* socket power will be written. If this parameter is nullptr,
* this function will return ::RSMI_STATUS_INVALID_ARGS if the function is
* supported with the provided arguments, and ::RSMI_STATUS_NOT_SUPPORTED
* if it is not supported with the provided arguments.
*
* @retval ::RSMI_STATUS_SUCCESS call was successful
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function with the given arguments
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
*/
rsmi_status_t
rsmi_dev_current_socket_power_get(uint32_t dv_ind, uint64_t *socket_power);
/**
* @brief Get the energy accumulator counter of the device with provided
* device index.
@@ -130,18 +130,18 @@ class Logger {
break;
}
return *getInstance();
};
}
Logger &operator<<(const char* s) {
return operator<<(std::string(s));
};
}
template <class T> Logger &operator<<(const T &v) {
std::ostringstream s;
s << v;
std::string str = s.str();
return operator<<(str);
};
}
// Interface for Error Log
void error(const char* text) throw();
@@ -5,7 +5,7 @@
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
@@ -67,6 +67,8 @@ enum MonitorTypes {
kMonPowerCapMax,
kMonPowerCapMin,
kMonPowerAve,
kMonPowerInput,
kMonPowerLabel,
kMonTempMax,
kMonTempMin,
kMonTempMaxHyst,
@@ -94,45 +96,47 @@ enum MonitorTypes {
kMonInvalid = 0xFFFFFFFF,
};
const std::map<MonitorTypes,std::string> monitorTypesToString {
{MonitorTypes::kMonName, "amd::smi::kMonName"},
{MonitorTypes::kMonTemp, "amd::smi::kMonName"},
{MonitorTypes::kMonFanSpeed, "amd::smi::kMonName"},
{MonitorTypes::kMonMaxFanSpeed, "amd::smi::kMonName"},
{MonitorTypes::kMonFanRPMs, "amd::smi::kMonName"},
{MonitorTypes::kMonFanCntrlEnable, "amd::smi::kMonName"},
{MonitorTypes::kMonPowerCap, "amd::smi::kMonName"},
{MonitorTypes::kMonPowerCapDefault, "amd::smi::kMonName"},
{MonitorTypes::kMonPowerCapMax, "amd::smi::kMonName"},
{MonitorTypes::kMonPowerCapMin, "amd::smi::kMonName"},
{MonitorTypes::kMonPowerAve, "amd::smi::kMonName"},
{MonitorTypes::kMonTempMax, "amd::smi::kMonName"},
{MonitorTypes::kMonTempMin, "amd::smi::kMonName"},
{MonitorTypes::kMonTempMaxHyst, "amd::smi::kMonName"},
{MonitorTypes::kMonTempMinHyst, "amd::smi::kMonName"},
{MonitorTypes::kMonTempCritical, "amd::smi::kMonName"},
{MonitorTypes::kMonTempCriticalHyst, "amd::smi::kMonName"},
{MonitorTypes::kMonTempEmergency, "amd::smi::kMonName"},
{MonitorTypes::kMonTempEmergencyHyst, "amd::smi::kMonName"},
{MonitorTypes::kMonTempCritMin, "amd::smi::kMonName"},
{MonitorTypes::kMonTempCritMinHyst, "amd::smi::kMonName"},
{MonitorTypes::kMonTempOffset, "amd::smi::kMonName"},
{MonitorTypes::kMonTempLowest, "amd::smi::kMonName"},
{MonitorTypes::kMonTempHighest, "amd::smi::kMonName"},
{MonitorTypes::kMonTempLabel, "amd::smi::kMonName"},
{MonitorTypes::kMonVolt, "amd::smi::kMonName"},
{MonitorTypes::kMonVoltMax, "amd::smi::kMonName"},
{MonitorTypes::kMonVoltMinCrit, "amd::smi::kMonName"},
{MonitorTypes::kMonVoltMin, "amd::smi::kMonName"},
{MonitorTypes::kMonVoltMaxCrit, "amd::smi::kMonName"},
{MonitorTypes::kMonVoltAverage, "amd::smi::kMonName"},
{MonitorTypes::kMonVoltLowest, "amd::smi::kMonName"},
{MonitorTypes::kMonVoltHighest, "amd::smi::kMonName"},
{MonitorTypes::kMonVoltLabel, "amd::smi::kMonName"},
{MonitorTypes::kMonInvalid, "amd::smi::kMonName"},
// Lookup table from a MonitorTypes enumerator to its name as text, used when
// building log messages. Callers that index it with .at() get
// std::out_of_range for an enumerator that has no entry here, so a new
// MonitorTypes value should be given a matching row.
const std::map<MonitorTypes, std::string> monitorTypesToString{
{MonitorTypes::kMonName, "MonitorTypes::kMonName"},
{MonitorTypes::kMonTemp, "MonitorTypes::kMonTemp"},
{MonitorTypes::kMonFanSpeed, "MonitorTypes::kMonFanSpeed"},
{MonitorTypes::kMonMaxFanSpeed, "MonitorTypes::kMonMaxFanSpeed"},
{MonitorTypes::kMonFanRPMs, "MonitorTypes::kMonFanRPMs"},
{MonitorTypes::kMonFanCntrlEnable, "MonitorTypes::kMonFanCntrlEnable"},
{MonitorTypes::kMonPowerCap, "MonitorTypes::kMonPowerCap"},
{MonitorTypes::kMonPowerCapDefault, "MonitorTypes::kMonPowerCapDefault"},
{MonitorTypes::kMonPowerCapMax, "MonitorTypes::kMonPowerCapMax"},
{MonitorTypes::kMonPowerCapMin, "MonitorTypes::kMonPowerCapMin"},
{MonitorTypes::kMonPowerAve, "MonitorTypes::kMonPowerAve"},
{MonitorTypes::kMonPowerInput, "MonitorTypes::kMonPowerInput"},
{MonitorTypes::kMonPowerLabel, "MonitorTypes::kMonPowerLabel"},
{MonitorTypes::kMonTempMax, "MonitorTypes::kMonTempMax"},
{MonitorTypes::kMonTempMin, "MonitorTypes::kMonTempMin"},
{MonitorTypes::kMonTempMaxHyst, "MonitorTypes::kMonTempMaxHyst"},
{MonitorTypes::kMonTempMinHyst, "MonitorTypes::kMonTempMinHyst"},
{MonitorTypes::kMonTempCritical, "MonitorTypes::kMonTempCritical"},
{MonitorTypes::kMonTempCriticalHyst, "MonitorTypes::kMonTempCriticalHyst"},
{MonitorTypes::kMonTempEmergency, "MonitorTypes::kMonTempEmergency"},
{MonitorTypes::kMonTempEmergencyHyst,
"MonitorTypes::kMonTempEmergencyHyst"},
{MonitorTypes::kMonTempCritMin, "MonitorTypes::kMonTempCritMin"},
{MonitorTypes::kMonTempCritMinHyst, "MonitorTypes::kMonTempCritMinHyst"},
{MonitorTypes::kMonTempOffset, "MonitorTypes::kMonTempOffset"},
{MonitorTypes::kMonTempLowest, "MonitorTypes::kMonTempLowest"},
{MonitorTypes::kMonTempHighest, "MonitorTypes::kMonTempHighest"},
{MonitorTypes::kMonTempLabel, "MonitorTypes::kMonTempLabel"},
{MonitorTypes::kMonVolt, "MonitorTypes::kMonVolt"},
{MonitorTypes::kMonVoltMax, "MonitorTypes::kMonVoltMax"},
{MonitorTypes::kMonVoltMinCrit, "MonitorTypes::kMonVoltMinCrit"},
{MonitorTypes::kMonVoltMin, "MonitorTypes::kMonVoltMin"},
{MonitorTypes::kMonVoltMaxCrit, "MonitorTypes::kMonVoltMaxCrit"},
{MonitorTypes::kMonVoltAverage, "MonitorTypes::kMonVoltAverage"},
{MonitorTypes::kMonVoltLowest, "MonitorTypes::kMonVoltLowest"},
{MonitorTypes::kMonVoltHighest, "MonitorTypes::kMonVoltHighest"},
{MonitorTypes::kMonVoltLabel, "MonitorTypes::kMonVoltLabel"},
{MonitorTypes::kMonInvalid, "MonitorTypes::kMonInvalid"},
};
class Monitor {
public:
explicit Monitor(std::string path, RocmSMI_env_vars const *e);
+118 -55
Féach ar an gComhad
@@ -45,9 +45,8 @@ CLOCK_JSON_VERSION = 1
headerString = ' ROCm System Management Interface '
footerString = ' End of ROCm SMI Log '
# Output formatting
appWidth = 100
appWidth = 90
deviceList = []
# Enable or disable serialized format
@@ -383,8 +382,8 @@ def getPidList():
return
def getPower(device, silent=False):
""" Return the current power level of a given device
def getAvgPower(device, silent=False):
""" Return the average power level of a given device
@param device: DRM device identifier
@param silent=Turn on to silence error output
@@ -393,7 +392,21 @@ def getPower(device, silent=False):
power = c_uint32()
ret = rocmsmi.rsmi_dev_power_ave_get(device, 0, byref(power))
if rsmi_ret_ok(ret, device, 'get_power_avg', silent):
return power.value / 1000000
return str(power.value / 1000000)
return 'N/A'
def getCurrentSocketPower(device, silent=False):
    """ Return the current (also known as instant)
    socket power of a given device

    The raw reading from rsmi_dev_current_socket_power_get is divided by
    1000000 and returned as a string.

    @param device: DRM device identifier
    @param silent=Turn on to silence error output
    (you plan to handle manually). Default is off.
    @return: the scaled power reading as a string, or 'N/A' when the
    library call does not report success
    """
    # rsmi_dev_current_socket_power_get writes through a uint64_t pointer, so
    # the receiving ctypes object must be 64 bits wide; a 32-bit object would
    # let the library write past the end of the buffer.
    power = c_uint64()
    ret = rocmsmi.rsmi_dev_current_socket_power_get(device, byref(power))
    if rsmi_ret_ok(ret, device, 'get_socket_power', silent):
        return str(power.value / 1000000)
    return 'N/A'
@@ -437,7 +450,7 @@ def findFirstAvailableTemp(device):
temp = c_int64(0)
metric = rsmi_temperature_metric_t.RSMI_TEMP_CURRENT
ret_temp = "N/A"
ret_temp_type = "(Unknown)"
ret_temp_type = temp_type_lst[0]
for i, templist_val in enumerate(temp_type_lst):
ret = rocmsmi.rsmi_dev_temp_metric_get(c_uint32(device), i, metric, byref(temp))
if rsmi_ret_ok(ret, device, 'get_temp_metric_' + templist_val, silent=True):
@@ -448,6 +461,37 @@ def findFirstAvailableTemp(device):
continue
return (ret_temp_type, ret_temp)
def getTemperatureLabel(deviceList):
    """ Discovers the first identified temperature label
    Returns a lower-case string label value with any parentheses removed

    @param deviceList: List of DRM devices; only the first entry is probed
    """
    # Fallback label is the first entry of temp_type_lst (Edge)
    fallbackLabel = temp_type_lst[0].lower()
    if len(deviceList) < 1:
        return fallbackLabel
    (discoveredType, _) = findFirstAvailableTemp(deviceList[0])
    loweredType = discoveredType.lower()
    return loweredType.replace('(', '').replace(')', '')
def getPowerLabel(deviceList):
    """ Discovers the first identified power label
    Returns rsmi_power_label.CURRENT_SOCKET_POWER when the first device in
    the list reports a current socket power other than '0.0' or 'N/A';
    otherwise returns rsmi_power_label.AVG_POWER

    @param deviceList: List of DRM devices; only the first entry is probed
    """
    # Default label is AvgPower
    powerLabel = rsmi_power_label.AVG_POWER
    if len(deviceList) < 1:
        return powerLabel
    device = deviceList[0]
    # silent=True: a failed socket-power read just means the default label is kept
    power = getCurrentSocketPower(device, True)
    if power != '0.0' and power != 'N/A':
        powerLabel = rsmi_power_label.CURRENT_SOCKET_POWER
    return powerLabel
def getVbiosVersion(device, silent=False):
""" Returns the VBIOS version for a given device
@@ -679,23 +723,35 @@ def printListLog(metricName, valuesList):
print(listStr + line)
def printLogSpacer(displayString=None, fill='='):
def printLogSpacer(displayString=None, fill='=', contentSizeToFit=0):
""" Prints [name of the option]/[name of the program] in the spacer to explain data below
If no parameters are given, a default fill of the '=' string is used in the spacer
@param displayString: name of item to be displayed inside of the log spacer
@param fill: padding string which surrounds the given display string
@param contentSizeToFit: providing an integer > 0 allows
ability to dynamically change output padding/fill based on this value
instead of appWidth. Handy for concise info output.
"""
global appWidth, PRINT_JSON
resizeValue = appWidth
if contentSizeToFit != 0:
resizeValue = contentSizeToFit
if resizeValue % 2: # if odd -> make even
resizeValue += 1
# leaving below to check if resizing works properly
# print("resizeVal=" +str(resizeValue) + "; appWidth=" + str(appWidth) +
# "; contentSizeToFit=" + str(contentSizeToFit) + "; fill=" + fill)
if not PRINT_JSON:
if displayString:
if len(displayString) % 2:
displayString += fill
logSpacer = fill * int((appWidth - (len(displayString))) / 2) + displayString + fill * int(
(appWidth - (len(displayString))) / 2)
logSpacer = fill * int((resizeValue - (len(displayString))) / 2) + displayString + fill * int(
(resizeValue - (len(displayString))) / 2)
else:
logSpacer = fill * appWidth
logSpacer = fill * resizeValue
print(logSpacer)
@@ -1630,22 +1686,15 @@ def showAllConcise(deviceList):
print('ERROR: Cannot print JSON/CSV output for concise output')
sys.exit(1)
""" Place holder for the actual max size """
MAX_ALL_CONCISE_WIDTH = 100
appWidth_temp = appWidth
appWidth = MAX_ALL_CONCISE_WIDTH
silent = True
printLogSpacer(' Concise Info ')
deviceList.sort()
temp_type = '(' + temp_type_lst[0] + ')'
if len(deviceList) >= 1:
(temp_type, _) = findFirstAvailableTemp(deviceList[0])
available_temp_type = temp_type.lower()
available_temp_type = available_temp_type.replace('(', '')
available_temp_type = available_temp_type.replace(')', '')
header = ['GPU', '[Model : Revision]', 'Temp', 'AvgPwr', 'Partitions', 'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%']
subheader = ['', 'Name (20 chars)', temp_type, '', '(Mem, Compute)', '', '', '', '', '', '', '']
available_temp_type = getTemperatureLabel(deviceList)
temp_type = "(" + available_temp_type.capitalize() + ")"
header=['Device', '[Model : Revision]', 'Temp', 'Power', 'Partitions',
'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%']
subheader = ['', 'Name (20 chars)', temp_type, getPowerLabel(deviceList),
'(Mem, Compute)', '', '', '', '', '', '', '']
# add additional spaces to match header
for idx, item in enumerate(subheader):
header_size = len(header[idx])
@@ -1667,11 +1716,17 @@ def showAllConcise(deviceList):
temp_val = str(getTemp(device, available_temp_type, silent))
if temp_val != 'N/A':
temp_val += degree_sign + 'C'
avgPwr = str(getPower(device))
if avgPwr != '0.0' and avgPwr != 'N/A':
socketPwr = getCurrentSocketPower(device, True)
avgPwr = getAvgPower(device, True)
powerVal = 'N/A'
if socketPwr != '0.0' and socketPwr != 'N/A':
socketPwr += 'W'
powerVal=socketPwr
elif avgPwr != '0.0' and avgPwr != 'N/A':
avgPwr += 'W'
powerVal=avgPwr
else:
avgPwr = 'N/A'
powerVal = 'N/A'
combined_partition = (getMemoryPartition(device, silent) + ", "
+ getComputePartition(device, silent))
sclk = showCurrentClocks([device], 'sclk', concise=silent)
@@ -1704,10 +1759,10 @@ def showAllConcise(deviceList):
'', '', '', '']
gpu_dev_product_info_top_name = gpu_dev_product_info_names[1]
values['card%s' % (str(device))] = [device, gpu_dev_product_info_top_name, temp_val, avgPwr,
combined_partition, sclk, mclk,
fan, str(perf).lower(), pwrCap,
mem_use_pct, gpu_busy]
values['card%s' % (str(device))] = [device, gpu_dev_product_info_top_name, temp_val,
powerVal, combined_partition, sclk, mclk,
fan, str(perf).lower(), pwrCap, mem_use_pct,
gpu_busy]
val_widths = {}
for device in deviceList:
@@ -1716,10 +1771,17 @@ def showAllConcise(deviceList):
for device in deviceList:
for col in range(len(val_widths[device])):
max_widths[col] = max(max_widths[col], val_widths[device][col])
printLog(None, "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), header)), None)
printLog(None, "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), subheader)),
None, useItalics=True)
printLogSpacer(fill='=')
########################
# Display concise info #
########################
header_output = "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), header))
subheader_output = "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), subheader))
printLogSpacer(headerString, contentSizeToFit=len(header_output))
printLogSpacer(' Concise Info ', contentSizeToFit=len(header_output))
printLog(None, header_output, None)
printLog(None, subheader_output, None, useItalics=True)
printLogSpacer(fill='=', contentSizeToFit=len(header_output))
for device in deviceList:
printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in
@@ -1730,9 +1792,8 @@ def showAllConcise(deviceList):
printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in
zip(range(len(max_widths)), values['card%s_Info' % (str(device))])), None)
printLogSpacer()
""" Restore original max size """
appWidth = appWidth_temp
printLogSpacer(contentSizeToFit=len(header_output))
printLogSpacer(footerString, contentSizeToFit=len(header_output))
def showAllConciseHw(deviceList):
@@ -2360,23 +2421,25 @@ def showPids(verbose):
def showPower(deviceList):
""" Display current Average Graphics Package Power Consumption for a list of devices
""" Display Current (also known as instant) Socket or Average
Graphics Package Power Consumption for a list of devices
@param deviceList: List of DRM devices (can be a single-item list)
"""
secondaryPresent=False
printLogSpacer(' Power Consumption ')
for device in deviceList:
if checkIfSecondaryDie(device):
if str(getCurrentSocketPower(device, True)) != 'N/A':
printLog(device, 'Current Socket Graphics Package Power (W)', getCurrentSocketPower(device))
elif checkIfSecondaryDie(device):
printLog(device, 'Average Graphics Package Power (W)', "N/A (Secondary die)")
secondaryPresent=True
elif str(getPower(device)) != '0.0':
printLog(device, 'Average Graphics Package Power (W)', getPower(device))
elif str(getAvgPower(device)) != '0.0':
printLog(device, 'Average Graphics Package Power (W)', getAvgPower(device))
else:
printErrLog(device, 'Unable to get Average Graphics Package Power Consumption')
printErrLog(device, 'Unable to get Average or Current Socket Graphics Package Power Consumption')
if secondaryPresent:
printLog(None, "\n\t\tPrimary die (usually one above or below the secondary) shows total (primary + secondary) socket power information", None)
printLogSpacer()
@@ -2872,13 +2935,8 @@ def getGraphColor(percentage):
def showTempGraph(deviceList):
deviceList.sort()
temp_type = '(' + temp_type_lst[0] + ')'
if len(deviceList) >= 1:
(temp_type, _) = findFirstAvailableTemp(deviceList[0])
printLogSpacer(' Temperature Graph ' + temp_type + ' ')
temp_type = temp_type.lower()
temp_type = temp_type.replace('(', '')
temp_type = temp_type.replace(')', '')
temp_type = getTemperatureLabel(deviceList)
printLogSpacer(' Temperature Graph ' + temp_type.capitalize() + ' ')
# Start a thread for constantly printing
try:
# Create a thread (call print function, devices, delay in ms)
@@ -3547,6 +3605,11 @@ def save(deviceList, savefilepath):
# The code below is for when this script is run as an executable instead of when imported as a module
def isConciseInfoRequested(args):
    """ Decide whether the invocation asks for the default concise summary

    Truthy when the script was started with no arguments, with exactly one
    argument that set alldevices, json or csv, or with exactly two arguments
    that set alldevices together with json or csv.

    @param args: parsed command-line arguments (alldevices, json, csv are read)
    """
    argCount = len(sys.argv)
    return (argCount == 1
            or argCount == 2 and (args.alldevices or (args.json or args.csv))
            or argCount == 3 and (args.alldevices and (args.json or args.csv)))
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='AMD ROCm System Management Interface | ROCM-SMI version: %s' % __version__,
@@ -3755,7 +3818,8 @@ if __name__ == '__main__':
if not PRINT_JSON:
print('\n')
printLogSpacer(headerString)
if not isConciseInfoRequested(args):
printLogSpacer(headerString)
if args.showallinfo:
args.list = True
@@ -3809,9 +3873,7 @@ if __name__ == '__main__':
if not checkAmdGpus(deviceList):
logging.warning('No AMD GPUs specified')
if len(sys.argv) == 1 or \
len(sys.argv) == 2 and (args.alldevices or (args.json or args.csv)) or \
len(sys.argv) == 3 and (args.alldevices and (args.json or args.csv)):
if isConciseInfoRequested(args):
showAllConcise(deviceList)
if args.showhw:
showAllConciseHw(deviceList)
@@ -4018,7 +4080,8 @@ if __name__ == '__main__':
devCsv = formatCsv(deviceList)
print(devCsv)
printLogSpacer(footerString)
if not isConciseInfoRequested(args):
printLogSpacer(footerString)
rsmi_ret_ok(rocmsmi.rsmi_shut_down())
exit(RETCODE)
@@ -655,3 +655,8 @@ rsmi_nps_mode_type = rsmi_nps_mode_type_t
# nps_mode_type_l[rsmi_nps_mode_type_t.RSMI_MEMORY_PARTITION_NPS2]
# will return string 'NPS2'
nps_mode_type_l = ['NPS1', 'NPS2', 'NPS4', 'NPS8']
class rsmi_power_label(str, Enum):
    """ Sub-header labels naming which power reading is being displayed """
    # Label used when the average power reading is shown
    AVG_POWER = '(Avg)'
    # Label used when the current (instant) socket power reading is shown
    CURRENT_SOCKET_POWER = '(Socket)'
+92 -17
Féach ar an gComhad
@@ -77,7 +77,6 @@
#include "rocm_smi/rocm_smi64Config.h"
#include "rocm_smi/rocm_smi_logger.h"
using namespace ROCmLogging;
using namespace amd::smi;
static const uint32_t kMaxOverdriveLevel = 20;
@@ -2386,21 +2385,22 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
amd::smi::MonitorTypes mon_type = amd::smi::kMonInvalid;
uint16_t val_ui16;
static const std::map<rsmi_temperature_metric_t, amd::smi::MonitorTypes> kMetricTypeMap = {
{ RSMI_TEMP_CURRENT, amd::smi::kMonTemp },
{ RSMI_TEMP_MAX, amd::smi::kMonTempMax },
{ RSMI_TEMP_MIN, amd::smi::kMonTempMin },
{ RSMI_TEMP_MAX_HYST, amd::smi::kMonTempMaxHyst },
{ RSMI_TEMP_MIN_HYST, amd::smi::kMonTempMinHyst },
{ RSMI_TEMP_CRITICAL, amd::smi::kMonTempCritical },
{ RSMI_TEMP_CRITICAL_HYST, amd::smi::kMonTempCriticalHyst },
{ RSMI_TEMP_EMERGENCY, amd::smi::kMonTempEmergency },
{ RSMI_TEMP_EMERGENCY_HYST, amd::smi::kMonTempEmergencyHyst },
{ RSMI_TEMP_CRIT_MIN, amd::smi::kMonTempCritMin },
{ RSMI_TEMP_CRIT_MIN_HYST, amd::smi::kMonTempCritMinHyst },
{ RSMI_TEMP_OFFSET, amd::smi::kMonTempOffset },
{ RSMI_TEMP_LOWEST, amd::smi::kMonTempLowest },
{ RSMI_TEMP_HIGHEST, amd::smi::kMonTempHighest },
static const std::map<rsmi_temperature_metric_t, amd::smi::MonitorTypes>
kMetricTypeMap = {
{ RSMI_TEMP_CURRENT, amd::smi::kMonTemp },
{ RSMI_TEMP_MAX, amd::smi::kMonTempMax },
{ RSMI_TEMP_MIN, amd::smi::kMonTempMin },
{ RSMI_TEMP_MAX_HYST, amd::smi::kMonTempMaxHyst },
{ RSMI_TEMP_MIN_HYST, amd::smi::kMonTempMinHyst },
{ RSMI_TEMP_CRITICAL, amd::smi::kMonTempCritical },
{ RSMI_TEMP_CRITICAL_HYST, amd::smi::kMonTempCriticalHyst },
{ RSMI_TEMP_EMERGENCY, amd::smi::kMonTempEmergency },
{ RSMI_TEMP_EMERGENCY_HYST, amd::smi::kMonTempEmergencyHyst },
{ RSMI_TEMP_CRIT_MIN, amd::smi::kMonTempCritMin },
{ RSMI_TEMP_CRIT_MIN_HYST, amd::smi::kMonTempCritMinHyst },
{ RSMI_TEMP_OFFSET, amd::smi::kMonTempOffset },
{ RSMI_TEMP_LOWEST, amd::smi::kMonTempLowest },
{ RSMI_TEMP_HIGHEST, amd::smi::kMonTempHighest },
};
const auto mon_type_it = kMetricTypeMap.find(metric);
@@ -2485,7 +2485,8 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
return RSMI_STATUS_NOT_SUPPORTED;
}
*temperature = static_cast<int64_t>(val_ui16) * CENTRIGRADE_TO_MILLI_CENTIGRADE;
*temperature =
static_cast<int64_t>(val_ui16) * CENTRIGRADE_TO_MILLI_CENTIGRADE;
ss << __PRETTY_FUNCTION__ << " | ======= end ======= "
<< " | Success "
@@ -2815,6 +2816,80 @@ rsmi_dev_power_ave_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *power) {
CATCH
}
// Reads the current (instant) socket power of device dv_ind into
// *socket_power.
//
// Returns RSMI_STATUS_INVALID_ARGS when socket_power is nullptr.
// Returns RSMI_STATUS_NOT_SUPPORTED when the device has no hwmon monitor or
// when the power label read for sensor 1 is not "PPT". When reading the label
// fails, the errno from readMonitor() is translated with ErrnoToRsmiStatus().
// Otherwise the status of get_dev_mon_value() for kMonPowerInput is returned.
rsmi_status_t
rsmi_dev_current_socket_power_get(uint32_t dv_ind, uint64_t *socket_power) {
  TRY
  std::ostringstream ss;
  rsmi_status_t rsmiReturn = RSMI_STATUS_NOT_SUPPORTED;
  std::string val_str;
  uint32_t sensor_ind = 1;  // socket_power sysfs files have 1-based indices
  MonitorTypes mon_type = amd::smi::kMonPowerInput;

  ss << __PRETTY_FUNCTION__ << " | ======= start =======, dv_ind="
     << std::to_string(dv_ind);
  LOG_TRACE(ss);

  if (socket_power == nullptr) {
    rsmiReturn = RSMI_STATUS_INVALID_ARGS;
    ss << __PRETTY_FUNCTION__
       << " | ======= end ======= "
       << " | Fail "
       << " | Device #: " << dv_ind
       << " | Type: " << monitorTypesToString.at(mon_type)
       << " | Cause: socket_power was a null ptr reference"
       << " | Returning = "
       << getRSMIStatusString(rsmiReturn) << " |";
    LOG_ERROR(ss);
    return RSMI_STATUS_INVALID_ARGS;
  }

  CHK_SUPPORT_SUBVAR_ONLY(socket_power, sensor_ind)
  DEVICE_MUTEX

  if (dev->monitor() == nullptr) {
    ss << __PRETTY_FUNCTION__
       << " | ======= end ======= "
       << " | Fail "
       << " | Device #: " << dv_ind
       << " | Type: " << monitorTypesToString.at(mon_type)
       << " | Cause: hwmon monitor was a null ptr reference"
       << " | Returning = "
       << getRSMIStatusString(rsmiReturn) << " |";
    LOG_ERROR(ss);
    return rsmiReturn;
  }

  // The power input is only treated as socket power when its label is "PPT".
  // Comparing against the literal already rejects any string whose length is
  // not 3, so no separate size check is needed.
  int ret = dev->monitor()->readMonitor(amd::smi::kMonPowerLabel,
                                        sensor_ind, &val_str);
  if (ret || val_str != "PPT") {
    if (ret != 0) {
      rsmiReturn = amd::smi::ErrnoToRsmiStatus(ret);
    }
    ss << __PRETTY_FUNCTION__
       << " | ======= end ======= "
       << " | Fail "
       << " | Device #: " << dv_ind
       << " | Type: " << monitorTypesToString.at(mon_type)
       << " | Cause: readMonitor() returned an error status"
       << " or Socket Power label did not show PPT or size of label data was"
       << " unexpected"
       << " | Returning = "
       << getRSMIStatusString(rsmiReturn) << " |";
    LOG_ERROR(ss);
    return rsmiReturn;
  }

  rsmiReturn = get_dev_mon_value(mon_type, dv_ind, sensor_ind,
                                 socket_power);
  if (rsmiReturn != RSMI_STATUS_SUCCESS) {
    // Do not report success or stream *socket_power here: on failure the
    // caller's buffer may never have been written.
    ss << __PRETTY_FUNCTION__
       << " | ======= end ======= "
       << " | Fail "
       << " | Device #: " << dv_ind
       << " | Type: " << monitorTypesToString.at(mon_type)
       << " | Cause: get_dev_mon_value() returned an error status"
       << " | Returning = "
       << getRSMIStatusString(rsmiReturn) << " |";
    LOG_ERROR(ss);
    return rsmiReturn;
  }

  ss << __PRETTY_FUNCTION__
     << " | ======= end ======= "
     << " | Success "
     << " | Device #: " << dv_ind
     << " | Type: " << monitorTypesToString.at(mon_type)
     << " | Data: " << *socket_power
     << " | Returning = "
     << getRSMIStatusString(rsmiReturn) << " |";
  LOG_TRACE(ss);
  return rsmiReturn;
  CATCH
}
rsmi_status_t
rsmi_dev_energy_count_get(uint32_t dv_ind, uint64_t *power,
float *counter_resolution, uint64_t *timestamp) {
-2
Féach ar an gComhad
@@ -68,8 +68,6 @@
#include "rocm_smi/rocm_smi_logger.h"
#include "shared_mutex.h" // NOLINT
using namespace ROCmLogging;
namespace amd {
namespace smi {
@@ -61,7 +61,6 @@
#include "rocm_smi/rocm_smi_exception.h"
#include "rocm_smi/rocm_smi_logger.h"
using namespace ROCmLogging;
using namespace amd::smi;
#define TRY try {
+56 -55
Féach ar an gComhad
@@ -71,9 +71,8 @@
#include "rocm_smi/rocm_smi_logger.h"
#include "rocm_smi/rocm_smi_main.h"
using namespace ROCmLogging;
Logger* Logger::m_Instance = nullptr;
ROCmLogging::Logger *ROCmLogging::Logger::m_Instance = nullptr;
// Log file name
// WARNING: File name should be changed here and
@@ -81,39 +80,39 @@ Logger* Logger::m_Instance = nullptr;
// in one place will cause a mismatch in these scripts,
// files may not have proper permissions, and logrotate
// would not function properly.
const std::string logPath = "/var/log/rocm_smi_lib/";
const std::string logBaseFName = "ROCm-SMI-lib";
const std::string logExtension = ".log";
const std::string logFileName = logPath + logBaseFName + logExtension;
#define LOGPATH "/var/log/rocm_smi_lib/"
#define LOGBASE_FNAME "ROCm-SMI-lib"
#define LOGEXTENSION ".log"
const char *logFileName = LOGPATH LOGBASE_FNAME LOGEXTENSION;
Logger::Logger() {
ROCmLogging::Logger::Logger() {
initialize_resources();
}
Logger::~Logger() {
ROCmLogging::Logger::~Logger() {
if (m_loggingIsOn) {
destroy_resources();
}
}
Logger* Logger::getInstance() throw() {
ROCmLogging::Logger* ROCmLogging::Logger::getInstance() throw() {
if (m_Instance == nullptr) {
m_Instance = new Logger();
m_Instance = new ROCmLogging::Logger();
}
return m_Instance;
}
void Logger::lock() {
void ROCmLogging::Logger::lock() {
m_Lock.lock();
}
void Logger::unlock() {
void ROCmLogging::Logger::unlock() {
m_Lock.unlock();
}
void Logger::logIntoFile(std::string& data) {
void ROCmLogging::Logger::logIntoFile(std::string& data) {
lock();
if(!m_File.is_open()) {
if (!m_File.is_open()) {
initialize_resources();
if (!m_File.is_open()) {
std::cout << "WARNING: re-initializing resources was unsuccessful."
@@ -127,24 +126,24 @@ void Logger::logIntoFile(std::string& data) {
unlock();
}
void Logger::logOnConsole(std::string& data) {
void ROCmLogging::Logger::logOnConsole(std::string& data) {
std::cout << getCurrentTime() << " " << data << std::endl;
}
// Returns: In string format, YY-MM-DD HH:MM:SS.microseconds
std::string Logger::getCurrentTime(void) {
using namespace std::chrono;
std::string ROCmLogging::Logger::getCurrentTime(void) {
std::string currentTime;
// get current time
auto now = system_clock::now();
auto now = std::chrono::system_clock::now();
// get number of microseconds for the current second
// (remainder after division into seconds)
auto ms = duration_cast<microseconds>(now.time_since_epoch()) % 1000000;
auto ms = std::chrono::duration_cast<std::chrono::microseconds>(
now.time_since_epoch()) % 1000000;
// convert to std::time_t in order to convert to std::tm (broken time)
auto timer = system_clock::to_time_t(now);
auto timer = std::chrono::system_clock::to_time_t(now);
// convert to broken time
std::tm bt = *std::localtime(&timer);
@@ -159,7 +158,7 @@ std::string Logger::getCurrentTime(void) {
}
// Interface for Error Log
void Logger::error(const char* text) throw() {
void ROCmLogging::Logger::error(const char* text) throw() {
// By default, logging is disabled
// The check below allows us to toggle logging through RSMI_LOGGING
// set or unset
@@ -182,18 +181,18 @@ void Logger::error(const char* text) throw() {
}
}
void Logger::error(std::string& text) throw() {
void ROCmLogging::Logger::error(std::string& text) throw() {
error(text.data());
}
void Logger::error(std::ostringstream& stream) throw() {
void ROCmLogging::Logger::error(std::ostringstream& stream) throw() {
std::string text = stream.str();
error(text.data());
stream.str("");
}
// Interface for Alarm Log
void Logger::alarm(const char* text) throw() {
void ROCmLogging::Logger::alarm(const char* text) throw() {
// By default, logging is disabled (ie. no RSMI_LOGGING)
// The check below allows us to toggle logging through RSMI_LOGGING
// set or unset
@@ -216,18 +215,18 @@ void Logger::alarm(const char* text) throw() {
}
}
void Logger::alarm(std::string& text) throw() {
void ROCmLogging::Logger::alarm(std::string& text) throw() {
alarm(text.data());
}
void Logger::alarm(std::ostringstream& stream) throw() {
void ROCmLogging::Logger::alarm(std::ostringstream& stream) throw() {
std::string text = stream.str();
alarm(text.data());
stream.str("");
}
// Interface for Always Log
void Logger::always(const char* text) throw() {
void ROCmLogging::Logger::always(const char* text) throw() {
// By default, logging is disabled (ie. no RSMI_LOGGING)
// The check below allows us to toggle logging through RSMI_LOGGING
// set or unset
@@ -250,18 +249,18 @@ void Logger::always(const char* text) throw() {
}
}
void Logger::always(std::string& text) throw() {
void ROCmLogging::Logger::always(std::string& text) throw() {
always(text.data());
}
void Logger::always(std::ostringstream& stream) throw() {
void ROCmLogging::Logger::always(std::ostringstream& stream) throw() {
std::string text = stream.str();
always(text.data());
stream.str("");
}
// Interface for Buffer Log
void Logger::buffer(const char* text) throw() {
void ROCmLogging::Logger::buffer(const char* text) throw() {
// Buffer is the special case. So don't add log level
// and timestamp in the buffer message. Just log the raw bytes.
if ((m_LogType == FILE_LOG) && (m_LogLevel >= LOG_LEVEL_BUFFER)) {
@@ -284,18 +283,18 @@ void Logger::buffer(const char* text) throw() {
}
}
void Logger::buffer(std::string& text) throw() {
void ROCmLogging::Logger::buffer(std::string& text) throw() {
buffer(text.data());
}
void Logger::buffer(std::ostringstream& stream) throw() {
void ROCmLogging::Logger::buffer(std::ostringstream& stream) throw() {
std::string text = stream.str();
buffer(text.data());
stream.str("");
}
// Interface for Info Log
void Logger::info(const char* text) throw() {
void ROCmLogging::Logger::info(const char* text) throw() {
// By default, logging is disabled (ie. no RSMI_LOGGING)
// The check below allows us to toggle logging through RSMI_LOGGING
// set or unset
@@ -318,18 +317,18 @@ void Logger::info(const char* text) throw() {
}
}
void Logger::info(std::string& text) throw() {
void ROCmLogging::Logger::info(std::string& text) throw() {
info(text.data());
}
void Logger::info(std::ostringstream& stream) throw() {
void ROCmLogging::Logger::info(std::ostringstream& stream) throw() {
std::string text = stream.str();
info(text.data());
stream.str("");
}
// Interface for Trace Log
void Logger::trace(const char* text) throw() {
void ROCmLogging::Logger::trace(const char* text) throw() {
// By default, logging is disabled (ie. no RSMI_LOGGING)
// The check below allows us to toggle logging through RSMI_LOGGING
// set or unset
@@ -352,18 +351,18 @@ void Logger::trace(const char* text) throw() {
}
}
void Logger::trace(std::string& text) throw() {
void ROCmLogging::Logger::trace(std::string& text) throw() {
trace(text.data());
}
void Logger::trace(std::ostringstream& stream) throw() {
void ROCmLogging::Logger::trace(std::ostringstream& stream) throw() {
std::string text = stream.str();
trace(text.data());
stream.str("");
}
// Interface for Debug Log
void Logger::debug(const char* text) throw() {
void ROCmLogging::Logger::debug(const char* text) throw() {
// By default, logging is disabled (ie. no RSMI_LOGGING)
// The check below allows us to toggle logging through RSMI_LOGGING
// set or unset
@@ -386,51 +385,53 @@ void Logger::debug(const char* text) throw() {
}
}
// Debug-log overload for std::string: forwards the string's character data to
// the const char* overload of debug().
// NOTE(review): two signature lines follow (unqualified and
// ROCmLogging::-qualified); looks like diff residue - confirm which is meant.
void Logger::debug(std::string& text) throw() {
void ROCmLogging::Logger::debug(std::string& text) throw() {
debug(text.data());
}
// Debug-log overload for std::ostringstream: copies the stream contents into
// a local string, passes that to the const char* overload of debug(), and
// then replaces the stream's contents with an empty string.
// NOTE(review): two signature lines follow (unqualified and
// ROCmLogging::-qualified); looks like diff residue - confirm which is meant.
void Logger::debug(std::ostringstream& stream) throw() {
void ROCmLogging::Logger::debug(std::ostringstream& stream) throw() {
std::string text = stream.str();
debug(text.data());
// The caller's stream is modified: its buffered text is discarded here.
stream.str("");
}
// Interfaces to control log levels
// Stores the given logLevel in m_LogLevel, replacing the previous value.
// NOTE(review): two signature lines follow (unqualified and
// ROCmLogging::-qualified); looks like diff residue - confirm which is meant.
void Logger::updateLogLevel(LogLevel logLevel) {
void ROCmLogging::Logger::updateLogLevel(LogLevel logLevel) {
m_LogLevel = logLevel;
}
// Sets m_LogLevel to ENABLE_LOG.
// NOTE(review): two signature lines follow (unqualified and
// ROCmLogging::-qualified); looks like diff residue - confirm which is meant.
void Logger::enableAllLogLevels() {
void ROCmLogging::Logger::enableAllLogLevels() {
m_LogLevel = ENABLE_LOG;
}
// Disable all log levels, except error and alarm
// Implemented by setting m_LogLevel to DISABLE_LOG.
// NOTE(review): two signature lines follow (unqualified and
// ROCmLogging::-qualified); looks like diff residue - confirm which is meant.
void Logger::disableLog() {
void ROCmLogging::Logger::disableLog() {
m_LogLevel = DISABLE_LOG;
}
// Interfaces to control log Types
// Stores the given logType in m_LogType, replacing the previous value.
// NOTE(review): two signature lines follow (unqualified and
// ROCmLogging::-qualified); looks like diff residue - confirm which is meant.
void Logger::updateLogType(LogType logType) {
void ROCmLogging::Logger::updateLogType(LogType logType) {
m_LogType = logType;
}
// Sets m_LogType to CONSOLE.
// NOTE(review): two signature lines follow (unqualified and
// ROCmLogging::-qualified); looks like diff residue - confirm which is meant.
void Logger::enableConsoleLogging() {
void ROCmLogging::Logger::enableConsoleLogging() {
m_LogType = CONSOLE;
}
// Sets m_LogType to FILE_LOG.
// NOTE(review): two signature lines follow (unqualified and
// ROCmLogging::-qualified); looks like diff residue - confirm which is meant.
void Logger::enableFileLogging() {
void ROCmLogging::Logger::enableFileLogging() {
m_LogType = FILE_LOG;
}
// Returns a string of details on current log settings
std::string Logger::getLogSettings() {
std::string ROCmLogging::Logger::getLogSettings() {
std::string logSettings;
if (m_File.is_open()) {
logSettings += "OpenStatus = File (" + logFileName + ") is open";
logSettings += "OpenStatus = File (" + std::string(logFileName)
+ ") is open";
} else {
logSettings += "OpenStatus = File (" + logFileName + ") is not open";
logSettings += "OpenStatus = File (" + std::string(logFileName)
+ ") is not open";
}
logSettings += ", ";
@@ -480,11 +481,11 @@ std::string Logger::getLogSettings() {
// Returns current reported enabled logging state. State is controlled by
// user's environment variable RSMI_LOGGING.
// The value returned is the m_loggingIsOn member; this function does not
// re-read the environment itself.
// NOTE(review): two signature lines follow (unqualified and
// ROCmLogging::-qualified); looks like diff residue - confirm which is meant.
bool Logger::isLoggerEnabled() {
bool ROCmLogging::Logger::isLoggerEnabled() {
return m_loggingIsOn;
}
void Logger::initialize_resources() {
void ROCmLogging::Logger::initialize_resources() {
// By default, logging is disabled (ie. no RSMI_LOGGING)
// The check below allows us to toggle logging through RSMI_LOGGING
// set or unset
@@ -492,7 +493,7 @@ void Logger::initialize_resources() {
if (!m_loggingIsOn) {
return;
}
m_File.open(logFileName.c_str(), std::ios::out | std::ios::app);
m_File.open(logFileName, std::ios::out | std::ios::app);
m_LogLevel = LOG_LEVEL_TRACE;
// RSMI_LOGGING = 1, output to logs only
// RSMI_LOGGING = 2, output to console only
@@ -521,9 +522,9 @@ void Logger::initialize_resources() {
if (m_File.fail()) {
std::cout << "WARNING: Failed opening log file." << std::endl;
}
chmod(logFileName.c_str(), S_IRUSR|S_IRGRP|S_IROTH|S_IWUSR|S_IWGRP|S_IWOTH);
chmod(logFileName, S_IRUSR|S_IRGRP|S_IROTH|S_IWUSR|S_IWGRP|S_IWOTH);
}
// Closes m_File, the log file stream that initialize_resources() opens.
// NOTE(review): two signature lines follow (unqualified and
// ROCmLogging::-qualified); looks like diff residue - confirm which is meant.
void Logger::destroy_resources() {
void ROCmLogging::Logger::destroy_resources() {
m_File.close();
}
+2 -3
Féach ar an gComhad
@@ -68,7 +68,6 @@
#include "rocm_smi/rocm_smi_kfd.h"
#include "rocm_smi/rocm_smi_logger.h"
using namespace ROCmLogging;
static const char *kPathDRMRoot = "/sys/class/drm";
static const char *kPathHWMonRoot = "/sys/class/hwmon";
@@ -314,12 +313,12 @@ RocmSMI::Initialize(uint64_t flags) {
int i_ret;
LOG_ALWAYS("=============== ROCM SMI initialize ================");
Logger::getInstance()->enableAllLogLevels();
ROCmLogging::Logger::getInstance()->enableAllLogLevels();
// Leaving below to allow developers to check current log settings
// std::string logSettings = Logger::getInstance()->getLogSettings();
// std::cout << "Current log settings:\n" << logSettings << std::endl;
if (Logger::getInstance()->isLoggerEnabled()) {
if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) {
logSystemDetails();
}
+8 -5
Féach ar an gComhad
@@ -3,7 +3,7 @@
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
@@ -58,8 +58,6 @@
#include "rocm_smi/rocm_smi_exception.h"
#include "rocm_smi/rocm_smi_logger.h"
using namespace ROCmLogging;
namespace amd {
namespace smi {
@@ -80,6 +78,8 @@ static const char *kMonPowerCapName = "power#_cap";
static const char *kMonPowerCapMaxName = "power#_cap_max";
static const char *kMonPowerCapMinName = "power#_cap_min";
static const char *kMonPowerAveName = "power#_average";
static const char *kMonPowerInputName = "power#_input";
static const char *kMonPowerLabelName = "power#_label";
static const char *kMonTempMaxName = "temp#_max";
static const char *kMonTempMinName = "temp#_min";
static const char *kMonTempMaxHystName = "temp#_max_hyst";
@@ -135,6 +135,8 @@ static const std::map<MonitorTypes, const char *> kMonitorNameMap = {
{kMonPowerCapMax, kMonPowerCapMaxName},
{kMonPowerCapMin, kMonPowerCapMinName},
{kMonPowerAve, kMonPowerAveName},
{kMonPowerInput, kMonPowerInputName},
{kMonPowerLabel, kMonPowerLabelName},
{kMonTempMax, kMonTempMaxName},
{kMonTempMin, kMonTempMinName},
{kMonTempMaxHyst, kMonTempMaxHystName},
@@ -202,7 +204,8 @@ static const std::map<const char *, monitor_depends_t> kMonFuncDependsMap = {
.variants = {kMonInvalid},
}
},
{"rsmi_dev_power_cap_default_get", { .mandatory_depends = {kMonPowerCapDefaultName},
{"rsmi_dev_power_cap_default_get", { .mandatory_depends =
{kMonPowerCapDefaultName},
.variants = {kMonInvalid},
}
},
@@ -613,7 +616,7 @@ void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) {
supported_monitors = intersect;
}
if (!supported_monitors.empty()) {
for (unsigned long & supported_monitor : supported_monitors) {
for (uint64_t &supported_monitor : supported_monitors) {
if (m_type == eDefaultMonitor) {
assert(supported_monitor > 0);
supported_monitor |=
-1
Féach ar an gComhad
@@ -73,7 +73,6 @@
#include "rocm_smi/rocm_smi_device.h"
#include "rocm_smi/rocm_smi_logger.h"
using namespace ROCmLogging;
namespace amd {
namespace smi {
@@ -53,6 +53,7 @@
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi_test/functional/power_read.h"
#include "rocm_smi_test/test_common.h"
#include "rocm_smi/rocm_smi_utils.h"
TestPowerRead::TestPowerRead() : TestBase() {
set_title("RSMI Power Read Test");
@@ -116,27 +117,48 @@ void TestPowerRead::Run(void) {
val_ui64 << " uW" << std::endl;
}
/* Average Power */
err = rsmi_dev_power_ave_get(i, 0, &val_ui64);
if (err == RSMI_STATUS_NOT_SUPPORTED) {
std::cout <<
"\t**Power average information is not supported for this device"
"\t**Average Power Usage: not supported on this device"
<< std::endl;
// Verify api support checking functionality is working
err = rsmi_dev_power_ave_get(i, 0, nullptr);
ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
continue;
}
IF_VERB(STANDARD) {
std::cout << "\t**Average Power Usage: ";
CHK_RSMI_PERM_ERR(err)
if (err == RSMI_STATUS_SUCCESS) {
std::cout << static_cast<float>(val_ui64)/1000 << " mW" << std::endl;
} else {
IF_VERB(STANDARD) {
std::cout << "\t**Average Power Usage: ";
CHK_RSMI_PERM_ERR(err)
if (err == RSMI_STATUS_SUCCESS) {
std::cout << static_cast<float>(val_ui64) / 1000 << " mW"
<< std::endl;
}
// Verify api support checking functionality is working
err = rsmi_dev_power_ave_get(i, 0, nullptr);
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
}
// Verify api support checking functionality is working
err = rsmi_dev_power_ave_get(i, 0, nullptr);
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
}
/* Current Socket Power */
err = rsmi_dev_current_socket_power_get(i, &val_ui64);
if (err == RSMI_STATUS_NOT_SUPPORTED) {
std::cout <<
"\t**Current Socket Power: not supported"
" on this device" << std::endl;
} else {
IF_VERB(STANDARD) {
std::cout << "\t**Current Socket Power: ";
CHK_RSMI_PERM_ERR(err)
if (err == RSMI_STATUS_SUCCESS) {
std::cout << static_cast<float>(val_ui64) / 1000 << " mW"
<< std::endl;
}
// Verify api support checking functionality is working
err = rsmi_dev_current_socket_power_get(i, nullptr);
// std::cout << "err = " << amd::smi::getRSMIStatusString(err);
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
}
}
std::cout << "\n";
}
}
}