[SWDEV-399953] Smart Temperature detection + partitioning display

* Updates:
    - Fix for devices which have no edge temperature sensor but do have a junction sensor
    - Added partitioning (memory and dynamic) displays for
      base rocm-smi CLI calls
    - Added subheading for base rocm-smi call output
    - Added better hwmon and device detection logging

Change-Id: I8219884b2e532d6ed379527cacdc1f2b232a5451
Signed-off-by: Charis Poag <Charis.Poag@amd.com>


[ROCm/rocm_smi_lib commit: 755e14dbad]
Cette révision appartient à :
Charis Poag
2023-08-10 18:25:02 -05:00
Parent a75b7f741c
révision 47420111a8
7 fichiers modifiés avec 258 ajouts et 29 suppressions
+38
Voir le fichier
@@ -94,6 +94,44 @@ enum MonitorTypes {
kMonInvalid = 0xFFFFFFFF,
};
// Human-readable name for each MonitorTypes enumerator, used when composing
// log messages (e.g. the "Type:" field). Each enumerator maps to its own
// fully qualified name so that log lines identify the monitor type that was
// actually requested. Look-ups via .at() throw std::out_of_range for a value
// that has no entry here.
const std::map<MonitorTypes, std::string> monitorTypesToString {
  {MonitorTypes::kMonName, "amd::smi::kMonName"},
  {MonitorTypes::kMonTemp, "amd::smi::kMonTemp"},
  {MonitorTypes::kMonFanSpeed, "amd::smi::kMonFanSpeed"},
  {MonitorTypes::kMonMaxFanSpeed, "amd::smi::kMonMaxFanSpeed"},
  {MonitorTypes::kMonFanRPMs, "amd::smi::kMonFanRPMs"},
  {MonitorTypes::kMonFanCntrlEnable, "amd::smi::kMonFanCntrlEnable"},
  {MonitorTypes::kMonPowerCap, "amd::smi::kMonPowerCap"},
  {MonitorTypes::kMonPowerCapDefault, "amd::smi::kMonPowerCapDefault"},
  {MonitorTypes::kMonPowerCapMax, "amd::smi::kMonPowerCapMax"},
  {MonitorTypes::kMonPowerCapMin, "amd::smi::kMonPowerCapMin"},
  {MonitorTypes::kMonPowerAve, "amd::smi::kMonPowerAve"},
  {MonitorTypes::kMonTempMax, "amd::smi::kMonTempMax"},
  {MonitorTypes::kMonTempMin, "amd::smi::kMonTempMin"},
  {MonitorTypes::kMonTempMaxHyst, "amd::smi::kMonTempMaxHyst"},
  {MonitorTypes::kMonTempMinHyst, "amd::smi::kMonTempMinHyst"},
  {MonitorTypes::kMonTempCritical, "amd::smi::kMonTempCritical"},
  {MonitorTypes::kMonTempCriticalHyst, "amd::smi::kMonTempCriticalHyst"},
  {MonitorTypes::kMonTempEmergency, "amd::smi::kMonTempEmergency"},
  {MonitorTypes::kMonTempEmergencyHyst, "amd::smi::kMonTempEmergencyHyst"},
  {MonitorTypes::kMonTempCritMin, "amd::smi::kMonTempCritMin"},
  {MonitorTypes::kMonTempCritMinHyst, "amd::smi::kMonTempCritMinHyst"},
  {MonitorTypes::kMonTempOffset, "amd::smi::kMonTempOffset"},
  {MonitorTypes::kMonTempLowest, "amd::smi::kMonTempLowest"},
  {MonitorTypes::kMonTempHighest, "amd::smi::kMonTempHighest"},
  {MonitorTypes::kMonTempLabel, "amd::smi::kMonTempLabel"},
  {MonitorTypes::kMonVolt, "amd::smi::kMonVolt"},
  {MonitorTypes::kMonVoltMax, "amd::smi::kMonVoltMax"},
  {MonitorTypes::kMonVoltMinCrit, "amd::smi::kMonVoltMinCrit"},
  {MonitorTypes::kMonVoltMin, "amd::smi::kMonVoltMin"},
  {MonitorTypes::kMonVoltMaxCrit, "amd::smi::kMonVoltMaxCrit"},
  {MonitorTypes::kMonVoltAverage, "amd::smi::kMonVoltAverage"},
  {MonitorTypes::kMonVoltLowest, "amd::smi::kMonVoltLowest"},
  {MonitorTypes::kMonVoltHighest, "amd::smi::kMonVoltHighest"},
  {MonitorTypes::kMonVoltLabel, "amd::smi::kMonVoltLabel"},
  {MonitorTypes::kMonInvalid, "amd::smi::kMonInvalid"},
};
class Monitor {
public:
+2
Voir le fichier
@@ -87,6 +87,8 @@ std::tuple<bool, std::string> readTmpFile(
std::string stateName,
std::string parameterName);
void displayAppTmpFilesContent(void);
std::string debugVectorContent(std::vector<std::string> v);
std::string displayAllDevicePaths(std::vector<std::shared_ptr<Device>> v);
rsmi_status_t handleException();
rsmi_status_t
GetDevValueVec(amd::smi::DevInfoTypes type,
+72 -13
Voir le fichier
@@ -47,7 +47,7 @@ headerString = ' ROCm System Management Interface '
footerString = ' End of ROCm SMI Log '
# Output formatting
appWidth = 84
appWidth = 100
deviceList = []
# Enable or disable serialized format
@@ -393,6 +393,25 @@ def getTemp(device, sensor):
return temp.value / 1000
return 'N/A'
def findFirstAvailableTemp(device):
    """ Probe the device's temperature sensors and report the first readable one

    Sensors are tried in the order they appear in temp_type_lst, using the
    RSMI_TEMP_CURRENT metric; probing stops at the first successful read.

    Returns a tuple of (temp_type, temp_value). temp_type is the capitalized
    sensor name wrapped in parentheses and temp_value is the raw reading
    divided by 1000. If no sensor can be read, ('(Unknown)', 'N/A') is returned.

    @param device: DRM device identifier
    """
    reading = c_int64(0)
    current_metric = rsmi_temperature_metric_t.RSMI_TEMP_CURRENT
    for sensor_idx, sensor_name in enumerate(temp_type_lst):
        status = rocmsmi.rsmi_dev_temp_metric_get(c_uint32(device), sensor_idx, current_metric, byref(reading))
        # silent=True: a sensor that cannot be read is expected here, keep probing
        if not rsmi_ret_ok(status, device, 'get_temp_metric_' + sensor_name, silent=True):
            continue
        return ('(' + sensor_name.capitalize() + ')', reading.value / 1000)
    return ("(Unknown)", "N/A")
def getVbiosVersion(device):
""" Returns the VBIOS version for a given device
@@ -429,7 +448,7 @@ def getComputePartition(device):
ret = rocmsmi.rsmi_dev_compute_partition_get(device, currentComputePartition, 256)
if rsmi_ret_ok(ret, device, 'get_compute_partition', silent=True) and currentComputePartition.value.decode():
return str(currentComputePartition.value.decode())
return "UNKNOWN"
return "N/A"
def getMemoryPartition(device):
@@ -441,7 +460,7 @@ def getMemoryPartition(device):
ret = rocmsmi.rsmi_dev_nps_mode_get(device, currentNPSMode, 256)
if rsmi_ret_ok(ret, device, 'get_NPS_mode', silent=True) and currentNPSMode.value.decode():
return str(currentNPSMode.value.decode())
return "UNKNOWN"
return "N/A"
def print2DArray(dataArray):
@@ -544,13 +563,20 @@ def printEventList(device, delay, eventList):
print2DArray([['\rGPU[%d]:\t' % (data.dv_ind), ctime().split()[3], notification_type_names[data.event.value - 1],
data.message.decode('utf8') + '\r']])
def printLog(device, metricName, value=None, extraSpace=False):
def printLog(device, metricName, value=None, extraSpace=False, useItalics=False):
""" Print out to the SMI log
@param device: DRM device identifier
@param metricName: Title of the item to print to the log
@param value: The item's value to print to the log
"""
red = '\033[91m'
green = '\033[92m'
blue = '\033[94m'
bold = '\033[1m'
italics = '\033[3m'
underline = '\033[4m'
end = '\033[0m'
global PRINT_JSON
if PRINT_JSON:
if value is not None and device is not None:
@@ -567,6 +593,8 @@ def printLog(device, metricName, value=None, extraSpace=False):
# Force thread safe printing
lock = multiprocessing.Lock()
lock.acquire()
if useItalics:
logstr = italics + logstr + end
if extraSpace:
print('\n' + logstr + '\n', end='', flush=True)
else:
@@ -1544,18 +1572,39 @@ def showAllConcise(deviceList):
print('ERROR: Cannot print JSON/CSV output for concise output')
sys.exit(1)
printLogSpacer(' Concise Info ')
header = ['GPU', 'Temp (DieEdge)', 'AvgPwr', 'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%']
deviceList.sort()
(temp_type, _) = findFirstAvailableTemp(deviceList[0])
available_temp_type = temp_type.lower()
available_temp_type = available_temp_type.replace('(', '')
available_temp_type = available_temp_type.replace(')', '')
header = ['GPU', 'Temp', 'AvgPwr', 'Partitions', 'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%']
subheader = ['', temp_type, '', '(Mem, Compute)', '', '', '', '', '', '', '']
# add additional spaces to match header
for idx, item in enumerate(subheader):
header_size = len(header[idx])
subheader_size = len(subheader[idx])
if header_size != subheader_size:
numSpacesToFill_subheader = header_size - subheader_size
numSpacesToFill_header = subheader_size - header_size
#take pos spaces to mean, we need to match size of the other
if numSpacesToFill_subheader > 0:
subheader[idx] = subheader[idx] + (' ' * numSpacesToFill_subheader)
if numSpacesToFill_header > 0:
header[idx] = header[idx] + (' ' * numSpacesToFill_header)
head_widths = [len(head) + 2 for head in header]
values = {}
degree_sign = u'\N{DEGREE SIGN}'
for device in deviceList:
temp = str(getTemp(device, 'edge'))
if temp != 'N/A':
temp += 'c'
temp_val = str(getTemp(device, available_temp_type))
if temp_val != 'N/A':
temp_val += degree_sign + 'C'
avgPwr = str(getPower(device))
if avgPwr != '0.0' and avgPwr != 'N/A':
avgPwr += 'W'
else:
avgPwr = 'N/A'
combined_partition = (getMemoryPartition(device) + ", "
+ getComputePartition(device))
concise = True
sclk = showCurrentClocks([device], 'sclk', concise)
mclk = showCurrentClocks([device], 'mclk', concise)
@@ -1579,7 +1628,9 @@ def showAllConcise(deviceList):
mem_use_pct='Unsupported'
if vram_used != None and vram_total != None and float(vram_total) != 0:
mem_use_pct = '% 3.0f%%' % (100 * (float(vram_used) / float(vram_total)))
values['card%s' % (str(device))] = [device, temp, avgPwr, sclk, mclk, fan, str(perf).lower(), pwrCap,
values['card%s' % (str(device))] = [device, temp_val, avgPwr,
combined_partition, sclk, mclk,
fan, str(perf).lower(), pwrCap,
mem_use_pct, gpu_busy]
val_widths = {}
for device in deviceList:
@@ -1589,6 +1640,9 @@ def showAllConcise(deviceList):
for col in range(len(val_widths[device])):
max_widths[col] = max(max_widths[col], val_widths[device][col])
printLog(None, "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), header)), None)
printLog(None, "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), subheader)),
None, useItalics=True)
printLogSpacer(fill='=')
for device in deviceList:
printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in
zip(range(len(max_widths)), values['card%s' % (str(device))])), None)
@@ -2548,7 +2602,7 @@ def showEvents(deviceList, eventTypes):
break
def printTempGraph(deviceList, delay):
def printTempGraph(deviceList, delay, temp_type):
# deviceList must be in ascending order
deviceList.sort()
devices = 0
@@ -2562,7 +2616,7 @@ def printTempGraph(deviceList, delay):
terminalWidth = os.get_terminal_size()[0]
printStrings = list()
for device in deviceList:
temp = getTemp(device, 'edge')
temp = getTemp(device, temp_type)
if temp == 'N/A':
percentage = 0
else:
@@ -2635,11 +2689,16 @@ def getGraphColor(percentage):
def showTempGraph(deviceList):
printLogSpacer(' Temperature Graph ')
deviceList.sort()
(temp_type, temp_value) = findFirstAvailableTemp(deviceList[0])
printLogSpacer(' Temperature Graph ' + temp_type + ' ')
temp_type = temp_type.lower()
temp_type = temp_type.replace('(', '')
temp_type = temp_type.replace(')', '')
# Start a thread for constantly printing
try:
# Create a thread (call print function, devices, delay in ms)
_thread.start_new_thread(printTempGraph, (deviceList, 150))
_thread.start_new_thread(printTempGraph, (deviceList, 150, temp_type))
except Exception as e:
printErrLog(device, 'Unable to start new thread. %s' % (e))
# Catch user input for program termination
+68 -3
Voir le fichier
@@ -78,6 +78,7 @@
#include "rocm_smi/rocm_smi_logger.h"
using namespace ROCmLogging;
using namespace amd::smi;
static const uint32_t kMaxOverdriveLevel = 20;
static const float kEnergyCounterResolution = 15.3f;
@@ -2475,7 +2476,16 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
}
if (temperature == nullptr) {
return RSMI_STATUS_INVALID_ARGS;
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: " << monitorTypesToString.at(mon_type)
<< " | Cause: temperature was a null ptr reference"
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
LOG_ERROR(ss);
return RSMI_STATUS_INVALID_ARGS;
}
// The HBM temperature is retreived from the gpu_metrics
@@ -2484,12 +2494,32 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
|| sensor_type == RSMI_TEMP_TYPE_HBM_2
|| sensor_type == RSMI_TEMP_TYPE_HBM_3) {
if (metric != RSMI_TEMP_CURRENT) { // only support RSMI_TEMP_CURRENT
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: " << monitorTypesToString.at(mon_type)
<< " | Cause: To retreive HBM temp, we only support metric = "
<< "RSMI_TEMP_CURRENT"
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |";
LOG_ERROR(ss);
return RSMI_STATUS_NOT_SUPPORTED;
}
rsmi_gpu_metrics_t gpu_metrics;
ret = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics);
if (ret != RSMI_STATUS_SUCCESS) {
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: " << monitorTypesToString.at(mon_type)
<< " | Cause: rsmi_dev_gpu_metrics_info_get returned "
<< getRSMIStatusString(ret)
<< " | Returning = "
<< getRSMIStatusString(ret) << " |";
LOG_ERROR(ss);
return ret;
}
@@ -2509,11 +2539,28 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
default:
return RSMI_STATUS_INVALID_ARGS;
}
if (val_ui16 == UINT16_MAX)
if (val_ui16 == UINT16_MAX) {
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: " << monitorTypesToString.at(mon_type)
<< " | Cause: Reached UINT16 max value, overflow"
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |";
LOG_ERROR(ss);
return RSMI_STATUS_NOT_SUPPORTED;
else
} else
*temperature = val_ui16 * CENTRIGRADE_TO_MILLI_CENTIGRADE;
ss << __PRETTY_FUNCTION__ << " | ======= end ======= "
<< " | Success "
<< " | Device #: " << dv_ind
<< " | Type: " << monitorTypesToString.at(mon_type)
<< " | Data: " << *temperature
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_SUCCESS) << " | ";
LOG_INFO(ss);
return RSMI_STATUS_SUCCESS;
} // end HBM temperature
@@ -2522,6 +2569,15 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
GET_DEV_FROM_INDX
if (dev->monitor() == nullptr) {
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: " << monitorTypesToString.at(mon_type)
<< " | Cause: monitor returned nullptr"
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |";
LOG_ERROR(ss);
return RSMI_STATUS_NOT_SUPPORTED;
}
std::shared_ptr<amd::smi::Monitor> m = dev->monitor();
@@ -2535,6 +2591,15 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
CHK_API_SUPPORT_ONLY(temperature, metric, sensor_index)
ret = get_dev_mon_value(mon_type, dv_ind, sensor_index, temperature);
ss << __PRETTY_FUNCTION__ << " | ======= end ======= "
<< " | Success "
<< " | Device #: " << dv_ind
<< " | Sensor_index: " << sensor_index
<< " | Type: " << monitorTypesToString.at(mon_type)
<< " | Data: " << *temperature
<< " | Returning = "
<< getRSMIStatusString(ret) << " | ";
LOG_INFO(ss);
return ret;
CATCH
+37 -4
Voir le fichier
@@ -170,6 +170,7 @@ static uint32_t GetDeviceIndex(const std::string s) {
// computed for cardX.
// On success, return drm_minor which is >= 128 otherwise return 0
static uint32_t GetDrmRenderMinor(const std::string s) {
std::ostringstream ss;
std::string drm_path = s;
int drm_minor = 0;
const std::string render_file_prefix = "renderD";
@@ -195,6 +196,10 @@ static uint32_t GetDrmRenderMinor(const std::string s) {
if (closedir(drm_dir)) {
return 0;
}
ss << __PRETTY_FUNCTION__ << " | Discovered drmRenderMinor = "
<< std::to_string(drm_minor) << " | For drm_path = " << drm_path << " | ";
LOG_DEBUG(ss);
return static_cast<uint32_t>(drm_minor);
}
@@ -377,11 +382,15 @@ RocmSMI::Initialize(uint64_t flags) {
// Remove any drm nodes that don't have a corresponding readable kfd node.
// kfd nodes will not be added if their properties file is not readable.
std::ostringstream ss;
auto dev_iter = devices_.begin();
while (dev_iter != devices_.end()) {
uint64_t bdfid = (*dev_iter)->bdfid();
if (tmp_map.find(bdfid) == tmp_map.end()) {
ss << __PRETTY_FUNCTION__ << " | removing device = "
<< (*dev_iter)->path();
dev_iter = devices_.erase(dev_iter);
LOG_DEBUG(ss);
continue;
}
dev_iter++;
@@ -411,6 +420,9 @@ RocmSMI::Initialize(uint64_t flags) {
}
// Leaving below to help debug temp file issues
// displayAppTmpFilesContent();
std::string amdGPUDeviceList = displayAllDevicePaths(devices_);
ss << __PRETTY_FUNCTION__ << " | current device paths = " << amdGPUDeviceList;
LOG_DEBUG(ss);
}
void
@@ -646,6 +658,9 @@ RocmSMI::FindMonitor(std::string monitor_path) {
}
void
RocmSMI::AddToDeviceList(std::string dev_name) {
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
LOG_TRACE(ss);
auto dev_path = std::string(kPathDRMRoot);
dev_path += "/";
dev_path += dev_name;
@@ -662,6 +677,10 @@ RocmSMI::AddToDeviceList(std::string dev_name) {
GetSupportedEventGroups(card_indx, dev->supported_event_groups());
devices_.push_back(dev);
ss << __PRETTY_FUNCTION__ << " | Adding to device list dev_name = "
<< dev_name << " | path = " << dev_path
<< " | card index = " << std::to_string(card_indx) << " | ";
LOG_DEBUG(ss);
return;
}
@@ -669,16 +688,26 @@ RocmSMI::AddToDeviceList(std::string dev_name) {
static const uint32_t kAmdGpuId = 0x1002;
static bool isAMDGPU(std::string dev_path) {
bool isAmdGpu = false;
std::ostringstream ss;
std::string vend_path = dev_path + "/device/vendor";
if (!FileExists(vend_path.c_str())) {
return false;
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
"is an amdgpu device - FALSE");
LOG_DEBUG(ss);
return isAmdGpu;
}
std::ifstream fs;
fs.open(vend_path);
if (!fs.is_open()) {
return false;
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
"is an amdgpu device - FALSE");
LOG_DEBUG(ss);
return isAmdGpu;
}
uint32_t vendor_id;
@@ -688,9 +717,13 @@ static bool isAMDGPU(std::string dev_path) {
fs.close();
if (vendor_id == kAmdGpuId) {
return true;
isAmdGpu = true;
}
return false;
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
"is an amdgpu device - FALSE");
LOG_DEBUG(ss);
return isAmdGpu;
}
uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
+11
Voir le fichier
@@ -313,6 +313,7 @@ int Monitor::writeMonitor(MonitorTypes type, uint32_t sensor_id,
// This string version should work for all valid monitor types
int Monitor::readMonitor(MonitorTypes type, uint32_t sensor_id,
std::string *val) {
std::ostringstream ss;
assert(val != nullptr);
std::string temp_str;
@@ -320,11 +321,21 @@ int Monitor::readMonitor(MonitorTypes type, uint32_t sensor_id,
DBG_FILE_ERROR(sysfs_path, (std::string *)nullptr)
int ret = ReadSysfsStr(sysfs_path, val);
ss << __PRETTY_FUNCTION__
<< " | Success | Read hwmon file: " << sysfs_path
<< " | Type: " << monitorTypesToString.at(type)
<< " | Sensor id: " << std::to_string(sensor_id)
<< " | Data: " << *val
<< " | Returning: " << std::to_string(ret) << " |";
LOG_INFO(ss);
return ret;
}
int32_t
Monitor::setTempSensorLabelMap(void) {
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
LOG_TRACE(ss);
std::string type_str;
int ret;
+30 -9
Voir le fichier
@@ -204,9 +204,10 @@ int ReadSysfsStr(std::string path, std::string *retStr) {
if (!fs.is_open()) {
ret = errno;
errno = 0;
oss << "Could not read SYSFS file (" << path << ")"
<< ", returning " << std::to_string(ret) << " ("
<< std::strerror(ret) << ")";
oss << __PRETTY_FUNCTION__
<< " | Fail | Cause: file does not exist or permissions issue"
<< " | SYSFS file: " << path
<< " | Returning: " << std::strerror(ret) << " |";
LOG_ERROR(oss);
return ret;
}
@@ -516,19 +517,39 @@ void displayAppTmpFilesContent() {
}
// Used to debug vector string list and their content
void displayVectorContent(std::vector<std::string> v) {
std::cout << "Vector = {";
std::string debugVectorContent(std::vector<std::string> v) {
std::ostringstream ss;
ss << "Vector = {";
if (v.size() > 0) {
for (auto it=v.begin(); it < v.end(); it++) {
std::cout << *it;
ss << *it;
auto temp_it = it;
if(++temp_it != v.end()) {
std::cout << ", ";
ss << ", ";
}
}
} else {
std::cout << "}" << std::endl;
}
ss << "}" << std::endl;
return ss.str();
}
// Used to debug vector string list and their content
std::string displayAllDevicePaths(std::vector<std::shared_ptr<Device>> v) {
std::ostringstream ss;
ss << "Vector = {";
if (v.size() > 0) {
for (auto it=v.begin(); it < v.end(); it++) {
ss << (*it)->path();
auto temp_it = it;
if(++temp_it != v.end()) {
ss << ", ";
}
}
}
ss << "}" << std::endl;
return ss.str();
}
// Attempts to read application specific temporary file