[SWDEV-399953] Smart Temperature detection + partitioning display
* Updates:
- Fix for devices which do not have an edge temperature sensor but do have a junction sensor
- Added partitioning (memory and dynamic) displays for
base rocm-smi CLI calls
- Added subheading for base rocm-smi call output
- Added better hwmon and device detection logging
Change-Id: I8219884b2e532d6ed379527cacdc1f2b232a5451
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
[ROCm/rocm_smi_lib commit: 755e14dbad]
Cette révision appartient à :
@@ -94,6 +94,44 @@ enum MonitorTypes {
|
||||
kMonInvalid = 0xFFFFFFFF,
|
||||
};
|
||||
|
||||
const std::map<MonitorTypes,std::string> monitorTypesToString {
|
||||
{MonitorTypes::kMonName, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTemp, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonFanSpeed, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonMaxFanSpeed, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonFanRPMs, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonFanCntrlEnable, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonPowerCap, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonPowerCapDefault, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonPowerCapMax, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonPowerCapMin, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonPowerAve, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempMax, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempMin, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempMaxHyst, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempMinHyst, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempCritical, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempCriticalHyst, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempEmergency, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempEmergencyHyst, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempCritMin, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempCritMinHyst, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempOffset, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempLowest, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempHighest, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempLabel, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonVolt, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonVoltMax, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonVoltMinCrit, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonVoltMin, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonVoltMaxCrit, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonVoltAverage, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonVoltLowest, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonVoltHighest, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonVoltLabel, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonInvalid, "amd::smi::kMonName"},
|
||||
};
|
||||
|
||||
|
||||
class Monitor {
|
||||
public:
|
||||
|
||||
@@ -87,6 +87,8 @@ std::tuple<bool, std::string> readTmpFile(
|
||||
std::string stateName,
|
||||
std::string parameterName);
|
||||
void displayAppTmpFilesContent(void);
|
||||
std::string debugVectorContent(std::vector<std::string> v);
|
||||
std::string displayAllDevicePaths(std::vector<std::shared_ptr<Device>> v);
|
||||
rsmi_status_t handleException();
|
||||
rsmi_status_t
|
||||
GetDevValueVec(amd::smi::DevInfoTypes type,
|
||||
|
||||
@@ -47,7 +47,7 @@ headerString = ' ROCm System Management Interface '
|
||||
footerString = ' End of ROCm SMI Log '
|
||||
|
||||
# Output formatting
|
||||
appWidth = 84
|
||||
appWidth = 100
|
||||
deviceList = []
|
||||
|
||||
# Enable or disable serialized format
|
||||
@@ -393,6 +393,25 @@ def getTemp(device, sensor):
|
||||
return temp.value / 1000
|
||||
return 'N/A'
|
||||
|
||||
def findFirstAvailableTemp(device):
    """ Probe the device's temperature sensors and report the first one that reads

    Sensors are tried in the order of temp_type_lst; the position in that list
    is passed to rsmi_dev_temp_metric_get as the sensor type, and the current
    temperature metric is requested.

    Returns a tuple of (temp_type, temp_value). temp_type is the capitalized
    sensor name wrapped in parentheses and temp_value is the reading divided
    by 1000. If no sensor reads successfully, ('(Unknown)', 'N/A') is returned.

    @param device: DRM device identifier
    """
    reading = c_int64(0)
    metric = rsmi_temperature_metric_t.RSMI_TEMP_CURRENT
    for sensor_idx, sensor_name in enumerate(temp_type_lst):
        status = rocmsmi.rsmi_dev_temp_metric_get(c_uint32(device), sensor_idx, metric, byref(reading))
        # Failures are checked silently; just move on to the next sensor type
        if not rsmi_ret_ok(status, device, 'get_temp_metric_' + sensor_name, silent=True):
            continue
        return ('(' + sensor_name.capitalize() + ')', reading.value / 1000)
    # No sensor type produced a successful read
    return ("(Unknown)", "N/A")
|
||||
|
||||
def getVbiosVersion(device):
|
||||
""" Returns the VBIOS version for a given device
|
||||
@@ -429,7 +448,7 @@ def getComputePartition(device):
|
||||
ret = rocmsmi.rsmi_dev_compute_partition_get(device, currentComputePartition, 256)
|
||||
if rsmi_ret_ok(ret, device, 'get_compute_partition', silent=True) and currentComputePartition.value.decode():
|
||||
return str(currentComputePartition.value.decode())
|
||||
return "UNKNOWN"
|
||||
return "N/A"
|
||||
|
||||
|
||||
def getMemoryPartition(device):
|
||||
@@ -441,7 +460,7 @@ def getMemoryPartition(device):
|
||||
ret = rocmsmi.rsmi_dev_nps_mode_get(device, currentNPSMode, 256)
|
||||
if rsmi_ret_ok(ret, device, 'get_NPS_mode', silent=True) and currentNPSMode.value.decode():
|
||||
return str(currentNPSMode.value.decode())
|
||||
return "UNKNOWN"
|
||||
return "N/A"
|
||||
|
||||
|
||||
def print2DArray(dataArray):
|
||||
@@ -544,13 +563,20 @@ def printEventList(device, delay, eventList):
|
||||
print2DArray([['\rGPU[%d]:\t' % (data.dv_ind), ctime().split()[3], notification_type_names[data.event.value - 1],
|
||||
data.message.decode('utf8') + '\r']])
|
||||
|
||||
def printLog(device, metricName, value=None, extraSpace=False):
|
||||
def printLog(device, metricName, value=None, extraSpace=False, useItalics=False):
|
||||
""" Print out to the SMI log
|
||||
|
||||
@param device: DRM device identifier
|
||||
@param metricName: Title of the item to print to the log
|
||||
@param value: The item's value to print to the log
|
||||
"""
|
||||
red = '\033[91m'
|
||||
green = '\033[92m'
|
||||
blue = '\033[94m'
|
||||
bold = '\033[1m'
|
||||
italics = '\033[3m'
|
||||
underline = '\033[4m'
|
||||
end = '\033[0m'
|
||||
global PRINT_JSON
|
||||
if PRINT_JSON:
|
||||
if value is not None and device is not None:
|
||||
@@ -567,6 +593,8 @@ def printLog(device, metricName, value=None, extraSpace=False):
|
||||
# Force thread safe printing
|
||||
lock = multiprocessing.Lock()
|
||||
lock.acquire()
|
||||
if useItalics:
|
||||
logstr = italics + logstr + end
|
||||
if extraSpace:
|
||||
print('\n' + logstr + '\n', end='', flush=True)
|
||||
else:
|
||||
@@ -1544,18 +1572,39 @@ def showAllConcise(deviceList):
|
||||
print('ERROR: Cannot print JSON/CSV output for concise output')
|
||||
sys.exit(1)
|
||||
printLogSpacer(' Concise Info ')
|
||||
header = ['GPU', 'Temp (DieEdge)', 'AvgPwr', 'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%']
|
||||
deviceList.sort()
|
||||
(temp_type, _) = findFirstAvailableTemp(deviceList[0])
|
||||
available_temp_type = temp_type.lower()
|
||||
available_temp_type = available_temp_type.replace('(', '')
|
||||
available_temp_type = available_temp_type.replace(')', '')
|
||||
header = ['GPU', 'Temp', 'AvgPwr', 'Partitions', 'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%']
|
||||
subheader = ['', temp_type, '', '(Mem, Compute)', '', '', '', '', '', '', '']
|
||||
# add additional spaces to match header
|
||||
for idx, item in enumerate(subheader):
|
||||
header_size = len(header[idx])
|
||||
subheader_size = len(subheader[idx])
|
||||
if header_size != subheader_size:
|
||||
numSpacesToFill_subheader = header_size - subheader_size
|
||||
numSpacesToFill_header = subheader_size - header_size
|
||||
#take pos spaces to mean, we need to match size of the other
|
||||
if numSpacesToFill_subheader > 0:
|
||||
subheader[idx] = subheader[idx] + (' ' * numSpacesToFill_subheader)
|
||||
if numSpacesToFill_header > 0:
|
||||
header[idx] = header[idx] + (' ' * numSpacesToFill_header)
|
||||
head_widths = [len(head) + 2 for head in header]
|
||||
values = {}
|
||||
degree_sign = u'\N{DEGREE SIGN}'
|
||||
for device in deviceList:
|
||||
temp = str(getTemp(device, 'edge'))
|
||||
if temp != 'N/A':
|
||||
temp += 'c'
|
||||
temp_val = str(getTemp(device, available_temp_type))
|
||||
if temp_val != 'N/A':
|
||||
temp_val += degree_sign + 'C'
|
||||
avgPwr = str(getPower(device))
|
||||
if avgPwr != '0.0' and avgPwr != 'N/A':
|
||||
avgPwr += 'W'
|
||||
else:
|
||||
avgPwr = 'N/A'
|
||||
combined_partition = (getMemoryPartition(device) + ", "
|
||||
+ getComputePartition(device))
|
||||
concise = True
|
||||
sclk = showCurrentClocks([device], 'sclk', concise)
|
||||
mclk = showCurrentClocks([device], 'mclk', concise)
|
||||
@@ -1579,7 +1628,9 @@ def showAllConcise(deviceList):
|
||||
mem_use_pct='Unsupported'
|
||||
if vram_used != None and vram_total != None and float(vram_total) != 0:
|
||||
mem_use_pct = '% 3.0f%%' % (100 * (float(vram_used) / float(vram_total)))
|
||||
values['card%s' % (str(device))] = [device, temp, avgPwr, sclk, mclk, fan, str(perf).lower(), pwrCap,
|
||||
values['card%s' % (str(device))] = [device, temp_val, avgPwr,
|
||||
combined_partition, sclk, mclk,
|
||||
fan, str(perf).lower(), pwrCap,
|
||||
mem_use_pct, gpu_busy]
|
||||
val_widths = {}
|
||||
for device in deviceList:
|
||||
@@ -1589,6 +1640,9 @@ def showAllConcise(deviceList):
|
||||
for col in range(len(val_widths[device])):
|
||||
max_widths[col] = max(max_widths[col], val_widths[device][col])
|
||||
printLog(None, "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), header)), None)
|
||||
printLog(None, "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), subheader)),
|
||||
None, useItalics=True)
|
||||
printLogSpacer(fill='=')
|
||||
for device in deviceList:
|
||||
printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in
|
||||
zip(range(len(max_widths)), values['card%s' % (str(device))])), None)
|
||||
@@ -2548,7 +2602,7 @@ def showEvents(deviceList, eventTypes):
|
||||
break
|
||||
|
||||
|
||||
def printTempGraph(deviceList, delay):
|
||||
def printTempGraph(deviceList, delay, temp_type):
|
||||
# deviceList must be in ascending order
|
||||
deviceList.sort()
|
||||
devices = 0
|
||||
@@ -2562,7 +2616,7 @@ def printTempGraph(deviceList, delay):
|
||||
terminalWidth = os.get_terminal_size()[0]
|
||||
printStrings = list()
|
||||
for device in deviceList:
|
||||
temp = getTemp(device, 'edge')
|
||||
temp = getTemp(device, temp_type)
|
||||
if temp == 'N/A':
|
||||
percentage = 0
|
||||
else:
|
||||
@@ -2635,11 +2689,16 @@ def getGraphColor(percentage):
|
||||
|
||||
|
||||
def showTempGraph(deviceList):
|
||||
printLogSpacer(' Temperature Graph ')
|
||||
deviceList.sort()
|
||||
(temp_type, temp_value) = findFirstAvailableTemp(deviceList[0])
|
||||
printLogSpacer(' Temperature Graph ' + temp_type + ' ')
|
||||
temp_type = temp_type.lower()
|
||||
temp_type = temp_type.replace('(', '')
|
||||
temp_type = temp_type.replace(')', '')
|
||||
# Start a thread for constantly printing
|
||||
try:
|
||||
# Create a thread (call print function, devices, delay in ms)
|
||||
_thread.start_new_thread(printTempGraph, (deviceList, 150))
|
||||
_thread.start_new_thread(printTempGraph, (deviceList, 150, temp_type))
|
||||
except Exception as e:
|
||||
printErrLog(device, 'Unable to start new thread. %s' % (e))
|
||||
# Catch user input for program termination
|
||||
|
||||
@@ -78,6 +78,7 @@
|
||||
#include "rocm_smi/rocm_smi_logger.h"
|
||||
|
||||
using namespace ROCmLogging;
|
||||
using namespace amd::smi;
|
||||
|
||||
static const uint32_t kMaxOverdriveLevel = 20;
|
||||
static const float kEnergyCounterResolution = 15.3f;
|
||||
@@ -2475,7 +2476,16 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
|
||||
}
|
||||
|
||||
if (temperature == nullptr) {
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(mon_type)
|
||||
<< " | Cause: temperature was a null ptr reference"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
|
||||
LOG_ERROR(ss);
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
|
||||
// The HBM temperature is retreived from the gpu_metrics
|
||||
@@ -2484,12 +2494,32 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
|
||||
|| sensor_type == RSMI_TEMP_TYPE_HBM_2
|
||||
|| sensor_type == RSMI_TEMP_TYPE_HBM_3) {
|
||||
if (metric != RSMI_TEMP_CURRENT) { // only support RSMI_TEMP_CURRENT
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(mon_type)
|
||||
<< " | Cause: To retreive HBM temp, we only support metric = "
|
||||
<< "RSMI_TEMP_CURRENT"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |";
|
||||
LOG_ERROR(ss);
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
rsmi_gpu_metrics_t gpu_metrics;
|
||||
ret = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(mon_type)
|
||||
<< " | Cause: rsmi_dev_gpu_metrics_info_get returned "
|
||||
<< getRSMIStatusString(ret)
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(ret) << " |";
|
||||
LOG_ERROR(ss);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -2509,11 +2539,28 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
|
||||
default:
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
if (val_ui16 == UINT16_MAX)
|
||||
if (val_ui16 == UINT16_MAX) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(mon_type)
|
||||
<< " | Cause: Reached UINT16 max value, overflow"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |";
|
||||
LOG_ERROR(ss);
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
else
|
||||
} else
|
||||
*temperature = val_ui16 * CENTRIGRADE_TO_MILLI_CENTIGRADE;
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======= "
|
||||
<< " | Success "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(mon_type)
|
||||
<< " | Data: " << *temperature
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_SUCCESS) << " | ";
|
||||
LOG_INFO(ss);
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
} // end HBM temperature
|
||||
|
||||
@@ -2522,6 +2569,15 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
|
||||
GET_DEV_FROM_INDX
|
||||
|
||||
if (dev->monitor() == nullptr) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(mon_type)
|
||||
<< " | Cause: monitor returned nullptr"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |";
|
||||
LOG_ERROR(ss);
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
std::shared_ptr<amd::smi::Monitor> m = dev->monitor();
|
||||
@@ -2535,6 +2591,15 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
|
||||
CHK_API_SUPPORT_ONLY(temperature, metric, sensor_index)
|
||||
|
||||
ret = get_dev_mon_value(mon_type, dv_ind, sensor_index, temperature);
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======= "
|
||||
<< " | Success "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Sensor_index: " << sensor_index
|
||||
<< " | Type: " << monitorTypesToString.at(mon_type)
|
||||
<< " | Data: " << *temperature
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(ret) << " | ";
|
||||
LOG_INFO(ss);
|
||||
|
||||
return ret;
|
||||
CATCH
|
||||
|
||||
@@ -170,6 +170,7 @@ static uint32_t GetDeviceIndex(const std::string s) {
|
||||
// computed for cardX.
|
||||
// On success, return drm_minor which is >= 128 otherwise return 0
|
||||
static uint32_t GetDrmRenderMinor(const std::string s) {
|
||||
std::ostringstream ss;
|
||||
std::string drm_path = s;
|
||||
int drm_minor = 0;
|
||||
const std::string render_file_prefix = "renderD";
|
||||
@@ -195,6 +196,10 @@ static uint32_t GetDrmRenderMinor(const std::string s) {
|
||||
if (closedir(drm_dir)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << " | Discovered drmRenderMinor = "
|
||||
<< std::to_string(drm_minor) << " | For drm_path = " << drm_path << " | ";
|
||||
LOG_DEBUG(ss);
|
||||
return static_cast<uint32_t>(drm_minor);
|
||||
}
|
||||
|
||||
@@ -377,11 +382,15 @@ RocmSMI::Initialize(uint64_t flags) {
|
||||
|
||||
// Remove any drm nodes that don't have a corresponding readable kfd node.
|
||||
// kfd nodes will not be added if their properties file is not readable.
|
||||
std::ostringstream ss;
|
||||
auto dev_iter = devices_.begin();
|
||||
while (dev_iter != devices_.end()) {
|
||||
uint64_t bdfid = (*dev_iter)->bdfid();
|
||||
if (tmp_map.find(bdfid) == tmp_map.end()) {
|
||||
ss << __PRETTY_FUNCTION__ << " | removing device = "
|
||||
<< (*dev_iter)->path();
|
||||
dev_iter = devices_.erase(dev_iter);
|
||||
LOG_DEBUG(ss);
|
||||
continue;
|
||||
}
|
||||
dev_iter++;
|
||||
@@ -411,6 +420,9 @@ RocmSMI::Initialize(uint64_t flags) {
|
||||
}
|
||||
// Leaving below to help debug temp file issues
|
||||
// displayAppTmpFilesContent();
|
||||
std::string amdGPUDeviceList = displayAllDevicePaths(devices_);
|
||||
ss << __PRETTY_FUNCTION__ << " | current device paths = " << amdGPUDeviceList;
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
|
||||
void
|
||||
@@ -646,6 +658,9 @@ RocmSMI::FindMonitor(std::string monitor_path) {
|
||||
}
|
||||
void
|
||||
RocmSMI::AddToDeviceList(std::string dev_name) {
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
auto dev_path = std::string(kPathDRMRoot);
|
||||
dev_path += "/";
|
||||
dev_path += dev_name;
|
||||
@@ -662,6 +677,10 @@ RocmSMI::AddToDeviceList(std::string dev_name) {
|
||||
GetSupportedEventGroups(card_indx, dev->supported_event_groups());
|
||||
|
||||
devices_.push_back(dev);
|
||||
ss << __PRETTY_FUNCTION__ << " | Adding to device list dev_name = "
|
||||
<< dev_name << " | path = " << dev_path
|
||||
<< " | card index = " << std::to_string(card_indx) << " | ";
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
return;
|
||||
}
|
||||
@@ -669,16 +688,26 @@ RocmSMI::AddToDeviceList(std::string dev_name) {
|
||||
static const uint32_t kAmdGpuId = 0x1002;
|
||||
|
||||
static bool isAMDGPU(std::string dev_path) {
|
||||
bool isAmdGpu = false;
|
||||
std::ostringstream ss;
|
||||
std::string vend_path = dev_path + "/device/vendor";
|
||||
if (!FileExists(vend_path.c_str())) {
|
||||
return false;
|
||||
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
|
||||
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
|
||||
"is an amdgpu device - FALSE");
|
||||
LOG_DEBUG(ss);
|
||||
return isAmdGpu;
|
||||
}
|
||||
|
||||
std::ifstream fs;
|
||||
fs.open(vend_path);
|
||||
|
||||
if (!fs.is_open()) {
|
||||
return false;
|
||||
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
|
||||
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
|
||||
"is an amdgpu device - FALSE");
|
||||
LOG_DEBUG(ss);
|
||||
return isAmdGpu;
|
||||
}
|
||||
|
||||
uint32_t vendor_id;
|
||||
@@ -688,9 +717,13 @@ static bool isAMDGPU(std::string dev_path) {
|
||||
fs.close();
|
||||
|
||||
if (vendor_id == kAmdGpuId) {
|
||||
return true;
|
||||
isAmdGpu = true;
|
||||
}
|
||||
return false;
|
||||
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
|
||||
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
|
||||
"is an amdgpu device - FALSE");
|
||||
LOG_DEBUG(ss);
|
||||
return isAmdGpu;
|
||||
}
|
||||
|
||||
uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
|
||||
|
||||
@@ -313,6 +313,7 @@ int Monitor::writeMonitor(MonitorTypes type, uint32_t sensor_id,
|
||||
// This string version should work for all valid monitor types
|
||||
int Monitor::readMonitor(MonitorTypes type, uint32_t sensor_id,
|
||||
std::string *val) {
|
||||
std::ostringstream ss;
|
||||
assert(val != nullptr);
|
||||
|
||||
std::string temp_str;
|
||||
@@ -320,11 +321,21 @@ int Monitor::readMonitor(MonitorTypes type, uint32_t sensor_id,
|
||||
|
||||
DBG_FILE_ERROR(sysfs_path, (std::string *)nullptr)
|
||||
int ret = ReadSysfsStr(sysfs_path, val);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Success | Read hwmon file: " << sysfs_path
|
||||
<< " | Type: " << monitorTypesToString.at(type)
|
||||
<< " | Sensor id: " << std::to_string(sensor_id)
|
||||
<< " | Data: " << *val
|
||||
<< " | Returning: " << std::to_string(ret) << " |";
|
||||
LOG_INFO(ss);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int32_t
|
||||
Monitor::setTempSensorLabelMap(void) {
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
std::string type_str;
|
||||
int ret;
|
||||
|
||||
|
||||
@@ -204,9 +204,10 @@ int ReadSysfsStr(std::string path, std::string *retStr) {
|
||||
if (!fs.is_open()) {
|
||||
ret = errno;
|
||||
errno = 0;
|
||||
oss << "Could not read SYSFS file (" << path << ")"
|
||||
<< ", returning " << std::to_string(ret) << " ("
|
||||
<< std::strerror(ret) << ")";
|
||||
oss << __PRETTY_FUNCTION__
|
||||
<< " | Fail | Cause: file does not exist or permissions issue"
|
||||
<< " | SYSFS file: " << path
|
||||
<< " | Returning: " << std::strerror(ret) << " |";
|
||||
LOG_ERROR(oss);
|
||||
return ret;
|
||||
}
|
||||
@@ -516,19 +517,39 @@ void displayAppTmpFilesContent() {
|
||||
}
|
||||
|
||||
// Used to debug vector string list and their content
|
||||
void displayVectorContent(std::vector<std::string> v) {
|
||||
std::cout << "Vector = {";
|
||||
std::string debugVectorContent(std::vector<std::string> v) {
|
||||
std::ostringstream ss;
|
||||
ss << "Vector = {";
|
||||
if (v.size() > 0) {
|
||||
for (auto it=v.begin(); it < v.end(); it++) {
|
||||
std::cout << *it;
|
||||
ss << *it;
|
||||
auto temp_it = it;
|
||||
if(++temp_it != v.end()) {
|
||||
std::cout << ", ";
|
||||
ss << ", ";
|
||||
}
|
||||
}
|
||||
} else {
|
||||
std::cout << "}" << std::endl;
|
||||
}
|
||||
ss << "}" << std::endl;
|
||||
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
// Used to debug vector string list and their content
|
||||
std::string displayAllDevicePaths(std::vector<std::shared_ptr<Device>> v) {
|
||||
std::ostringstream ss;
|
||||
ss << "Vector = {";
|
||||
if (v.size() > 0) {
|
||||
for (auto it=v.begin(); it < v.end(); it++) {
|
||||
ss << (*it)->path();
|
||||
auto temp_it = it;
|
||||
if(++temp_it != v.end()) {
|
||||
ss << ", ";
|
||||
}
|
||||
}
|
||||
}
|
||||
ss << "}" << std::endl;
|
||||
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
// Attempts to read application specific temporary file
|
||||
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur