Merge amd-staging into amd-master 20230830
Signed-off-by: Hao Zhou <Hao.Zhou@amd.com> Change-Id: I41c0e3ec76a43af30f25e140f25b521a6097d125
This commit is contained in:
@@ -1534,7 +1534,7 @@ rsmi_status_t rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid);
|
||||
* support this function with the given arguments
|
||||
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
|
||||
*/
|
||||
rsmi_status_t rsmi_topo_numa_affinity_get(uint32_t dv_ind, uint32_t *numa_node);
|
||||
rsmi_status_t rsmi_topo_numa_affinity_get(uint32_t dv_ind, int32_t *numa_node);
|
||||
|
||||
/**
|
||||
* @brief Get PCIe traffic information
|
||||
|
||||
+127
-27
@@ -205,16 +205,31 @@ def getFanSpeed(device):
|
||||
fl = 0
|
||||
fm = 0
|
||||
|
||||
""" If ret = 2; (No such file or directory)
|
||||
/sys/class/drm/cardX/device/hwmon/hwmonX/pwmX
|
||||
"""
|
||||
ret = rocmsmi.rsmi_dev_fan_speed_get(device, sensor_ind, byref(fanLevel))
|
||||
if rsmi_ret_ok(ret, device, 'get_fan_speed', True):
|
||||
fl = fanLevel.value
|
||||
last_ret = ret
|
||||
|
||||
""" If ret = 2; (No such file or directory)
|
||||
/sys/class/drm/cardX/device/hwmon/hwmonX/pwmX
|
||||
"""
|
||||
ret = rocmsmi.rsmi_dev_fan_speed_max_get(device, sensor_ind, byref(fanMax))
|
||||
if rsmi_ret_ok(ret, device, 'get_fan_max_speed', True):
|
||||
fm = fanMax.value
|
||||
if fl == 0 or fm == 0:
|
||||
return (fl, 0) # to prevent division by zero crash
|
||||
|
||||
return (fl, round((float(fl) / float(fm)) * 100, 2))
|
||||
""" In case we had an error before, we don't overwrite it with a
|
||||
possible success now. Otherwise, we get the next error.
|
||||
"""
|
||||
if (last_ret == rsmi_status_t.RSMI_STATUS_SUCCESS):
|
||||
last_ret = ret
|
||||
|
||||
if fl == 0 or fm == 0:
|
||||
return (last_ret, fl, 0) # to prevent division by zero crash
|
||||
|
||||
return (last_ret, fl, round((float(fl) / float(fm)) * 100, 2))
|
||||
|
||||
|
||||
def getGpuUse(device):
|
||||
@@ -789,8 +804,12 @@ def resetFans(deviceList):
|
||||
for device in deviceList:
|
||||
sensor_ind = c_uint32(0)
|
||||
ret = rocmsmi.rsmi_dev_fan_reset(device, sensor_ind)
|
||||
if rsmi_ret_ok(ret, device, 'reset_fan'):
|
||||
printLog(device, 'Successfully reset fan speed to driver control', None)
|
||||
if (ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED) or (ret == rsmi_status_t.RSMI_STATUS_PERMISSION):
|
||||
if not rsmi_ret_ok(rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED, device, 'reset_fan'):
|
||||
continue
|
||||
else:
|
||||
if rsmi_ret_ok(ret, device, 'reset_fan'):
|
||||
printLog(device, 'Successfully reset fan speed to driver control', None)
|
||||
printLogSpacer()
|
||||
|
||||
|
||||
@@ -1586,18 +1605,24 @@ def showAllConcise(deviceList):
|
||||
|
||||
@param deviceList: List of DRM devices (can be a single-item list)
|
||||
"""
|
||||
global PRINT_JSON
|
||||
global PRINT_JSON, appWidth
|
||||
if PRINT_JSON:
|
||||
print('ERROR: Cannot print JSON/CSV output for concise output')
|
||||
sys.exit(1)
|
||||
|
||||
""" Place holder for the actual max size """
|
||||
MAX_ALL_CONCISE_WIDTH = 100
|
||||
appWidth_temp = appWidth
|
||||
appWidth = MAX_ALL_CONCISE_WIDTH
|
||||
|
||||
printLogSpacer(' Concise Info ')
|
||||
deviceList.sort()
|
||||
(temp_type, _) = findFirstAvailableTemp(deviceList[0])
|
||||
available_temp_type = temp_type.lower()
|
||||
available_temp_type = available_temp_type.replace('(', '')
|
||||
available_temp_type = available_temp_type.replace(')', '')
|
||||
header = ['GPU', 'Temp', 'AvgPwr', 'Partitions', 'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%']
|
||||
subheader = ['', temp_type, '', '(Mem, Compute)', '', '', '', '', '', '', '']
|
||||
header = ['GPU', '[Model : Revision]', 'Temp', 'AvgPwr', 'Partitions', 'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%']
|
||||
subheader = ['', 'Name (20 chars)', temp_type, '', '(Mem, Compute)', '', '', '', '', '', '', '']
|
||||
# add additional spaces to match header
|
||||
for idx, item in enumerate(subheader):
|
||||
header_size = len(header[idx])
|
||||
@@ -1614,6 +1639,8 @@ def showAllConcise(deviceList):
|
||||
values = {}
|
||||
degree_sign = u'\N{DEGREE SIGN}'
|
||||
for device in deviceList:
|
||||
gpu_dev_product_info = getDevProductInfo(device)
|
||||
gpu_dev_product_info_names = list(gpu_dev_product_info[device])
|
||||
temp_val = str(getTemp(device, available_temp_type))
|
||||
if temp_val != 'N/A':
|
||||
temp_val += degree_sign + 'C'
|
||||
@@ -1627,7 +1654,7 @@ def showAllConcise(deviceList):
|
||||
concise = True
|
||||
sclk = showCurrentClocks([device], 'sclk', concise)
|
||||
mclk = showCurrentClocks([device], 'mclk', concise)
|
||||
(fanLevel, fanSpeed) = getFanSpeed(device)
|
||||
(retCode, fanLevel, fanSpeed) = getFanSpeed(device)
|
||||
fan = str(fanSpeed) + '%'
|
||||
if getPerfLevel(device) != -1:
|
||||
perf = getPerfLevel(device)
|
||||
@@ -1647,10 +1674,19 @@ def showAllConcise(deviceList):
|
||||
mem_use_pct='Unsupported'
|
||||
if vram_used != None and vram_total != None and float(vram_total) != 0:
|
||||
mem_use_pct = '% 3.0f%%' % (100 * (float(vram_used) / float(vram_total)))
|
||||
values['card%s' % (str(device))] = [device, temp_val, avgPwr,
|
||||
|
||||
gpu_dev_product_info_top_name = gpu_dev_product_info_names[0]
|
||||
if (len(gpu_dev_product_info_names) > 1):
|
||||
values['card%s_Info' % (str(device))] = ['', gpu_dev_product_info_names[0], '', '', '',
|
||||
'', '', '',
|
||||
'', '', '', '']
|
||||
gpu_dev_product_info_top_name = gpu_dev_product_info_names[1]
|
||||
|
||||
values['card%s' % (str(device))] = [device, gpu_dev_product_info_top_name, temp_val, avgPwr,
|
||||
combined_partition, sclk, mclk,
|
||||
fan, str(perf).lower(), pwrCap,
|
||||
mem_use_pct, gpu_busy]
|
||||
|
||||
val_widths = {}
|
||||
for device in deviceList:
|
||||
val_widths[device] = [len(str(val)) + 2 for val in values['card%s' % (str(device))]]
|
||||
@@ -1662,10 +1698,19 @@ def showAllConcise(deviceList):
|
||||
printLog(None, "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), subheader)),
|
||||
None, useItalics=True)
|
||||
printLogSpacer(fill='=')
|
||||
|
||||
for device in deviceList:
|
||||
printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in
|
||||
zip(range(len(max_widths)), values['card%s' % (str(device))])), None)
|
||||
gpu_dev_product_info = getDevProductInfo(device)
|
||||
gpu_dev_product_info_names = list(gpu_dev_product_info[device])
|
||||
if (len(gpu_dev_product_info_names) > 1):
|
||||
printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in
|
||||
zip(range(len(max_widths)), values['card%s_Info' % (str(device))])), None)
|
||||
|
||||
printLogSpacer()
|
||||
""" Restore original max size """
|
||||
appWidth = appWidth_temp
|
||||
|
||||
|
||||
def showAllConciseHw(deviceList):
|
||||
@@ -1835,22 +1880,25 @@ def showCurrentFans(deviceList):
|
||||
sensor_ind = c_uint32(0)
|
||||
|
||||
for device in deviceList:
|
||||
(fanLevel, fanSpeed) = getFanSpeed(device)
|
||||
fanSpeed = round(fanSpeed)
|
||||
if fanLevel == 0 or fanSpeed == 0:
|
||||
printLog(device, 'Unable to detect fan speed for GPU %d' % (device), None)
|
||||
logging.debug('Current fan speed is: %d\n' % (fanSpeed) + \
|
||||
' Current fan level is: %d\n' % (fanLevel) + \
|
||||
' (GPU might be cooled with a non-PWM fan)')
|
||||
continue
|
||||
if PRINT_JSON:
|
||||
printLog(device, 'Fan speed (level)', str(fanLevel))
|
||||
printLog(device, 'Fan speed (%)', str(fanSpeed))
|
||||
(retCode, fanLevel, fanSpeed) = getFanSpeed(device)
|
||||
if (retCode == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED):
|
||||
printLog(device, 'Not supported', None)
|
||||
else:
|
||||
printLog(device, 'Fan Level', str(fanLevel) + ' (%s%%)' % (str(fanSpeed)))
|
||||
ret = rocmsmi.rsmi_dev_fan_rpms_get(device, sensor_ind, byref(rpmSpeed))
|
||||
if rsmi_ret_ok(ret, device, 'get_fan_rpms'):
|
||||
printLog(device, 'Fan RPM', rpmSpeed.value)
|
||||
fanSpeed = round(fanSpeed)
|
||||
if fanLevel == 0 or fanSpeed == 0:
|
||||
printLog(device, 'Unable to detect fan speed for GPU %d' % (device), None)
|
||||
logging.debug('Current fan speed is: %d\n' % (fanSpeed) + \
|
||||
' Current fan level is: %d\n' % (fanLevel) + \
|
||||
' (GPU might be cooled with a non-PWM fan)')
|
||||
continue
|
||||
if PRINT_JSON:
|
||||
printLog(device, 'Fan speed (level)', str(fanLevel))
|
||||
printLog(device, 'Fan speed (%)', str(fanSpeed))
|
||||
else:
|
||||
printLog(device, 'Fan Level', str(fanLevel) + ' (%s%%)' % (str(fanSpeed)))
|
||||
ret = rocmsmi.rsmi_dev_fan_rpms_get(device, sensor_ind, byref(rpmSpeed))
|
||||
if rsmi_ret_ok(ret, device, 'get_fan_rpms'):
|
||||
printLog(device, 'Fan RPM', rpmSpeed.value)
|
||||
printLogSpacer()
|
||||
|
||||
|
||||
@@ -2375,6 +2423,58 @@ def showProductName(deviceList):
|
||||
printLogSpacer()
|
||||
|
||||
|
||||
def getDevProductInfo(device):
|
||||
""" Show the requested product name for the device requested
|
||||
|
||||
@param device: Device we want to get the info for
|
||||
"""
|
||||
|
||||
# Retrieve card vendor
|
||||
MAX_BUFF_SIZE = 256
|
||||
MAX_DESC_SIZE = 20
|
||||
device_info = "N/A"
|
||||
device_list = {}
|
||||
vendor = create_string_buffer(MAX_BUFF_SIZE)
|
||||
ret = rocmsmi.rsmi_dev_vendor_name_get(device, vendor, MAX_BUFF_SIZE)
|
||||
# Only continue if GPU vendor is AMD
|
||||
if rsmi_ret_ok(ret, device, 'get_vendor_name') and isAmdDevice(device):
|
||||
# Retrieve the device series
|
||||
series = create_string_buffer(MAX_BUFF_SIZE)
|
||||
ret = rocmsmi.rsmi_dev_name_get(device, series, MAX_BUFF_SIZE)
|
||||
if rsmi_ret_ok(ret, device, 'get_name'):
|
||||
try:
|
||||
device_series = series.value.decode()
|
||||
except UnicodeDecodeError:
|
||||
device_series = "N/A"
|
||||
printErrLog(device, "Unable to read card series")
|
||||
|
||||
# Retrieve the device model
|
||||
model = create_string_buffer(MAX_BUFF_SIZE)
|
||||
ret = rocmsmi.rsmi_dev_subsystem_name_get(device, model, MAX_BUFF_SIZE)
|
||||
if rsmi_ret_ok(ret, device, 'get_subsystem_name'):
|
||||
try:
|
||||
device_model = model.value.decode()
|
||||
device_model = padHexValue(device_model, 4)
|
||||
except UnicodeDecodeError:
|
||||
device_model = "N/A"
|
||||
printErrLog(device, "Unable to read device model")
|
||||
|
||||
try:
|
||||
gpu_revision = padHexValue(getRev(device), 2)
|
||||
except Exception as exc:
|
||||
gpu_revision = "N/A"
|
||||
printErrLog(device, "Unable to read card revision %s" % (exc))
|
||||
|
||||
device_series_str = str(device_series[:MAX_DESC_SIZE])
|
||||
device_series_str = device_series_str.ljust(MAX_DESC_SIZE, ' ')
|
||||
device_model_str = str(('[' + device_model + ' : ' + gpu_revision + ']'))
|
||||
device_model_str = str(device_model_str[:MAX_DESC_SIZE])
|
||||
device_model_str = device_model_str.ljust(MAX_DESC_SIZE, ' ')
|
||||
device_list = {device : [device_series_str, device_model_str]}
|
||||
|
||||
return device_list
|
||||
|
||||
|
||||
def showProfile(deviceList):
|
||||
""" Display available Power Profiles for a list of devices.
|
||||
|
||||
@@ -2996,7 +3096,7 @@ def showNumaTopology(deviceList):
|
||||
@param deviceList: List of DRM devices (can be a single-item list)
|
||||
"""
|
||||
printLogSpacer(' Numa Nodes ')
|
||||
numa_numbers = c_uint32()
|
||||
numa_numbers = c_int32()
|
||||
for device in deviceList:
|
||||
ret = rocmsmi.rsmi_topo_get_numa_node_number(device, byref(numa_numbers))
|
||||
if rsmi_ret_ok(ret, device, 'get_numa_node_number'):
|
||||
@@ -3356,7 +3456,7 @@ def save(deviceList, savefilepath):
|
||||
clocks[device][clk_type] = str(freq.current)
|
||||
else:
|
||||
clocks[device][clk_type] = '0'
|
||||
fanSpeeds[device] = getFanSpeed(device)[0]
|
||||
fanSpeeds[device] = getFanSpeed(device)[1]
|
||||
od = c_uint32()
|
||||
ret = rocmsmi.rsmi_dev_overdrive_level_get(device, byref(od))
|
||||
if rsmi_ret_ok(ret, device, 'get_overdrive_level'):
|
||||
|
||||
+4
-3
@@ -756,7 +756,7 @@ rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid) {
|
||||
}
|
||||
|
||||
rsmi_status_t
|
||||
rsmi_topo_numa_affinity_get(uint32_t dv_ind, uint32_t *numa_node) {
|
||||
rsmi_topo_numa_affinity_get(uint32_t dv_ind, int32_t *numa_node) {
|
||||
TRY
|
||||
rsmi_status_t ret;
|
||||
uint64_t val = 0;
|
||||
@@ -764,9 +764,10 @@ rsmi_topo_numa_affinity_get(uint32_t dv_ind, uint32_t *numa_node) {
|
||||
CHK_SUPPORT_NAME_ONLY(numa_node)
|
||||
|
||||
DEVICE_MUTEX
|
||||
ret = get_dev_value_int(amd::smi::kDevNumaNode, dv_ind, &val);
|
||||
std::string str_val;
|
||||
ret = get_dev_value_str(amd::smi::kDevNumaNode, dv_ind, &str_val);
|
||||
*numa_node = std::stol(str_val, 0);
|
||||
|
||||
*numa_node = static_cast<uint32_t>(val);
|
||||
return ret;
|
||||
CATCH
|
||||
}
|
||||
|
||||
@@ -928,7 +928,6 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
|
||||
case kDevDFCountersAvailable:
|
||||
case kDevMemBusyPercent:
|
||||
case kDevXGMIError:
|
||||
case kDevNumaNode:
|
||||
ret = readDevInfoStr(type, &tempStr);
|
||||
RET_IF_NONZERO(ret);
|
||||
if (tempStr == "") {
|
||||
@@ -1046,6 +1045,7 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) {
|
||||
case kDevAvailableComputePartition:
|
||||
case kDevComputePartition:
|
||||
case kDevMemoryPartition:
|
||||
case kDevNumaNode:
|
||||
return readDevInfoStr(type, val);
|
||||
break;
|
||||
|
||||
|
||||
@@ -90,6 +90,7 @@ void TestSysInfoRead::Run(void) {
|
||||
rsmi_status_t err;
|
||||
uint64_t val_ui64;
|
||||
uint32_t val_ui32;
|
||||
int32_t val_i32;
|
||||
char buffer[80];
|
||||
rsmi_version_t ver = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, nullptr};
|
||||
|
||||
@@ -136,11 +137,11 @@ void TestSysInfoRead::Run(void) {
|
||||
err = rsmi_dev_pci_id_get(i, nullptr);
|
||||
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
|
||||
|
||||
err = rsmi_topo_numa_affinity_get(i, &val_ui32);
|
||||
err = rsmi_topo_numa_affinity_get(i, &val_i32);
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**NUMA NODE: 0x" << std::hex << val_ui32;
|
||||
std::cout << " (" << std::dec << val_ui32 << ")" << std::endl;
|
||||
std::cout << "\t**NUMA NODE: 0x" << std::hex << val_i32;
|
||||
std::cout << " (" << std::dec << val_i32 << ")" << std::endl;
|
||||
}
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_topo_numa_affinity_get(i, nullptr);
|
||||
|
||||
@@ -57,18 +57,19 @@ $BLACKLIST_ALL_ASICS\
|
||||
"rsmitstReadWrite.TestPerfLevelReadWrite"
|
||||
|
||||
# SWDEV-391407
|
||||
# aqua_vanjaram and later systems show 'ip discovery' in
|
||||
# /sys/class/kfd/kfd/topology/nodes/*/name
|
||||
#
|
||||
# For those systems gfx_target_version must be used. It can be found in
|
||||
# /sys/class/kfd/kfd/topology/nodes/*/properties
|
||||
FILTER[90400]=\
|
||||
$BLACKLIST_ALL_ASICS\
|
||||
"rsmitstReadOnly.TestVoltCurvRead:"\
|
||||
"rsmitstReadOnly.TestFrequenciesRead:"\
|
||||
"rsmitstReadWrite.TestFrequenciesReadWrite:"\
|
||||
"rsmitstReadWrite.TestPowerReadWrite"
|
||||
FILTER[90401]=\
|
||||
$BLACKLIST_ALL_ASICS\
|
||||
"rsmitstReadOnly.TestVoltCurvRead:"\
|
||||
"rsmitstReadOnly.TestFrequenciesRead:"\
|
||||
"rsmitstReadWrite.TestFrequenciesReadWrite:"\
|
||||
"rsmitstReadWrite.TestPowerReadWrite"
|
||||
FILTER[90401]=${FILTER[90400]}
|
||||
FILTER[90402]=${FILTER[90400]}
|
||||
|
||||
# SWDEV-321166
|
||||
FILTER[virtualization]=\
|
||||
|
||||
Reference in New Issue
Block a user