Revert "Revert "Merge amd-staging into amd-master 20230602""
This reverts commit dae9c3c9aa. Signed-off-by: Hao Zhou <Hao.Zhou@amd.com> Change-Id: I38b7d0ca4535503bf0b9ba491de0eb747f3dd966 [ROCm/rocm_smi_lib commit:255b4d122b]
This commit is contained in:
@@ -23,6 +23,9 @@ set(ROCM_SMI_COMPONENT "lib${ROCM_SMI}")
|
||||
set(ROCM_SMI_TARGET "${ROCM_SMI}64")
|
||||
set(ROCM_SMI_LIB_NAME "lib${ROCM_SMI_TARGET}")
|
||||
|
||||
# Expose project info to IDEs
|
||||
option(CMAKE_EXPORT_COMPILE_COMMANDS "Export compile commands for linters and autocompleters" ON)
|
||||
|
||||
# provide git to utilities
|
||||
find_program (GIT NAMES git)
|
||||
|
||||
@@ -55,9 +58,6 @@ project(${AMD_SMI_LIBS_TARGET})
|
||||
include(GNUInstallDirs)
|
||||
set(COMMON_PROJ_ROOT ${PROJECT_SOURCE_DIR})
|
||||
|
||||
## Verbose output.
|
||||
set(CMAKE_VERBOSE_MAKEFILE on)
|
||||
|
||||
if (CMAKE_COMPILER_IS_GNUCC AND
|
||||
CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.4.0)
|
||||
message("Compiler version is " ${CMAKE_CXX_COMPILER_VERSION})
|
||||
|
||||
@@ -35,9 +35,14 @@ do_configureLogrotate() {
|
||||
size 1M
|
||||
copytruncate
|
||||
dateext
|
||||
dateformat .%Y-%m-%d_%H:%M:%S
|
||||
dateformat .%%Y-%%m-%%d_%H:%%M:%%S
|
||||
}
|
||||
EOF
|
||||
# Fix for %S argument not found (now we escape with %%)
|
||||
# issue was RPM build thought we were using macros
|
||||
# https://gitlab.kitware.com/cmake/cmake/-/issues/22965
|
||||
# https://rpm-software-management.github.io/rpm/manual/spec.html
|
||||
sudo sed -i s/%%/%/g /etc/logrotate.d/rocm_smi.conf
|
||||
# workaround: remove extra 'OURCE' text
|
||||
# from rocm_smi.conf. Unsure if CMAKE,
|
||||
# bash, or here document
|
||||
|
||||
@@ -34,9 +34,14 @@ do_configureLogrotate() {
|
||||
size 1M
|
||||
copytruncate
|
||||
dateext
|
||||
dateformat .%Y-%m-%d_%H:%M:%S
|
||||
dateformat .%%Y-%%m-%%d_%H:%%M:%%S
|
||||
}
|
||||
EOF
|
||||
# Fix for %S argument not found (now we escape with %%)
|
||||
# issue was RPM build thought we were using macros
|
||||
# https://gitlab.kitware.com/cmake/cmake/-/issues/22965
|
||||
# https://rpm-software-management.github.io/rpm/manual/spec.html
|
||||
sudo sed -i s/%%/%/g /etc/logrotate.d/rocm_smi.conf
|
||||
# workaround: remove extra 'OURCE' text
|
||||
# from rocm_smi.conf. Unsure if CMAKE,
|
||||
# bash, or here document
|
||||
|
||||
@@ -326,7 +326,7 @@ def getPerfLevel(device):
|
||||
ret = rocmsmi.rsmi_dev_perf_level_get(device, byref(perf))
|
||||
if rsmi_ret_ok(ret, device, 'get_perf_level'):
|
||||
return perf_level_string(perf.value)
|
||||
return -1
|
||||
return 'N/A'
|
||||
|
||||
|
||||
def getPid(name):
|
||||
@@ -540,8 +540,7 @@ def printEventList(device, delay, eventList):
|
||||
print2DArray([['\rGPU[%d]:\t' % (device), ctime().split()[3], notification_type_names[data.event.value - 1],
|
||||
data.message.decode('utf8') + '\r']])
|
||||
|
||||
|
||||
def printLog(device, metricName, value, extraSpace=False):
|
||||
def printLog(device, metricName, value=None, extraSpace=False):
|
||||
""" Print out to the SMI log
|
||||
|
||||
@param device: DRM device identifier
|
||||
@@ -900,7 +899,7 @@ def setClockRange(deviceList, clkType, minvalue, maxvalue, autoRespond):
|
||||
try:
|
||||
int(minvalue) & int(maxvalue)
|
||||
except ValueError:
|
||||
printErrLog(device, 'Unable to set %s range' % (clkType))
|
||||
printErrLog(None, 'Unable to set %s range' % (clkType))
|
||||
logging.error('%s or %s is not an integer', minvalue, maxvalue)
|
||||
RETCODE = 1
|
||||
return
|
||||
@@ -1076,7 +1075,7 @@ def setClocks(deviceList, clktype, clk):
|
||||
int(check_value)
|
||||
except ValueError:
|
||||
printLog(None, 'Unable to set clock level', None)
|
||||
logging.error('Non-integer characters are present in value %s', value)
|
||||
logging.error('Non-integer characters are present in value %s', check_value)
|
||||
RETCODE = 1
|
||||
return
|
||||
# Generate a frequency bitmask from user input value
|
||||
@@ -1155,7 +1154,7 @@ def setPerfDeterminism(deviceList, clkvalue):
|
||||
try:
|
||||
int(clkvalue)
|
||||
except ValueError:
|
||||
printErrLog(device, 'Unable to set Performance Determinism')
|
||||
printErrLog(None, 'Unable to set Performance Determinism')
|
||||
logging.error('%s is not an integer', clkvalue)
|
||||
RETCODE = 1
|
||||
return
|
||||
@@ -1225,19 +1224,16 @@ def setRas(deviceList, rasAction, rasBlock, rasType):
|
||||
printLog(None, "This is experimental feature, use 'amdgpuras' tool for ras error manipulations for newer vbios")
|
||||
|
||||
if rasAction not in validRasActions:
|
||||
printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType),
|
||||
None)
|
||||
printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType))
|
||||
logging.debug('Action %s is not a valid RAS command' % rasAction)
|
||||
return
|
||||
if rasBlock not in validRasBlocks:
|
||||
printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType),
|
||||
None)
|
||||
printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType))
|
||||
printLog(None, 'Block %s is not a valid RAS block' % rasBlock)
|
||||
return
|
||||
|
||||
if rasType not in validRasTypes:
|
||||
printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType),
|
||||
None)
|
||||
printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType))
|
||||
printLog(None, 'Memory error type %s is not a valid RAS memory type' % rasAction)
|
||||
return
|
||||
|
||||
@@ -1277,7 +1273,6 @@ def setFanSpeed(deviceList, fan):
|
||||
for device in deviceList:
|
||||
if str(fan):
|
||||
fanLevel = c_int64()
|
||||
sensor_ind = c_uint32(0)
|
||||
last_char = str(fan)[-1]
|
||||
if last_char == '%':
|
||||
fanLevel = int(str(fan)[:-1]) / 100 * 255
|
||||
@@ -1675,7 +1670,6 @@ def showClocks(deviceList):
|
||||
printLogSpacer(' Supported clock frequencies ')
|
||||
for device in deviceList:
|
||||
for clk_type in sorted(rsmi_clk_names_dict):
|
||||
freq_list = []
|
||||
if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], None) == 1:
|
||||
ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq))
|
||||
if rsmi_ret_ok(ret, device, 'get_clk_freq_' + clk_type, True):
|
||||
@@ -1694,7 +1688,6 @@ def showClocks(deviceList):
|
||||
ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw))
|
||||
if rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth', True):
|
||||
printLog(device, 'Supported %s frequencies on GPU%s' % ('PCIe', str(device)), None)
|
||||
freq_list = []
|
||||
for x in range(bw.transfer_rate.num_supported):
|
||||
fr = '{:>.1f}GT/s x{}'.format(bw.transfer_rate.frequency[x] / 1000000000, bw.lanes[x])
|
||||
if x == bw.transfer_rate.current:
|
||||
@@ -1718,8 +1711,6 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False):
|
||||
global PRINT_JSON
|
||||
freq = rsmi_frequencies_t()
|
||||
bw = rsmi_pcie_bandwidth_t()
|
||||
currentString = ''
|
||||
sortedClocksArray = []
|
||||
if not concise:
|
||||
printLogSpacer(' Current clock frequencies ')
|
||||
for device in deviceList:
|
||||
@@ -1832,7 +1823,6 @@ def showFwInfo(deviceList, fwType):
|
||||
firmware_blocks = fwType
|
||||
printLogSpacer(' Firmware Information ')
|
||||
for device in deviceList:
|
||||
fw_ver_list = []
|
||||
fw_ver = c_uint64()
|
||||
for fw_name in firmware_blocks:
|
||||
fw_name = fw_name.upper()
|
||||
@@ -2054,10 +2044,13 @@ def showMemVendor(deviceList):
|
||||
printLogSpacer(' Memory Vendor ')
|
||||
for device in deviceList:
|
||||
ret = rocmsmi.rsmi_dev_vram_vendor_get(device, vendor, 256)
|
||||
if rsmi_ret_ok(ret, device, 'get_vram_vendor') and vendor.value.decode():
|
||||
printLog(device, 'GPU memory vendor', vendor.value.decode())
|
||||
else:
|
||||
logging.debug('GPU memory vendor missing or not supported')
|
||||
try:
|
||||
if rsmi_ret_ok(ret, device, 'get_vram_vendor') and vendor.value.decode():
|
||||
printLog(device, 'GPU memory vendor', vendor.value.decode())
|
||||
else:
|
||||
logging.debug('GPU memory vendor missing or not supported')
|
||||
except UnicodeDecodeError:
|
||||
printErrLog(device, 'Unable to read GPU memory vendor')
|
||||
printLogSpacer()
|
||||
|
||||
|
||||
@@ -2070,6 +2063,8 @@ def showOverDrive(deviceList, odtype):
|
||||
rsmi_od = c_uint32()
|
||||
printLogSpacer(' OverDrive Level ')
|
||||
for device in deviceList:
|
||||
odStr = ''
|
||||
od = ''
|
||||
if odtype == 'sclk':
|
||||
odStr = 'GPU'
|
||||
ret = rocmsmi.rsmi_dev_overdrive_level_get(device, byref(rsmi_od))
|
||||
@@ -2085,7 +2080,6 @@ def showOverDrive(deviceList, odtype):
|
||||
else:
|
||||
printErrLog(device, 'Unable to retrieve OverDrive')
|
||||
logging.error('Unsupported clock type %s', odtype)
|
||||
RETCODE = 1
|
||||
printLog(device, odStr + ' OverDrive value (%)', od)
|
||||
printLogSpacer()
|
||||
|
||||
@@ -2380,7 +2374,7 @@ def showRasInfo(deviceList, rasType):
|
||||
for name in rasType:
|
||||
if name.upper() not in rsmi_gpu_block_d:
|
||||
rasType.remove(name)
|
||||
printErrLog(device, '%s is not a RAS block' % (name))
|
||||
printErrLog(None, '%s is not a RAS block' % (name))
|
||||
|
||||
rasBlocks = [block.upper() for block in rasType]
|
||||
|
||||
@@ -2522,15 +2516,15 @@ def showEvents(deviceList, eventTypes):
|
||||
printErrLog(None, 'Ignoring unrecognized event type %s' % (event.replace(',', '')))
|
||||
if len(eventTypeList) == 0:
|
||||
eventTypeList = notification_type_names
|
||||
try:
|
||||
print2DArray([['DEVICE\t', 'TIME\t', 'TYPE\t', 'DESCRIPTION']])
|
||||
# Create a separate thread for each GPU
|
||||
for device in deviceList:
|
||||
_thread.start_new_thread(printEventList, (device, 1000, eventTypeList))
|
||||
time.sleep(0.25)
|
||||
except Exception as e:
|
||||
printErrLog(device, 'Unable to start new thread. %s' % (e))
|
||||
return
|
||||
try:
|
||||
_thread.start_new_thread(printEventList, (device, 1000, eventTypeList))
|
||||
time.sleep(0.25)
|
||||
except Exception as e:
|
||||
printErrLog(device, 'Unable to start new thread. %s' % (e))
|
||||
return
|
||||
while 1: # Exit condition from user keyboard input of 'q' or 'ctrl + c'
|
||||
getch = _Getch()
|
||||
user_input = getch()
|
||||
@@ -2555,16 +2549,19 @@ def printTempGraph(deviceList, delay):
|
||||
printEmptyLine()
|
||||
originalTerminalWidth = os.get_terminal_size()[0]
|
||||
while 1: # Exit condition from user keyboard input of 'q' or 'ctrl + c'
|
||||
printString = ''
|
||||
terminalWidth = os.get_terminal_size()[0]
|
||||
printStrings = list()
|
||||
for device in deviceList:
|
||||
temp = getTemp(device, 'edge')
|
||||
percentage = temp
|
||||
if temp == 'N/A':
|
||||
percentage = 0
|
||||
else:
|
||||
percentage = temp
|
||||
if percentage >= 100:
|
||||
percentage = 100
|
||||
if percentage < 0:
|
||||
percentage = 0
|
||||
# Get available space based on terminal width
|
||||
terminalWidth = os.get_terminal_size()[0]
|
||||
availableSpace = 0
|
||||
if terminalWidth >= 20:
|
||||
availableSpace = terminalWidth - 20
|
||||
@@ -2587,13 +2584,17 @@ def printTempGraph(deviceList, delay):
|
||||
return
|
||||
# Two spare Spaces
|
||||
tempString = (str(int(temp)) + '°C').ljust(5)
|
||||
printString += '\033[2;30;47mGPU[%d] Temp %s|%s%s\x1b[0m%s\r\n' % (device, tempString, color, paddingSpace[1:], remainderSpace)
|
||||
printStrings.append('\033[2;30;47mGPU[%d] Temp %s|%s%s\x1b[0m%s' % (device, tempString, color, paddingSpace[1:], remainderSpace))
|
||||
originalTerminalWidth = terminalWidth
|
||||
time.sleep((delay / 1000))
|
||||
|
||||
if terminalWidth >= 20:
|
||||
for i in range(devices):
|
||||
printString = '\033[A' + printString
|
||||
print(printString, end = '\r')
|
||||
# go up and prepare to rewrite the lines
|
||||
for i in printStrings:
|
||||
print('\033[A', end='\r')
|
||||
# print all strings
|
||||
for i in printStrings:
|
||||
print(i, end='\r\n')
|
||||
|
||||
|
||||
def getGraphColor(percentage):
|
||||
@@ -2966,7 +2967,7 @@ def showNodesBw(deviceList):
|
||||
else:
|
||||
gpu_links_type[srcdevice][destdevice] = "N/A"
|
||||
if PRINT_JSON:
|
||||
formatMatrixToJSON(deviceList, "{}-{}".format(minBW.value, maxBW.value), " min-max bandwidth between DRM devices {} and {}".format(srcdevice, destdevice))
|
||||
# TODO
|
||||
return
|
||||
printTableRow(None, ' ')
|
||||
for row in deviceList:
|
||||
@@ -3219,7 +3220,10 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False):
|
||||
returnString += '%s GPU[%s]:' % (my_ret, device)
|
||||
if metric is not None:
|
||||
returnString += ' %s: ' % (metric)
|
||||
returnString += '%s\t' % (err_str.value.decode())
|
||||
else:
|
||||
metric = ''
|
||||
if err_str.value is not None:
|
||||
returnString += '%s\t' % (err_str.value.decode())
|
||||
if not PRINT_JSON:
|
||||
logging.debug('%s', returnString)
|
||||
if not silent:
|
||||
@@ -3279,7 +3283,7 @@ def save(deviceList, savefilepath):
|
||||
'fan': fanSpeeds[device], 'overdrivesclk': overDriveGpu[device],
|
||||
'overdrivemclk': overDriveGpuMem[device], 'profile': profiles[device],
|
||||
'perflevel': perfLevels[device]}
|
||||
printLog(device, 'Current settings successfully saved to', savefilepath)
|
||||
printLog(None, 'Current settings successfully saved to', savefilepath)
|
||||
with open(savefilepath, 'w') as savefile:
|
||||
json.dump(jsonData, savefile, ensure_ascii=True)
|
||||
printLogSpacer()
|
||||
|
||||
@@ -2237,16 +2237,79 @@ rsmi_dev_vendor_name_get(uint32_t dv_ind, char *name, size_t len) {
|
||||
|
||||
rsmi_status_t
|
||||
rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) {
|
||||
rsmi_status_t ret;
|
||||
TRY
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
CHK_SUPPORT_NAME_ONLY(b)
|
||||
|
||||
GET_DEV_AND_KFDNODE_FROM_INDX
|
||||
CHK_API_SUPPORT_ONLY((b), RSMI_DEFAULT_VARIANT, RSMI_DEFAULT_VARIANT)
|
||||
DEVICE_MUTEX
|
||||
|
||||
return get_frequencies(amd::smi::kDevPCIEClk, RSMI_CLK_TYPE_PCIE, dv_ind,
|
||||
ret = get_frequencies(amd::smi::kDevPCIEClk, RSMI_CLK_TYPE_PCIE, dv_ind,
|
||||
&b->transfer_rate, b->lanes);
|
||||
if (ret == RSMI_STATUS_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Only fallback to gpu_metric if connecting via PCIe
|
||||
if (kfd_node->numa_node_type() != amd::smi::IOLINK_TYPE_PCIEXPRESS) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
rsmi_gpu_metrics_t gpu_metrics;
|
||||
ret = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Hardcode based on PCIe specification: https://en.wikipedia.org/wiki/PCI_Express
|
||||
const uint32_t link_width[] = {1, 2, 4, 8, 12, 16};
|
||||
const uint32_t link_speed[] = {25, 50, 80, 160}; // 0.1 Ghz
|
||||
const uint32_t WIDTH_DATA_LENGTH = sizeof(link_width)/sizeof(uint32_t);
|
||||
const uint32_t SPEED_DATA_LENGTH = sizeof(link_speed)/sizeof(uint32_t);
|
||||
|
||||
// Calculate the index
|
||||
int width_index = -1;
|
||||
int speed_index = -1;
|
||||
uint32_t cur_index = 0;
|
||||
for (cur_index = 0; cur_index < WIDTH_DATA_LENGTH; cur_index++) {
|
||||
if (link_width[cur_index] == gpu_metrics.pcie_link_width) {
|
||||
width_index = cur_index;
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (cur_index = 0;
|
||||
cur_index < SPEED_DATA_LENGTH; cur_index++) {
|
||||
if (link_speed[cur_index] == gpu_metrics.pcie_link_speed) {
|
||||
speed_index = cur_index;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (width_index == -1 || speed_index == -1) {
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
// Set possible lanes and frequencies
|
||||
b->transfer_rate.num_supported = WIDTH_DATA_LENGTH * SPEED_DATA_LENGTH;
|
||||
b->transfer_rate.current = speed_index*WIDTH_DATA_LENGTH + width_index;
|
||||
for (cur_index = 0;
|
||||
cur_index < WIDTH_DATA_LENGTH * SPEED_DATA_LENGTH; cur_index++) {
|
||||
b->transfer_rate.frequency[cur_index]
|
||||
= link_speed[cur_index/WIDTH_DATA_LENGTH] * 100 * 1000000L;
|
||||
b->lanes[cur_index] = link_width[cur_index % WIDTH_DATA_LENGTH];
|
||||
}
|
||||
/*
|
||||
frequency = {2500, 2500, 2500, 2500, 2500, 2500,
|
||||
5000, 5000, 5000, 5000, 5000, 5000,
|
||||
8000, 8000, 8000, 8000, 8000, 8000,
|
||||
16000, 16000, 16000, 16000, 16000, 16000}; // Mhz
|
||||
lanes = {1, 2, 4, 8, 12, 16,
|
||||
1, 2, 4, 8, 12, 16,
|
||||
1, 2, 4, 8, 12, 16,
|
||||
1, 2, 4, 8, 12, 16 }; // For each frequency
|
||||
*/
|
||||
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
|
||||
CATCH
|
||||
}
|
||||
|
||||
@@ -50,6 +50,7 @@
|
||||
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
Fai riferimento in un nuovo problema
Block a user