Revert "Revert "Merge amd-staging into amd-master 20230602""

This reverts commit dae9c3c9aa.

Signed-off-by: Hao Zhou <Hao.Zhou@amd.com>
Change-Id: I38b7d0ca4535503bf0b9ba491de0eb747f3dd966


[ROCm/rocm_smi_lib commit: 255b4d122b]
This commit is contained in:
Hao Zhou
2023-06-07 11:56:29 +08:00
parent 8e890b0258
commit afe023e09c
6 file modificati con 126 aggiunte e 48 eliminazioni
+3 -3
Vedi File
@@ -23,6 +23,9 @@ set(ROCM_SMI_COMPONENT "lib${ROCM_SMI}")
set(ROCM_SMI_TARGET "${ROCM_SMI}64")
set(ROCM_SMI_LIB_NAME "lib${ROCM_SMI_TARGET}")
# Expose project info to IDEs
option(CMAKE_EXPORT_COMPILE_COMMANDS "Export compile commands for linters and autocompleters" ON)
# provide git to utilities
find_program (GIT NAMES git)
@@ -55,9 +58,6 @@ project(${AMD_SMI_LIBS_TARGET})
include(GNUInstallDirs)
set(COMMON_PROJ_ROOT ${PROJECT_SOURCE_DIR})
## Verbose output.
set(CMAKE_VERBOSE_MAKEFILE on)
if (CMAKE_COMPILER_IS_GNUCC AND
CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.4.0)
message("Compiler version is " ${CMAKE_CXX_COMPILER_VERSION})
+6 -1
Vedi File
@@ -35,9 +35,14 @@ do_configureLogrotate() {
size 1M
copytruncate
dateext
dateformat .%Y-%m-%d_%H:%M:%S
dateformat .%%Y-%%m-%%d_%H:%%M:%%S
}
EOF
# Fix for %S argument not found (now we escape with %%)
# issue was RPM build thought we were using macros
# https://gitlab.kitware.com/cmake/cmake/-/issues/22965
# https://rpm-software-management.github.io/rpm/manual/spec.html
sudo sed -i s/%%/%/g /etc/logrotate.d/rocm_smi.conf
# workaround: remove extra 'OURCE' text
# from rocm_smi.conf. Unsure if CMAKE,
# bash, or here document
+6 -1
Vedi File
@@ -34,9 +34,14 @@ do_configureLogrotate() {
size 1M
copytruncate
dateext
dateformat .%Y-%m-%d_%H:%M:%S
dateformat .%%Y-%%m-%%d_%H:%%M:%%S
}
EOF
# Fix for %S argument not found (now we escape with %%)
# issue was RPM build thought we were using macros
# https://gitlab.kitware.com/cmake/cmake/-/issues/22965
# https://rpm-software-management.github.io/rpm/manual/spec.html
sudo sed -i s/%%/%/g /etc/logrotate.d/rocm_smi.conf
# workaround: remove extra 'OURCE' text
# from rocm_smi.conf. Unsure if CMAKE,
# bash, or here document
@@ -326,7 +326,7 @@ def getPerfLevel(device):
ret = rocmsmi.rsmi_dev_perf_level_get(device, byref(perf))
if rsmi_ret_ok(ret, device, 'get_perf_level'):
return perf_level_string(perf.value)
return -1
return 'N/A'
def getPid(name):
@@ -540,8 +540,7 @@ def printEventList(device, delay, eventList):
print2DArray([['\rGPU[%d]:\t' % (device), ctime().split()[3], notification_type_names[data.event.value - 1],
data.message.decode('utf8') + '\r']])
def printLog(device, metricName, value, extraSpace=False):
def printLog(device, metricName, value=None, extraSpace=False):
""" Print out to the SMI log
@param device: DRM device identifier
@@ -900,7 +899,7 @@ def setClockRange(deviceList, clkType, minvalue, maxvalue, autoRespond):
try:
int(minvalue) & int(maxvalue)
except ValueError:
printErrLog(device, 'Unable to set %s range' % (clkType))
printErrLog(None, 'Unable to set %s range' % (clkType))
logging.error('%s or %s is not an integer', minvalue, maxvalue)
RETCODE = 1
return
@@ -1076,7 +1075,7 @@ def setClocks(deviceList, clktype, clk):
int(check_value)
except ValueError:
printLog(None, 'Unable to set clock level', None)
logging.error('Non-integer characters are present in value %s', value)
logging.error('Non-integer characters are present in value %s', check_value)
RETCODE = 1
return
# Generate a frequency bitmask from user input value
@@ -1155,7 +1154,7 @@ def setPerfDeterminism(deviceList, clkvalue):
try:
int(clkvalue)
except ValueError:
printErrLog(device, 'Unable to set Performance Determinism')
printErrLog(None, 'Unable to set Performance Determinism')
logging.error('%s is not an integer', clkvalue)
RETCODE = 1
return
@@ -1225,19 +1224,16 @@ def setRas(deviceList, rasAction, rasBlock, rasType):
printLog(None, "This is experimental feature, use 'amdgpuras' tool for ras error manipulations for newer vbios")
if rasAction not in validRasActions:
printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType),
None)
printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType))
logging.debug('Action %s is not a valid RAS command' % rasAction)
return
if rasBlock not in validRasBlocks:
printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType),
None)
printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType))
printLog(None, 'Block %s is not a valid RAS block' % rasBlock)
return
if rasType not in validRasTypes:
printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType),
None)
printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType))
printLog(None, 'Memory error type %s is not a valid RAS memory type' % rasAction)
return
@@ -1277,7 +1273,6 @@ def setFanSpeed(deviceList, fan):
for device in deviceList:
if str(fan):
fanLevel = c_int64()
sensor_ind = c_uint32(0)
last_char = str(fan)[-1]
if last_char == '%':
fanLevel = int(str(fan)[:-1]) / 100 * 255
@@ -1675,7 +1670,6 @@ def showClocks(deviceList):
printLogSpacer(' Supported clock frequencies ')
for device in deviceList:
for clk_type in sorted(rsmi_clk_names_dict):
freq_list = []
if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], None) == 1:
ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq))
if rsmi_ret_ok(ret, device, 'get_clk_freq_' + clk_type, True):
@@ -1694,7 +1688,6 @@ def showClocks(deviceList):
ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw))
if rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth', True):
printLog(device, 'Supported %s frequencies on GPU%s' % ('PCIe', str(device)), None)
freq_list = []
for x in range(bw.transfer_rate.num_supported):
fr = '{:>.1f}GT/s x{}'.format(bw.transfer_rate.frequency[x] / 1000000000, bw.lanes[x])
if x == bw.transfer_rate.current:
@@ -1718,8 +1711,6 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False):
global PRINT_JSON
freq = rsmi_frequencies_t()
bw = rsmi_pcie_bandwidth_t()
currentString = ''
sortedClocksArray = []
if not concise:
printLogSpacer(' Current clock frequencies ')
for device in deviceList:
@@ -1832,7 +1823,6 @@ def showFwInfo(deviceList, fwType):
firmware_blocks = fwType
printLogSpacer(' Firmware Information ')
for device in deviceList:
fw_ver_list = []
fw_ver = c_uint64()
for fw_name in firmware_blocks:
fw_name = fw_name.upper()
@@ -2054,10 +2044,13 @@ def showMemVendor(deviceList):
printLogSpacer(' Memory Vendor ')
for device in deviceList:
ret = rocmsmi.rsmi_dev_vram_vendor_get(device, vendor, 256)
if rsmi_ret_ok(ret, device, 'get_vram_vendor') and vendor.value.decode():
printLog(device, 'GPU memory vendor', vendor.value.decode())
else:
logging.debug('GPU memory vendor missing or not supported')
try:
if rsmi_ret_ok(ret, device, 'get_vram_vendor') and vendor.value.decode():
printLog(device, 'GPU memory vendor', vendor.value.decode())
else:
logging.debug('GPU memory vendor missing or not supported')
except UnicodeDecodeError:
printErrLog(device, 'Unable to read GPU memory vendor')
printLogSpacer()
@@ -2070,6 +2063,8 @@ def showOverDrive(deviceList, odtype):
rsmi_od = c_uint32()
printLogSpacer(' OverDrive Level ')
for device in deviceList:
odStr = ''
od = ''
if odtype == 'sclk':
odStr = 'GPU'
ret = rocmsmi.rsmi_dev_overdrive_level_get(device, byref(rsmi_od))
@@ -2085,7 +2080,6 @@ def showOverDrive(deviceList, odtype):
else:
printErrLog(device, 'Unable to retrieve OverDrive')
logging.error('Unsupported clock type %s', odtype)
RETCODE = 1
printLog(device, odStr + ' OverDrive value (%)', od)
printLogSpacer()
@@ -2380,7 +2374,7 @@ def showRasInfo(deviceList, rasType):
for name in rasType:
if name.upper() not in rsmi_gpu_block_d:
rasType.remove(name)
printErrLog(device, '%s is not a RAS block' % (name))
printErrLog(None, '%s is not a RAS block' % (name))
rasBlocks = [block.upper() for block in rasType]
@@ -2522,15 +2516,15 @@ def showEvents(deviceList, eventTypes):
printErrLog(None, 'Ignoring unrecognized event type %s' % (event.replace(',', '')))
if len(eventTypeList) == 0:
eventTypeList = notification_type_names
try:
print2DArray([['DEVICE\t', 'TIME\t', 'TYPE\t', 'DESCRIPTION']])
# Create a separate thread for each GPU
for device in deviceList:
_thread.start_new_thread(printEventList, (device, 1000, eventTypeList))
time.sleep(0.25)
except Exception as e:
printErrLog(device, 'Unable to start new thread. %s' % (e))
return
try:
_thread.start_new_thread(printEventList, (device, 1000, eventTypeList))
time.sleep(0.25)
except Exception as e:
printErrLog(device, 'Unable to start new thread. %s' % (e))
return
while 1: # Exit condition from user keyboard input of 'q' or 'ctrl + c'
getch = _Getch()
user_input = getch()
@@ -2555,16 +2549,19 @@ def printTempGraph(deviceList, delay):
printEmptyLine()
originalTerminalWidth = os.get_terminal_size()[0]
while 1: # Exit condition from user keyboard input of 'q' or 'ctrl + c'
printString = ''
terminalWidth = os.get_terminal_size()[0]
printStrings = list()
for device in deviceList:
temp = getTemp(device, 'edge')
percentage = temp
if temp == 'N/A':
percentage = 0
else:
percentage = temp
if percentage >= 100:
percentage = 100
if percentage < 0:
percentage = 0
# Get available space based on terminal width
terminalWidth = os.get_terminal_size()[0]
availableSpace = 0
if terminalWidth >= 20:
availableSpace = terminalWidth - 20
@@ -2587,13 +2584,17 @@ def printTempGraph(deviceList, delay):
return
# Two spare Spaces
tempString = (str(int(temp)) + '°C').ljust(5)
printString += '\033[2;30;47mGPU[%d] Temp %s|%s%s\x1b[0m%s\r\n' % (device, tempString, color, paddingSpace[1:], remainderSpace)
printStrings.append('\033[2;30;47mGPU[%d] Temp %s|%s%s\x1b[0m%s' % (device, tempString, color, paddingSpace[1:], remainderSpace))
originalTerminalWidth = terminalWidth
time.sleep((delay / 1000))
if terminalWidth >= 20:
for i in range(devices):
printString = '\033[A' + printString
print(printString, end = '\r')
# go up and prepare to rewrite the lines
for i in printStrings:
print('\033[A', end='\r')
# print all strings
for i in printStrings:
print(i, end='\r\n')
def getGraphColor(percentage):
@@ -2966,7 +2967,7 @@ def showNodesBw(deviceList):
else:
gpu_links_type[srcdevice][destdevice] = "N/A"
if PRINT_JSON:
formatMatrixToJSON(deviceList, "{}-{}".format(minBW.value, maxBW.value), " min-max bandwidth between DRM devices {} and {}".format(srcdevice, destdevice))
# TODO
return
printTableRow(None, ' ')
for row in deviceList:
@@ -3219,7 +3220,10 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False):
returnString += '%s GPU[%s]:' % (my_ret, device)
if metric is not None:
returnString += ' %s: ' % (metric)
returnString += '%s\t' % (err_str.value.decode())
else:
metric = ''
if err_str.value is not None:
returnString += '%s\t' % (err_str.value.decode())
if not PRINT_JSON:
logging.debug('%s', returnString)
if not silent:
@@ -3279,7 +3283,7 @@ def save(deviceList, savefilepath):
'fan': fanSpeeds[device], 'overdrivesclk': overDriveGpu[device],
'overdrivemclk': overDriveGpuMem[device], 'profile': profiles[device],
'perflevel': perfLevels[device]}
printLog(device, 'Current settings successfully saved to', savefilepath)
printLog(None, 'Current settings successfully saved to', savefilepath)
with open(savefilepath, 'w') as savefile:
json.dump(jsonData, savefile, ensure_ascii=True)
printLogSpacer()
+66 -3
Vedi File
@@ -2237,16 +2237,79 @@ rsmi_dev_vendor_name_get(uint32_t dv_ind, char *name, size_t len) {
rsmi_status_t
rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) {
rsmi_status_t ret;
TRY
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
LOG_TRACE(ss);
CHK_SUPPORT_NAME_ONLY(b)
GET_DEV_AND_KFDNODE_FROM_INDX
CHK_API_SUPPORT_ONLY((b), RSMI_DEFAULT_VARIANT, RSMI_DEFAULT_VARIANT)
DEVICE_MUTEX
return get_frequencies(amd::smi::kDevPCIEClk, RSMI_CLK_TYPE_PCIE, dv_ind,
ret = get_frequencies(amd::smi::kDevPCIEClk, RSMI_CLK_TYPE_PCIE, dv_ind,
&b->transfer_rate, b->lanes);
if (ret == RSMI_STATUS_SUCCESS) {
return ret;
}
// Only fallback to gpu_metric if connecting via PCIe
if (kfd_node->numa_node_type() != amd::smi::IOLINK_TYPE_PCIEXPRESS) {
return ret;
}
rsmi_gpu_metrics_t gpu_metrics;
ret = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics);
if (ret != RSMI_STATUS_SUCCESS) {
return ret;
}
// Hardcode based on PCIe specification: https://en.wikipedia.org/wiki/PCI_Express
const uint32_t link_width[] = {1, 2, 4, 8, 12, 16};
const uint32_t link_speed[] = {25, 50, 80, 160}; // 0.1 GHz
const uint32_t WIDTH_DATA_LENGTH = sizeof(link_width)/sizeof(uint32_t);
const uint32_t SPEED_DATA_LENGTH = sizeof(link_speed)/sizeof(uint32_t);
// Calculate the index
int width_index = -1;
int speed_index = -1;
uint32_t cur_index = 0;
for (cur_index = 0; cur_index < WIDTH_DATA_LENGTH; cur_index++) {
if (link_width[cur_index] == gpu_metrics.pcie_link_width) {
width_index = cur_index;
break;
}
}
for (cur_index = 0;
cur_index < SPEED_DATA_LENGTH; cur_index++) {
if (link_speed[cur_index] == gpu_metrics.pcie_link_speed) {
speed_index = cur_index;
break;
}
}
if (width_index == -1 || speed_index == -1) {
return RSMI_STATUS_NOT_SUPPORTED;
}
// Set possible lanes and frequencies
b->transfer_rate.num_supported = WIDTH_DATA_LENGTH * SPEED_DATA_LENGTH;
b->transfer_rate.current = speed_index*WIDTH_DATA_LENGTH + width_index;
for (cur_index = 0;
cur_index < WIDTH_DATA_LENGTH * SPEED_DATA_LENGTH; cur_index++) {
b->transfer_rate.frequency[cur_index]
= link_speed[cur_index/WIDTH_DATA_LENGTH] * 100 * 1000000L;
b->lanes[cur_index] = link_width[cur_index % WIDTH_DATA_LENGTH];
}
/*
frequency = {2500, 2500, 2500, 2500, 2500, 2500,
5000, 5000, 5000, 5000, 5000, 5000,
8000, 8000, 8000, 8000, 8000, 8000,
16000, 16000, 16000, 16000, 16000, 16000}; // MHz
lanes = {1, 2, 4, 8, 12, 16,
1, 2, 4, 8, 12, 16,
1, 2, 4, 8, 12, 16,
1, 2, 4, 8, 12, 16 }; // For each frequency
*/
return RSMI_STATUS_SUCCESS;
CATCH
}
@@ -50,6 +50,7 @@
#include <fstream>
#include <string>
#include <cstring>
#include <cstdint>
#include <iostream>
#include <sstream>