Merge amd-staging into amd-master 20240411

Signed-off-by: guanyu12 <guanyu12@amd.com>
Change-Id: I25ed71cca91a0d78110a995861cff93ba748e056
Этот коммит содержится в:
guanyu12
2024-04-11 10:24:26 +08:00
родитель 6fada8c4a6 adf5c1da67
Коммит 6881fc9c2e
8 изменённых файлов: 95 добавлений и 62 удалений
+7 -1
Просмотреть файл
@@ -9,4 +9,10 @@ updates:
directory: "/docs/sphinx" # Location of package manifests
open-pull-requests-limit: 10
schedule:
interval: "daily"
interval: "monthly"
labels:
- "documentation"
- "dependencies"
- "ci:docs-only"
reviewers:
- "samjwu"
+1 -1
Просмотреть файл
@@ -79,7 +79,7 @@ endif()
## Compiler flags
set(CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} -Wall -Wextra -fno-rtti")
if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "x86_64")
if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64")
set(CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} -m64 -msse -msse2")
endif()
+1 -1
Просмотреть файл
@@ -1290,7 +1290,7 @@ rsmi_status_t rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision);
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
*
*/
rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, char *sku);
rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, uint16_t *sku);
/**
* @brief Get the device vendor id associated with the device with provided
+2 -2
Просмотреть файл
@@ -175,13 +175,13 @@ Set options:
--rasenable BLOCK ERRTYPE Enable RAS for specified block and error type
--rasdisable BLOCK ERRTYPE Disable RAS for specified block and error type
--rasinject BLOCK Inject RAS poison for specified block (ONLY WORKS ON
UNSECURE BOARDS)
UNSECURED BOARDS)
Reset options:
-r, --resetclocks Reset clocks and OverDrive to default
--resetfans Reset fans to automatic (driver) control
--resetprofile Reset Power Profile back to default
--resetpoweroverdrive Set the maximum GPU power back to the device deafult
--resetpoweroverdrive Set the maximum GPU power back to the device default
state
--resetxgmierr Reset XGMI error count
--resetperfdeterminism Disable performance determinism
+47 -15
Просмотреть файл
@@ -418,9 +418,45 @@ def getMaxPower(device, silent=False):
power_cap = c_uint64()
ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(power_cap))
if rsmi_ret_ok(ret, device, 'get_power_cap', silent):
return power_cap.value / 1000000
# take floor of result (round down to nearest integer)
return float(power_cap.value / 1000000) // 1
return -1
def getAllocatedMemoryPercent(device):
    """ Return dictionary of allocated memory (VRAM) of a given device

    Response of allocated_memory_vram dictionary:

    .. code-block:: python

        {
            'value': float allocated vram memory (floor of %) or 'N/A' (for rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED),
            'unit': %,
            'combined': string (eg. '30%') or 'N/A' (for rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED)
            'ret': rsmi_status_t.RSMI_STATUS_SUCCESS or rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED
        }

    :param device: DRM device identifier
    """
    # Start from the "not supported" response; it is returned unchanged
    # whenever a usable used/total pair is not available.
    allocated_memory_vram = {
        'value': "N/A",
        'unit': '%',
        'combined': "N/A",
        'ret': rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED
    }
    vram_used, vram_total = getMemInfo(device, 'vram', silent=True)

    # Missing readings or a zero total (which would divide by zero) leave
    # the default "N/A" response in place.
    if vram_used is None or vram_total is None or float(vram_total) == 0:
        return allocated_memory_vram

    # take floor of result (round down to nearest integer)
    mem_use_pct = (100 * (float(vram_used) / float(vram_total))) // 1
    allocated_memory_vram['value'] = mem_use_pct
    # left aligned values with no precision
    allocated_memory_vram['combined'] = '{:<.0f}%'.format(mem_use_pct)
    allocated_memory_vram['ret'] = rsmi_status_t.RSMI_STATUS_SUCCESS
    return allocated_memory_vram
def getMemInfo(device, memType, silent=False):
""" Returns a tuple of (memory_used, memory_total) of
@@ -1958,14 +1994,7 @@ def showAllConcise(deviceList):
gpu_busy = str(getGpuUse(device, silent)) + '%'
else:
gpu_busy = 'Unsupported'
vram_used, vram_total = getMemInfo(device, 'vram', silent)
mem_use_pct = 0
if vram_used is None:
mem_use_pct='Unsupported'
if vram_used != None and vram_total != None and float(vram_total) != 0:
mem_use_pct = round(float(100 * (float(vram_used) / float(vram_total))))
mem_use_pct = '{:<.0f}%'.format(mem_use_pct) # left aligned
# values with no precision
allocated_mem_percent = getAllocatedMemoryPercent(device)
# Top Row - per device data
values['card%s' % (str(device))] = [device, getNodeId(device),
@@ -1975,7 +2004,7 @@ def showAllConcise(deviceList):
combined_partition_data,
sclk, mclk, fan, str(perf).lower(),
str(pwrCap),
str(mem_use_pct),
allocated_mem_percent['combined'],
str(gpu_busy)]
val_widths = {}
@@ -2476,9 +2505,12 @@ def showMemUse(deviceList):
avgMemBandwidth = c_uint16()
printLogSpacer(' Current Memory Use ')
for device in deviceList:
allocated_mem_percent = getAllocatedMemoryPercent(device)
printLog(device, 'GPU Memory Allocated (VRAM%)',
int(allocated_mem_percent['value']))
ret = rocmsmi.rsmi_dev_memory_busy_percent_get(device, byref(memoryUse))
if rsmi_ret_ok(ret, device, '% memory use'):
printLog(device, 'GPU memory use (%)', memoryUse.value)
printLog(device, 'GPU Memory Read/Write Activity (%)', memoryUse.value)
util_counters = getCoarseGrainUtil(device, "Memory Activity")
if util_counters != -1:
for ut_counter in util_counters:
@@ -2981,7 +3013,7 @@ def showEvents(deviceList, eventTypes):
if len(eventTypeList) == 0:
eventTypeList = notification_type_names
print2DArray([['DEVICE\t', 'TIME\t', 'TYPE\t', 'DESCRIPTION']])
# Create a seperate thread for each GPU
# Create a separate thread for each GPU
for device in deviceList:
try:
_thread.start_new_thread(printEventList, (device, 1000, eventTypeList))
@@ -3682,7 +3714,7 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False):
:param device: DRM device identifier
:param my_ret: Return of RSMI call (rocm_smi_lib API)
:param metric: Parameter of GPU currently being analyzed
:param silent: Echo verbose error reponse.
:param silent: Echo verbose error response.
True silences err output, False does not silence err output (default).
"""
global RETCODE
@@ -3868,7 +3900,7 @@ if __name__ == '__main__':
groupActionReset.add_argument('--resetfans', help='Reset fans to automatic (driver) control', action='store_true')
groupActionReset.add_argument('--resetprofile', help='Reset Power Profile back to default', action='store_true')
groupActionReset.add_argument('--resetpoweroverdrive',
help='Set the maximum GPU power back to the device deafult state',
help='Set the maximum GPU power back to the device default state',
action='store_true')
groupActionReset.add_argument('--resetxgmierr', help='Reset XGMI error count', action='store_true')
groupActionReset.add_argument('--resetperfdeterminism', help='Disable performance determinism', action='store_true')
@@ -3920,7 +3952,7 @@ if __name__ == '__main__':
groupAction.add_argument('--rasdisable', help='Disable RAS for specified block and error type', type=str, nargs=2,
metavar=('BLOCK', 'ERRTYPE'))
groupAction.add_argument('--rasinject',
help='Inject RAS poison for specified block (ONLY WORKS ON UNSECURE BOARDS)', type=str,
help='Inject RAS poison for specified block (ONLY WORKS ON UNSECURED BOARDS)', type=str,
metavar='BLOCK', nargs=1)
groupActionGpuReset.add_argument('--gpureset', help='Reset specified GPU (One GPU must be specified)',
action='store_true')
+3 -3
Просмотреть файл
@@ -67,11 +67,11 @@ rsmi_status_verbose_err_out = {
rsmi_status_t.RSMI_STATUS_OUT_OF_RESOURCES: 'Unable to acquire memory or other resource',
rsmi_status_t.RSMI_STATUS_INTERNAL_EXCEPTION: 'An internal exception was caught',
rsmi_status_t.RSMI_STATUS_INPUT_OUT_OF_BOUNDS: 'Provided input is out of allowable or safe range',
rsmi_status_t.RSMI_INITIALIZATION_ERROR: 'Error occured during rsmi initialization',
rsmi_status_t.RSMI_INITIALIZATION_ERROR: 'Error occurred during rsmi initialization',
rsmi_status_t.RSMI_STATUS_NOT_YET_IMPLEMENTED: 'Requested function is not implemented on this setup',
rsmi_status_t.RSMI_STATUS_NOT_FOUND: 'Item searched for but not found',
rsmi_status_t.RSMI_STATUS_INSUFFICIENT_SIZE: 'Insufficient resources available',
rsmi_status_t.RSMI_STATUS_INTERRUPT: 'Interrupt occured during execution',
rsmi_status_t.RSMI_STATUS_INTERRUPT: 'Interrupt occurred during execution',
rsmi_status_t.RSMI_STATUS_UNEXPECTED_SIZE: 'Unexpected amount of data read',
rsmi_status_t.RSMI_STATUS_NO_DATA: 'No data found for the given input',
rsmi_status_t.RSMI_STATUS_UNEXPECTED_DATA: 'Unexpected data received',
@@ -639,4 +639,4 @@ rsmi_power_type_dict = {
0: 'AVERAGE',
1: 'CURRENT SOCKET',
0xFFFFFFFF: 'INVALID_POWER_TYPE'
}
}
+3 -3
Просмотреть файл
@@ -99,11 +99,11 @@ rsmi_status_verbose_err_out = {
rsmi_status_t.RSMI_STATUS_OUT_OF_RESOURCES: 'Unable to acquire memory or other resource',
rsmi_status_t.RSMI_STATUS_INTERNAL_EXCEPTION: 'An internal exception was caught',
rsmi_status_t.RSMI_STATUS_INPUT_OUT_OF_BOUNDS: 'Provided input is out of allowable or safe range',
rsmi_status_t.RSMI_INITIALIZATION_ERROR: 'Error occured during rsmi initialization',
rsmi_status_t.RSMI_INITIALIZATION_ERROR: 'Error occurred during rsmi initialization',
rsmi_status_t.RSMI_STATUS_NOT_YET_IMPLEMENTED: 'Requested function is not implemented on this setup',
rsmi_status_t.RSMI_STATUS_NOT_FOUND: 'Item searched for but not found',
rsmi_status_t.RSMI_STATUS_INSUFFICIENT_SIZE: 'Insufficient resources available',
rsmi_status_t.RSMI_STATUS_INTERRUPT: 'Interrupt occured during execution',
rsmi_status_t.RSMI_STATUS_INTERRUPT: 'Interrupt occurred during execution',
rsmi_status_t.RSMI_STATUS_UNEXPECTED_SIZE: 'Unexpected amount of data read',
rsmi_status_t.RSMI_STATUS_NO_DATA: 'No data found for the given input',
rsmi_status_t.RSMI_STATUS_UNEXPECTED_DATA: 'Unexpected data received',
@@ -671,4 +671,4 @@ rsmi_power_type_dict = {
0: 'AVERAGE',
1: 'CURRENT SOCKET',
0xFFFFFFFF: 'INVALID_POWER_TYPE'
}
}
+31 -36
Просмотреть файл
@@ -43,19 +43,16 @@
*
*/
#include <stdint.h>
#include <stddef.h>
#include <chrono>
#include <cstdint>
#include <iostream>
#include <string>
#include <chrono>
#include "gtest/gtest.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi_test/functional/measure_api_execution_time.h"
#include "rocm_smi_test/test_common.h"
#include "rocm_smi_test/test_utils.h"
TestMeasureApiExecutionTime::TestMeasureApiExecutionTime() : TestBase() {
@@ -92,6 +89,8 @@ void TestMeasureApiExecutionTime::Run(void) {
rsmi_temperature_metric_t met = RSMI_TEMP_CURRENT;
rsmi_status_t ret;
float repeat = 300.0;
constexpr uint32_t kFAN_SPEED_ELAPSED_MS_BASE = (1000);
constexpr uint32_t kMETRICS_ELAPSED_MS_BASE = (1500);
bool skip = false;
TestBase::Run();
@@ -102,13 +101,15 @@ void TestMeasureApiExecutionTime::Run(void) {
return;
}
auto test_start = std::chrono::high_resolution_clock::now();
auto prev = std::cout.precision(3);
for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
PrintDeviceHeader(dv_ind);
//test execution time for rsmi_dev_fan_speed_get
auto start = std::chrono::high_resolution_clock::now();
for (int i=0; i < repeat; ++i){
for (int i=0; i < static_cast<int>(repeat); ++i){
ret = rsmi_dev_fan_speed_get(dv_ind, 0, &val_i64);
}
@@ -123,14 +124,14 @@ void TestMeasureApiExecutionTime::Run(void) {
if (!skip) {
std::cout << "\trsmi_dev_fan_speed_get execution time: " <<
(float(duration.count()) / repeat) << " microseconds" << std::endl;
EXPECT_LT(duration.count(), 1000 * repeat);
(static_cast<float>(duration.count()) / repeat) << " microseconds" << std::endl;
EXPECT_LT(duration.count(), (kFAN_SPEED_ELAPSED_MS_BASE * repeat));
}
skip = false;
//test execution time for rsmi_dev_temp_metric_get
start = std::chrono::high_resolution_clock::now();
for (int i=0; i < repeat; ++i){
for (int i=0; i < static_cast<int>(repeat); ++i){
ret = rsmi_dev_temp_metric_get(dv_ind, 0, met, &val_i64);
}
stop = std::chrono::high_resolution_clock::now();
@@ -142,14 +143,14 @@ void TestMeasureApiExecutionTime::Run(void) {
}
if (!skip) {
std::cout << "\trsmi_dev_temp_metric_get execution time: " <<
(float(duration.count()) / repeat ) << " microseconds" << std::endl;
EXPECT_LT(duration.count(), 500 * repeat);
(static_cast<float>(duration.count()) / repeat) << " microseconds" << std::endl;
EXPECT_LT(duration.count(), (kMETRICS_ELAPSED_MS_BASE * repeat));
}
skip = false;
//test execution time for rsmi_dev_gpu_metrics_info_get
start = std::chrono::high_resolution_clock::now();
for (int i=0; i < repeat; ++i){
for (int i=0; i < static_cast<int>(repeat); ++i){
ret = rsmi_dev_gpu_metrics_info_get(dv_ind, &smu);
}
stop = std::chrono::high_resolution_clock::now();
@@ -161,42 +162,36 @@ void TestMeasureApiExecutionTime::Run(void) {
}
if (!skip) {
std::cout << "\trsmi_dev_gpu_metrics_info_get execution time: " <<
(float(duration.count()) / repeat ) << " microseconds" << std::endl;
EXPECT_LT(duration.count(), 500 * repeat);
(static_cast<float>(duration.count()) / repeat ) << " microseconds" << std::endl;
EXPECT_LT(duration.count(), (kMETRICS_ELAPSED_MS_BASE * repeat));
}
skip = false;
std::cout << "----------------------------------------------------------------------------" << std::endl;
auto val_ui16 = uint16_t(0);
auto val_ui16 = static_cast<uint16_t>(0);
auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
auto start_api = std::chrono::high_resolution_clock::now();
for (int i=0; i < repeat; ++i) {
start = std::chrono::high_resolution_clock::now();
for (int i=0; i < static_cast<int>(repeat); ++i){
status_code = rsmi_dev_metrics_xcd_counter_get(dv_ind, &val_ui16);
}
auto stop_api = std::chrono::high_resolution_clock::now();
auto duration_api = std::chrono::duration_cast<std::chrono::microseconds>(stop_api - start_api);
stop = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){
skip = true;
}
if (!skip) {
std::cout << "\rsmi_dev_metrics_xcd_counter_get() execution time: "
<< (float(duration_api.count()) / repeat) << " microseconds" << std::endl;
EXPECT_LT(duration_api.count(), 500 * repeat);
std::cout << "\trsmi_dev_metrics_xcd_counter_get() execution time: "
<< (static_cast<float>(duration.count()) / repeat) << " microseconds" << std::endl;
EXPECT_LT(duration.count(), (kMETRICS_ELAPSED_MS_BASE * repeat));
}
skip = false;
std::cout << "----------------------------------------------------------------------------" << std::endl;
stop = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
if (!skip) {
std::cout << "\rTotal execution time (All APIs): "
<< (float(duration_api.count()) / repeat) << " microseconds" << std::endl;
EXPECT_LT(duration_api.count(), (500 * repeat));
}
skip = false;
std::cout << "============================================================================" << std::endl;
}
std::cout.precision(prev);
std::cout.precision(prev);
auto test_stop = std::chrono::high_resolution_clock::now();
auto test_duration = std::chrono::duration_cast<std::chrono::microseconds>(test_stop - test_start);
std::cout << "\n" << "============================================================================" << "\n";
std::cout << " Total execution time (All APIs): "
<< (static_cast<float>(test_duration.count()) / repeat) << " microseconds" << "\n";
std::cout << "============================================================================" << "\n";
}