From 65cf46dc7630989abc0a0652875fd63f3b55d32c Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Thu, 4 Apr 2024 17:00:50 -0500 Subject: [PATCH 1/6] GIT - Sync dependabot settings with amdsmi Change-Id: Id67a7f5273fd274291a1044dca50cc4006e853a5 Signed-off-by: Galantsev, Dmitrii --- .github/dependabot.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 276690bd4f..8cfb43a90c 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -9,4 +9,10 @@ updates: directory: "/docs/sphinx" # Location of package manifests open-pull-requests-limit: 10 schedule: - interval: "daily" + interval: "monthly" + labels: + - "documentation" + - "dependencies" + - "ci:docs-only" + reviewers: + - "samjwu" From 38d1275d64f06800f3cf46ce198c4ab0cd6bbf42 Mon Sep 17 00:00:00 2001 From: Daniel Martinez Date: Thu, 15 Feb 2024 12:42:54 -0500 Subject: [PATCH 2/6] change CMAKE_HOST_SYSTEM_PROCESSOR to CMAKE_SYSTEM_PROCESSOR Change-Id: I8e379676091903e2af3909e6d90daf6d62b8232c Signed-off-by: Galantsev, Dmitrii --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9490c73c48..ce86a39643 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,7 +79,7 @@ endif() ## Compiler flags set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -fno-rtti") -if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "x86_64") +if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse -msse2") endif() From 9e2a6ea4bfb50eb3c576903029111dfb3a39eaf4 Mon Sep 17 00:00:00 2001 From: Junyi Hou Date: Fri, 15 Mar 2024 06:53:27 +0000 Subject: [PATCH 3/6] Fix typos in rocm_smi.py, README.md, rsmiBindings.py Change-Id: Ib03cec6130983a56657a388799fc2afaf3b8f728 Signed-off-by: Galantsev, Dmitrii --- python_smi_tools/README.md | 4 ++-- python_smi_tools/rocm_smi.py | 8 ++++---- python_smi_tools/rsmiBindings.py | 6 +++--- 
python_smi_tools/rsmiBindings.py.in | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/python_smi_tools/README.md b/python_smi_tools/README.md index e2d86311b2..1fa33eba11 100644 --- a/python_smi_tools/README.md +++ b/python_smi_tools/README.md @@ -175,13 +175,13 @@ Set options: --rasenable BLOCK ERRTYPE Enable RAS for specified block and error type --rasdisable BLOCK ERRTYPE Disable RAS for specified block and error type --rasinject BLOCK Inject RAS poison for specified block (ONLY WORKS ON - UNSECURE BOARDS) + UNSECURED BOARDS) Reset options: -r, --resetclocks Reset clocks and OverDrive to default --resetfans Reset fans to automatic (driver) control --resetprofile Reset Power Profile back to default - --resetpoweroverdrive Set the maximum GPU power back to the device deafult + --resetpoweroverdrive Set the maximum GPU power back to the device default state --resetxgmierr Reset XGMI error count --resetperfdeterminism Disable performance determinism diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index ae6435debe..d804f7d054 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -2982,7 +2982,7 @@ def showEvents(deviceList, eventTypes): if len(eventTypeList) == 0: eventTypeList = notification_type_names print2DArray([['DEVICE\t', 'TIME\t', 'TYPE\t', 'DESCRIPTION']]) - # Create a seperate thread for each GPU + # Create a separate thread for each GPU for device in deviceList: try: _thread.start_new_thread(printEventList, (device, 1000, eventTypeList)) @@ -3683,7 +3683,7 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False): :param device: DRM device identifier :param my_ret: Return of RSMI call (rocm_smi_lib API) :param metric: Parameter of GPU currently being analyzed - :param silent: Echo verbose error reponse. + :param silent: Echo verbose error response. True silences err output, False does not silence err output (default). 
""" global RETCODE @@ -3869,7 +3869,7 @@ if __name__ == '__main__': groupActionReset.add_argument('--resetfans', help='Reset fans to automatic (driver) control', action='store_true') groupActionReset.add_argument('--resetprofile', help='Reset Power Profile back to default', action='store_true') groupActionReset.add_argument('--resetpoweroverdrive', - help='Set the maximum GPU power back to the device deafult state', + help='Set the maximum GPU power back to the device default state', action='store_true') groupActionReset.add_argument('--resetxgmierr', help='Reset XGMI error count', action='store_true') groupActionReset.add_argument('--resetperfdeterminism', help='Disable performance determinism', action='store_true') @@ -3921,7 +3921,7 @@ if __name__ == '__main__': groupAction.add_argument('--rasdisable', help='Disable RAS for specified block and error type', type=str, nargs=2, metavar=('BLOCK', 'ERRTYPE')) groupAction.add_argument('--rasinject', - help='Inject RAS poison for specified block (ONLY WORKS ON UNSECURE BOARDS)', type=str, + help='Inject RAS poison for specified block (ONLY WORKS ON UNSECURED BOARDS)', type=str, metavar='BLOCK', nargs=1) groupActionGpuReset.add_argument('--gpureset', help='Reset specified GPU (One GPU must be specified)', action='store_true') diff --git a/python_smi_tools/rsmiBindings.py b/python_smi_tools/rsmiBindings.py index 6a55370c0a..0483dacf59 100644 --- a/python_smi_tools/rsmiBindings.py +++ b/python_smi_tools/rsmiBindings.py @@ -67,11 +67,11 @@ rsmi_status_verbose_err_out = { rsmi_status_t.RSMI_STATUS_OUT_OF_RESOURCES: 'Unable to acquire memory or other resource', rsmi_status_t.RSMI_STATUS_INTERNAL_EXCEPTION: 'An internal exception was caught', rsmi_status_t.RSMI_STATUS_INPUT_OUT_OF_BOUNDS: 'Provided input is out of allowable or safe range', - rsmi_status_t.RSMI_INITIALIZATION_ERROR: 'Error occured during rsmi initialization', + rsmi_status_t.RSMI_INITIALIZATION_ERROR: 'Error occurred during rsmi initialization', 
rsmi_status_t.RSMI_STATUS_NOT_YET_IMPLEMENTED: 'Requested function is not implemented on this setup', rsmi_status_t.RSMI_STATUS_NOT_FOUND: 'Item searched for but not found', rsmi_status_t.RSMI_STATUS_INSUFFICIENT_SIZE: 'Insufficient resources available', - rsmi_status_t.RSMI_STATUS_INTERRUPT: 'Interrupt occured during execution', + rsmi_status_t.RSMI_STATUS_INTERRUPT: 'Interrupt occurred during execution', rsmi_status_t.RSMI_STATUS_UNEXPECTED_SIZE: 'Unexpected amount of data read', rsmi_status_t.RSMI_STATUS_NO_DATA: 'No data found for the given input', rsmi_status_t.RSMI_STATUS_UNEXPECTED_DATA: 'Unexpected data received', @@ -639,4 +639,4 @@ rsmi_power_type_dict = { 0: 'AVERAGE', 1: 'CURRENT SOCKET', 0xFFFFFFFF: 'INVALID_POWER_TYPE' -} \ No newline at end of file +} diff --git a/python_smi_tools/rsmiBindings.py.in b/python_smi_tools/rsmiBindings.py.in index d6ff405f0c..d53010f4df 100644 --- a/python_smi_tools/rsmiBindings.py.in +++ b/python_smi_tools/rsmiBindings.py.in @@ -99,11 +99,11 @@ rsmi_status_verbose_err_out = { rsmi_status_t.RSMI_STATUS_OUT_OF_RESOURCES: 'Unable to acquire memory or other resource', rsmi_status_t.RSMI_STATUS_INTERNAL_EXCEPTION: 'An internal exception was caught', rsmi_status_t.RSMI_STATUS_INPUT_OUT_OF_BOUNDS: 'Provided input is out of allowable or safe range', - rsmi_status_t.RSMI_INITIALIZATION_ERROR: 'Error occured during rsmi initialization', + rsmi_status_t.RSMI_INITIALIZATION_ERROR: 'Error occurred during rsmi initialization', rsmi_status_t.RSMI_STATUS_NOT_YET_IMPLEMENTED: 'Requested function is not implemented on this setup', rsmi_status_t.RSMI_STATUS_NOT_FOUND: 'Item searched for but not found', rsmi_status_t.RSMI_STATUS_INSUFFICIENT_SIZE: 'Insufficient resources available', - rsmi_status_t.RSMI_STATUS_INTERRUPT: 'Interrupt occured during execution', + rsmi_status_t.RSMI_STATUS_INTERRUPT: 'Interrupt occurred during execution', rsmi_status_t.RSMI_STATUS_UNEXPECTED_SIZE: 'Unexpected amount of data read', 
rsmi_status_t.RSMI_STATUS_NO_DATA: 'No data found for the given input', rsmi_status_t.RSMI_STATUS_UNEXPECTED_DATA: 'Unexpected data received', @@ -671,4 +671,4 @@ rsmi_power_type_dict = { 0: 'AVERAGE', 1: 'CURRENT SOCKET', 0xFFFFFFFF: 'INVALID_POWER_TYPE' -} \ No newline at end of file +} From 54af22ca614a9349b6a02cfff691501f19365dda Mon Sep 17 00:00:00 2001 From: WhiskyAKM <35374730+PTFOPlayer@users.noreply.github.com> Date: Mon, 18 Mar 2024 18:48:31 +0100 Subject: [PATCH 4/6] Update rocm_smi.h Fix for issue: https://github.com/ROCm/rocm_smi_lib/issues/162 Change-Id: I8778f5b8c034f2289625acb841676c144c967aa3 Signed-off-by: Galantsev, Dmitrii --- include/rocm_smi/rocm_smi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index 7fc686958e..74c83ca561 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -1290,7 +1290,7 @@ rsmi_status_t rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision); * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid * */ -rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, char *sku); +rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, uint16_t *sku); /** * @brief Get the device vendor id associated with the device with provided From b86f92230daa0d562cb7ca7f0c9a5cc03305ce30 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Thu, 14 Mar 2024 23:44:57 -0500 Subject: [PATCH 5/6] [SWDEV-450463] Fix --showmemuse clarity * Updates: - [CLI] Updated --showmemuse: -> Add VRAM%, provide better context as "GPU Memory Allocated (VRAM%)" -> Update "GPU memory use (%)" as "GPU Memory Read/Write Activity (%)" - [CLI] Updated --showmaxpower and rocm-smi (no arg) -> Rounding was inconsistent with values past the decimal point. 
This provides the floor value of the device Change-Id: Ib76dea2cb8483a1d7f53df675b0a94d8d01c81b9 Signed-off-by: Charis Poag --- python_smi_tools/rocm_smi.py | 54 ++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index d804f7d054..aed3292de6 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -418,9 +418,45 @@ def getMaxPower(device, silent=False): power_cap = c_uint64() ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(power_cap)) if rsmi_ret_ok(ret, device, 'get_power_cap', silent): - return power_cap.value / 1000000 + # take floor of result (round down to nearest integer) + return float(power_cap.value / 1000000) // 1 return -1 +def getAllocatedMemoryPercent(device): + """ Return dictionary of allocated memory (VRAM) of a given device + Response of allocated_memory_vram dictionary: + + .. code-block:: python + + { + 'value': float allocated vram memory (floor of %) or 'N/A' (for rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED), + 'unit': %, + 'combined': string (eg. 
'30%') or 'N/A' (for rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED) + 'ret': rsmi_status_t.RSMI_STATUS_SUCCESS or rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED + } + + :param device: DRM device identifier + """ + allocated_memory_vram = { + 'value': "N/A", + 'unit': '%', + 'combined': "N/A", + 'ret': rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED + } + vram_used, vram_total = getMemInfo(device, 'vram', silent=True) + mem_use_pct = 0 + if vram_used is None: + return allocated_memory_vram + if vram_used != None and vram_total != None and float(vram_total) != 0: + # take floor of result (round down to nearest integer) + mem_use_pct = (100 * (float(vram_used) / float(vram_total))) // 1 + allocated_memory_vram['value'] = mem_use_pct + mem_use_pct = '{:<.0f}%'.format(mem_use_pct) # left aligned + # values with no precision + allocated_memory_vram['combined'] = mem_use_pct + allocated_memory_vram['ret'] = rsmi_status_t.RSMI_STATUS_SUCCESS + return allocated_memory_vram + def getMemInfo(device, memType, silent=False): """ Returns a tuple of (memory_used, memory_total) of @@ -1959,14 +1995,7 @@ def showAllConcise(deviceList): gpu_busy = str(getGpuUse(device, silent)) + '%' else: gpu_busy = 'Unsupported' - vram_used, vram_total = getMemInfo(device, 'vram', silent) - mem_use_pct = 0 - if vram_used is None: - mem_use_pct='Unsupported' - if vram_used != None and vram_total != None and float(vram_total) != 0: - mem_use_pct = round(float(100 * (float(vram_used) / float(vram_total)))) - mem_use_pct = '{:<.0f}%'.format(mem_use_pct) # left aligned - # values with no precision + allocated_mem_percent = getAllocatedMemoryPercent(device) # Top Row - per device data values['card%s' % (str(device))] = [device, getNodeId(device), @@ -1976,7 +2005,7 @@ def showAllConcise(deviceList): combined_partition_data, sclk, mclk, fan, str(perf).lower(), str(pwrCap), - str(mem_use_pct), + allocated_mem_percent['combined'], str(gpu_busy)] val_widths = {} @@ -2477,9 +2506,12 @@ def showMemUse(deviceList): 
avgMemBandwidth = c_uint16() printLogSpacer(' Current Memory Use ') for device in deviceList: + allocated_mem_percent = getAllocatedMemoryPercent(device) + printLog(device, 'GPU Memory Allocated (VRAM%)', + int(allocated_mem_percent['value'])) ret = rocmsmi.rsmi_dev_memory_busy_percent_get(device, byref(memoryUse)) if rsmi_ret_ok(ret, device, '% memory use'): - printLog(device, 'GPU memory use (%)', memoryUse.value) + printLog(device, 'GPU Memory Read/Write Activity (%)', memoryUse.value) util_counters = getCoarseGrainUtil(device, "Memory Activity") if util_counters != -1: for ut_counter in util_counters: From adf5c1da67a0e253b624396af861d0189228a49b Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Wed, 3 Apr 2024 18:10:39 -0500 Subject: [PATCH 6/6] fix: [SWDEV-450058] [rocm/rocm_smi_lib] Fixes TestMeasureApiExecutionTime test fails Code changes related to the following: * Unit tests Change-Id: I6223078f219448deb6bfbd78edae371a5a4cf03c Signed-off-by: Oliveira, Daniel --- .../functional/measure_api_execution_time.cc | 67 +++++++++---------- 1 file changed, 31 insertions(+), 36 deletions(-) diff --git a/tests/rocm_smi_test/functional/measure_api_execution_time.cc b/tests/rocm_smi_test/functional/measure_api_execution_time.cc index 4cbbec7553..6f83e057ef 100644 --- a/tests/rocm_smi_test/functional/measure_api_execution_time.cc +++ b/tests/rocm_smi_test/functional/measure_api_execution_time.cc @@ -43,19 +43,16 @@ * */ -#include -#include - +#include +#include #include #include -#include #include "gtest/gtest.h" #include "rocm_smi/rocm_smi.h" #include "rocm_smi_test/functional/measure_api_execution_time.h" #include "rocm_smi_test/test_common.h" -#include "rocm_smi_test/test_utils.h" TestMeasureApiExecutionTime::TestMeasureApiExecutionTime() : TestBase() { @@ -92,6 +89,8 @@ void TestMeasureApiExecutionTime::Run(void) { rsmi_temperature_metric_t met = RSMI_TEMP_CURRENT; rsmi_status_t ret; float repeat = 300.0; + constexpr uint32_t kFAN_SPEED_ELAPSED_MS_BASE = 
(1000); + constexpr uint32_t kMETRICS_ELAPSED_MS_BASE = (1500); bool skip = false; TestBase::Run(); @@ -102,13 +101,15 @@ void TestMeasureApiExecutionTime::Run(void) { return; } + auto test_start = std::chrono::high_resolution_clock::now(); + auto prev = std::cout.precision(3); for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) { PrintDeviceHeader(dv_ind); //test execution time for rsmi_dev_fan_speed_get auto start = std::chrono::high_resolution_clock::now(); - for (int i=0; i < repeat; ++i){ + for (int i=0; i < static_cast(repeat); ++i){ ret = rsmi_dev_fan_speed_get(dv_ind, 0, &val_i64); } @@ -123,14 +124,14 @@ void TestMeasureApiExecutionTime::Run(void) { if (!skip) { std::cout << "\trsmi_dev_fan_speed_get execution time: " << - (float(duration.count()) / repeat) << " microseconds" << std::endl; - EXPECT_LT(duration.count(), 1000 * repeat); + (static_cast(duration.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration.count(), (kFAN_SPEED_ELAPSED_MS_BASE * repeat)); } skip = false; //test execution time for rsmi_dev_temp_metric_get start = std::chrono::high_resolution_clock::now(); - for (int i=0; i < repeat; ++i){ + for (int i=0; i < static_cast(repeat); ++i){ ret = rsmi_dev_temp_metric_get(dv_ind, 0, met, &val_i64); } stop = std::chrono::high_resolution_clock::now(); @@ -142,14 +143,14 @@ void TestMeasureApiExecutionTime::Run(void) { } if (!skip) { std::cout << "\trsmi_dev_temp_metric_get execution time: " << - (float(duration.count()) / repeat ) << " microseconds" << std::endl; - EXPECT_LT(duration.count(), 500 * repeat); + (static_cast(duration.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration.count(), (kMETRICS_ELAPSED_MS_BASE * repeat)); } skip = false; //test execution time for rsmi_dev_gpu_metrics_info_get start = std::chrono::high_resolution_clock::now(); - for (int i=0; i < repeat; ++i){ + for (int i=0; i < static_cast(repeat); ++i){ ret = rsmi_dev_gpu_metrics_info_get(dv_ind, &smu); } stop = 
std::chrono::high_resolution_clock::now(); @@ -161,42 +162,36 @@ void TestMeasureApiExecutionTime::Run(void) { } if (!skip) { std::cout << "\trsmi_dev_gpu_metrics_info_get execution time: " << - (float(duration.count()) / repeat ) << " microseconds" << std::endl; - EXPECT_LT(duration.count(), 500 * repeat); + (static_cast(duration.count()) / repeat ) << " microseconds" << std::endl; + EXPECT_LT(duration.count(), (kMETRICS_ELAPSED_MS_BASE * repeat)); } skip = false; - std::cout << "----------------------------------------------------------------------------" << std::endl; - auto val_ui16 = uint16_t(0); + auto val_ui16 = static_cast(0); auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); - auto start_api = std::chrono::high_resolution_clock::now(); - for (int i=0; i < repeat; ++i) { + start = std::chrono::high_resolution_clock::now(); + for (int i=0; i < static_cast(repeat); ++i){ status_code = rsmi_dev_metrics_xcd_counter_get(dv_ind, &val_ui16); } - auto stop_api = std::chrono::high_resolution_clock::now(); - auto duration_api = std::chrono::duration_cast(stop_api - start_api); + stop = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(stop - start); if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ skip = true; } if (!skip) { - std::cout << "\rsmi_dev_metrics_xcd_counter_get() execution time: " - << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; - EXPECT_LT(duration_api.count(), 500 * repeat); + std::cout << "\trsmi_dev_metrics_xcd_counter_get() execution time: " + << (static_cast(duration.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration.count(), (kMETRICS_ELAPSED_MS_BASE * repeat)); } skip = false; - std::cout << "----------------------------------------------------------------------------" << std::endl; - - stop = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(stop - start); - if (!skip) { - std::cout << "\rTotal execution time (All APIs): 
" - << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; - EXPECT_LT(duration_api.count(), (500 * repeat)); - } - skip = false; - std::cout << "============================================================================" << std::endl; - } - std::cout.precision(prev); + std::cout.precision(prev); + auto test_stop = std::chrono::high_resolution_clock::now(); + auto test_duration = std::chrono::duration_cast(test_stop - test_start); + + std::cout << "\n" << "============================================================================" << "\n"; + std::cout << " Total execution time (All APIs): " + << (static_cast(test_duration.count()) / repeat) << " microseconds" << "\n"; + std::cout << "============================================================================" << "\n"; }