diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index 03ee2bbe02..4d09bb5ea9 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -12,10 +12,47 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ### Changed -- **Removed `amdsmi_get_gpu_process_info` from python library** +- **Updated `amd-smi metric --ecc-blocks` output** +The ecc blocks argument was outputting blocks without counters available; updated the filtering to show only blocks that counters are available for: + +``` shell +$ amd-smi metric --ecc-blocks +GPU: 0 + ECC_BLOCKS: + UMC: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + SDMA: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + GFX: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + MMHUB: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + PCIE_BIF: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + HDP: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + XGMI_WAFL: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 +``` + +- **Removed `amdsmi_get_gpu_process_info` from python library** amdsmi_get_gpu_process_info was removed from the C library in an earlier build, but the API was still in the python interface -- **Updated metrics --clocks** +- **Updated metrics --clocks** Output for `amd-smi metric --clock` is updated to reflect each engine and bug fixes for the clock lock status and deep sleep status. 
``` shell diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 31c32f0aa4..17427ff34e 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -1692,14 +1692,15 @@ class AMDSMICommands(): if "ecc_blocks" in current_platform_args: if args.ecc_blocks: ecc_dict = {} - uncountable_blocks = ["ATHUB", "DF", "SMN", "SEM", "FUSE"] + sysfs_blocks = ["UMC", "SDMA", "GFX", "MMHUB", "PCIE_BIF", "HDP", "XGMI_WAFL"] try: ras_states = amdsmi_interface.amdsmi_get_gpu_ras_block_features_enabled(args.gpu) for state in ras_states: + # Only add enabled blocks that are also in sysfs if state['status'] == amdsmi_interface.AmdSmiRasErrState.ENABLED.name: gpu_block = amdsmi_interface.AmdSmiGpuBlock[state['block']] # if the blocks are uncountable do not add them at all. - if gpu_block.name not in uncountable_blocks: + if gpu_block.name in sysfs_blocks: try: ecc_count = amdsmi_interface.amdsmi_get_gpu_ecc_count(args.gpu, gpu_block) ecc_dict[state['block']] = {'correctable_count' : ecc_count['correctable_count'], diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index ba73c093b9..c5adb70252 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -964,10 +964,10 @@ typedef enum { */ typedef enum { AMDSMI_GPU_BLOCK_INVALID = 0x0000000000000000, //!< Used to indicate an - //!< invalid block + //!< invalid block AMDSMI_GPU_BLOCK_FIRST = 0x0000000000000001, - AMDSMI_GPU_BLOCK_UMC = AMDSMI_GPU_BLOCK_FIRST, //!< UMC block + AMDSMI_GPU_BLOCK_UMC = AMDSMI_GPU_BLOCK_FIRST, //!< UMC block AMDSMI_GPU_BLOCK_SDMA = 0x0000000000000002, //!< SDMA block AMDSMI_GPU_BLOCK_GFX = 0x0000000000000004, //!< GFX block AMDSMI_GPU_BLOCK_MMHUB = 0x0000000000000008, //!< MMHUB block @@ -981,9 +981,14 @@ typedef enum { AMDSMI_GPU_BLOCK_MP0 = 0x0000000000000800, //!< MP0 block AMDSMI_GPU_BLOCK_MP1 = 
0x0000000000001000, //!< MP1 block AMDSMI_GPU_BLOCK_FUSE = 0x0000000000002000, //!< Fuse block + AMDSMI_GPU_BLOCK_MCA = 0x0000000000004000, //!< MCA block + AMDSMI_GPU_BLOCK_VCN = 0x0000000000008000, //!< VCN block + AMDSMI_GPU_BLOCK_JPEG = 0x0000000000010000, //!< JPEG block + AMDSMI_GPU_BLOCK_IH = 0x0000000000020000, //!< IH block + AMDSMI_GPU_BLOCK_MPIO = 0x0000000000040000, //!< MPIO block - AMDSMI_GPU_BLOCK_LAST = AMDSMI_GPU_BLOCK_FUSE, //!< The highest bit position - //!< for supported blocks + AMDSMI_GPU_BLOCK_LAST = AMDSMI_GPU_BLOCK_MPIO, //!< The highest bit position + //!< for supported blocks AMDSMI_GPU_BLOCK_RESERVED = 0x8000000000000000 } amdsmi_gpu_block_t; diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index 6245f194a3..31bf22adae 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -300,6 +300,11 @@ class AmdSmiGpuBlock(IntEnum): MP0 = amdsmi_wrapper.AMDSMI_GPU_BLOCK_MP0 MP1 = amdsmi_wrapper.AMDSMI_GPU_BLOCK_MP1 FUSE = amdsmi_wrapper.AMDSMI_GPU_BLOCK_FUSE + MCA = amdsmi_wrapper.AMDSMI_GPU_BLOCK_MCA + VCN = amdsmi_wrapper.AMDSMI_GPU_BLOCK_VCN + JPEG = amdsmi_wrapper.AMDSMI_GPU_BLOCK_JPEG + IH = amdsmi_wrapper.AMDSMI_GPU_BLOCK_IH + MPIO = amdsmi_wrapper.AMDSMI_GPU_BLOCK_MPIO RESERVED = amdsmi_wrapper.AMDSMI_GPU_BLOCK_RESERVED @@ -1906,7 +1911,7 @@ def amdsmi_get_gpu_ras_block_features_enabled( if gpu_block.name == "RESERVED" or gpu_block.name == "INVALID": continue if gpu_block.name == "LAST": - gpu_block.name = "FUSE" + gpu_block.name = "MPIO" _check_res( amdsmi_wrapper.amdsmi_get_gpu_ras_block_features_enabled( processor_handle, @@ -1959,6 +1964,7 @@ def amdsmi_get_gpu_process_list( "vram_mem": process_list[index].memory_usage.vram_mem, }, }) + print(result) return result diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index d9116193fc..06ae08ce18 100644 --- 
a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -748,6 +748,19 @@ amdsmi_card_form_factor_t = ctypes.c_uint32 # enum class struct_amdsmi_pcie_info_t(Structure): pass +class struct_pcie_static_(Structure): + pass + +struct_pcie_static_._pack_ = 1 # source:False +struct_pcie_static_._fields_ = [ + ('max_pcie_width', ctypes.c_uint16), + ('PADDING_0', ctypes.c_ubyte * 2), + ('max_pcie_speed', ctypes.c_uint32), + ('pcie_interface_version', ctypes.c_uint32), + ('slot_type', amdsmi_card_form_factor_t), + ('reserved', ctypes.c_uint64 * 10), +] + class struct_pcie_metric_(Structure): pass @@ -766,19 +779,6 @@ struct_pcie_metric_._fields_ = [ ('reserved', ctypes.c_uint64 * 13), ] -class struct_pcie_static_(Structure): - pass - -struct_pcie_static_._pack_ = 1 # source:False -struct_pcie_static_._fields_ = [ - ('max_pcie_width', ctypes.c_uint16), - ('PADDING_0', ctypes.c_ubyte * 2), - ('max_pcie_speed', ctypes.c_uint32), - ('pcie_interface_version', ctypes.c_uint32), - ('slot_type', amdsmi_card_form_factor_t), - ('reserved', ctypes.c_uint64 * 10), -] - struct_amdsmi_pcie_info_t._pack_ = 1 # source:False struct_amdsmi_pcie_info_t._fields_ = [ ('pcie_static', struct_pcie_static_), @@ -1300,7 +1300,12 @@ amdsmi_gpu_block_t__enumvalues = { 2048: 'AMDSMI_GPU_BLOCK_MP0', 4096: 'AMDSMI_GPU_BLOCK_MP1', 8192: 'AMDSMI_GPU_BLOCK_FUSE', - 8192: 'AMDSMI_GPU_BLOCK_LAST', + 16384: 'AMDSMI_GPU_BLOCK_MCA', + 32768: 'AMDSMI_GPU_BLOCK_VCN', + 65536: 'AMDSMI_GPU_BLOCK_JPEG', + 131072: 'AMDSMI_GPU_BLOCK_IH', + 262144: 'AMDSMI_GPU_BLOCK_MPIO', + 262144: 'AMDSMI_GPU_BLOCK_LAST', 9223372036854775808: 'AMDSMI_GPU_BLOCK_RESERVED', } AMDSMI_GPU_BLOCK_INVALID = 0 @@ -1319,7 +1324,12 @@ AMDSMI_GPU_BLOCK_SEM = 1024 AMDSMI_GPU_BLOCK_MP0 = 2048 AMDSMI_GPU_BLOCK_MP1 = 4096 AMDSMI_GPU_BLOCK_FUSE = 8192 -AMDSMI_GPU_BLOCK_LAST = 8192 +AMDSMI_GPU_BLOCK_MCA = 16384 +AMDSMI_GPU_BLOCK_VCN = 32768 +AMDSMI_GPU_BLOCK_JPEG = 65536 +AMDSMI_GPU_BLOCK_IH = 131072 
+AMDSMI_GPU_BLOCK_MPIO = 262144 +AMDSMI_GPU_BLOCK_LAST = 262144 AMDSMI_GPU_BLOCK_RESERVED = 9223372036854775808 amdsmi_gpu_block_t = ctypes.c_uint64 # enum @@ -2380,17 +2390,19 @@ __all__ = \ 'AMDSMI_GPU_BLOCK_ATHUB', 'AMDSMI_GPU_BLOCK_DF', 'AMDSMI_GPU_BLOCK_FIRST', 'AMDSMI_GPU_BLOCK_FUSE', 'AMDSMI_GPU_BLOCK_GFX', 'AMDSMI_GPU_BLOCK_HDP', - 'AMDSMI_GPU_BLOCK_INVALID', 'AMDSMI_GPU_BLOCK_LAST', - 'AMDSMI_GPU_BLOCK_MMHUB', 'AMDSMI_GPU_BLOCK_MP0', - 'AMDSMI_GPU_BLOCK_MP1', 'AMDSMI_GPU_BLOCK_PCIE_BIF', + 'AMDSMI_GPU_BLOCK_IH', 'AMDSMI_GPU_BLOCK_INVALID', + 'AMDSMI_GPU_BLOCK_JPEG', 'AMDSMI_GPU_BLOCK_LAST', + 'AMDSMI_GPU_BLOCK_MCA', 'AMDSMI_GPU_BLOCK_MMHUB', + 'AMDSMI_GPU_BLOCK_MP0', 'AMDSMI_GPU_BLOCK_MP1', + 'AMDSMI_GPU_BLOCK_MPIO', 'AMDSMI_GPU_BLOCK_PCIE_BIF', 'AMDSMI_GPU_BLOCK_RESERVED', 'AMDSMI_GPU_BLOCK_SDMA', 'AMDSMI_GPU_BLOCK_SEM', 'AMDSMI_GPU_BLOCK_SMN', - 'AMDSMI_GPU_BLOCK_UMC', 'AMDSMI_GPU_BLOCK_XGMI_WAFL', - 'AMDSMI_HSMP_TIMEOUT', 'AMDSMI_INIT_ALL_PROCESSORS', - 'AMDSMI_INIT_AMD_APUS', 'AMDSMI_INIT_AMD_CPUS', - 'AMDSMI_INIT_AMD_GPUS', 'AMDSMI_INIT_NON_AMD_CPUS', - 'AMDSMI_INIT_NON_AMD_GPUS', 'AMDSMI_INVALID_POWER', - 'AMDSMI_IOLINK_TYPE_NUMIOLINKTYPES', + 'AMDSMI_GPU_BLOCK_UMC', 'AMDSMI_GPU_BLOCK_VCN', + 'AMDSMI_GPU_BLOCK_XGMI_WAFL', 'AMDSMI_HSMP_TIMEOUT', + 'AMDSMI_INIT_ALL_PROCESSORS', 'AMDSMI_INIT_AMD_APUS', + 'AMDSMI_INIT_AMD_CPUS', 'AMDSMI_INIT_AMD_GPUS', + 'AMDSMI_INIT_NON_AMD_CPUS', 'AMDSMI_INIT_NON_AMD_GPUS', + 'AMDSMI_INVALID_POWER', 'AMDSMI_IOLINK_TYPE_NUMIOLINKTYPES', 'AMDSMI_IOLINK_TYPE_PCIEXPRESS', 'AMDSMI_IOLINK_TYPE_SIZE', 'AMDSMI_IOLINK_TYPE_UNDEFINED', 'AMDSMI_IOLINK_TYPE_XGMI', 'AMDSMI_LINK_TYPE_NOT_APPLICABLE', 'AMDSMI_LINK_TYPE_PCIE', diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h index b6420d7933..3749690067 100755 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h @@ -608,8 +608,13 @@ typedef enum { 
RSMI_GPU_BLOCK_MP0 = 0x0000000000000800, //!< MP0 block RSMI_GPU_BLOCK_MP1 = 0x0000000000001000, //!< MP1 block RSMI_GPU_BLOCK_FUSE = 0x0000000000002000, //!< Fuse block + RSMI_GPU_BLOCK_MCA = 0x0000000000004000, //!< MCA block + RSMI_GPU_BLOCK_VCN = 0x0000000000008000, //!< VCN block + RSMI_GPU_BLOCK_JPEG = 0x0000000000010000, //!< JPEG block + RSMI_GPU_BLOCK_IH = 0x0000000000020000, //!< IH block + RSMI_GPU_BLOCK_MPIO = 0x0000000000040000, //!< MPIO block - RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_FUSE, //!< The highest bit position + RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_MPIO, //!< The highest bit position //!< for supported blocks RSMI_GPU_BLOCK_RESERVED = 0x8000000000000000 } rsmi_gpu_block_t; diff --git a/projects/amdsmi/rocm_smi/python_smi_tools/rsmiBindings.py b/projects/amdsmi/rocm_smi/python_smi_tools/rsmiBindings.py index 884793468f..94d493d7ea 100644 --- a/projects/amdsmi/rocm_smi/python_smi_tools/rsmiBindings.py +++ b/projects/amdsmi/rocm_smi/python_smi_tools/rsmiBindings.py @@ -331,7 +331,13 @@ class rsmi_gpu_block_t(c_int): RSMI_GPU_BLOCK_MP0 = 0x0000000000000800 RSMI_GPU_BLOCK_MP1 = 0x0000000000001000 RSMI_GPU_BLOCK_FUSE = 0x0000000000002000 - RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_FUSE + RSMI_GPU_BLOCK_MCA = 0x0000000000004000 + RSMI_GPU_BLOCK_VCN = 0x0000000000008000 + RSMI_GPU_BLOCK_JPEG = 0x0000000000010000 + RSMI_GPU_BLOCK_IH = 0x0000000000020000 + RSMI_GPU_BLOCK_MPIO = 0x0000000000040000 + + RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_MPIO RSMI_GPU_BLOCK_RESERVED = 0x8000000000000000 @@ -340,20 +346,25 @@ rsmi_gpu_block = rsmi_gpu_block_t # The following dictionary correlates with rsmi_gpu_block_t enum rsmi_gpu_block_d = { - 'UMC' : 0x0000000000000001, - 'SDMA' : 0x0000000000000002, - 'GFX' : 0x0000000000000004, - 'MMHUB': 0x0000000000000008, - 'ATHUB': 0x0000000000000010, - 'PCIE_BIF': 0x0000000000000020, - 'HDP': 0x0000000000000040, - 'XGMI_WAFL': 0x0000000000000080, - 'DF': 0x0000000000000100, - 'SMN': 0x0000000000000200, - 'SEM': 0x0000000000000400, - 
'MP0': 0x0000000000000800, - 'MP1': 0x0000000000001000, - 'FUSE': 0x0000000000002000 + 'UMC' : 0x0000000000000001, + 'SDMA' : 0x0000000000000002, + 'GFX' : 0x0000000000000004, + 'MMHUB' : 0x0000000000000008, + 'ATHUB' : 0x0000000000000010, + 'PCIE_BIF' : 0x0000000000000020, + 'HDP' : 0x0000000000000040, + 'XGMI_WAFL' : 0x0000000000000080, + 'DF' : 0x0000000000000100, + 'SMN' : 0x0000000000000200, + 'SEM' : 0x0000000000000400, + 'MP0' : 0x0000000000000800, + 'MP1' : 0x0000000000001000, + 'FUSE' : 0x0000000000002000, + 'MCA' : 0x0000000000004000, + 'VCN' : 0x0000000000008000, + 'JPEG' : 0x0000000000010000, + 'IH' : 0x0000000000020000, + 'MPIO' : 0x0000000000040000, } diff --git a/projects/amdsmi/tests/amd_smi_test/test_common.cc b/projects/amdsmi/tests/amd_smi_test/test_common.cc index 7237e4cc89..669c8186f1 100644 --- a/projects/amdsmi/tests/amd_smi_test/test_common.cc +++ b/projects/amdsmi/tests/amd_smi_test/test_common.cc @@ -91,8 +91,13 @@ static const std::map kBlockNameMap = { {AMDSMI_GPU_BLOCK_MP0, "MP0"}, {AMDSMI_GPU_BLOCK_MP1, "MP1"}, {AMDSMI_GPU_BLOCK_FUSE, "FUSE"}, + {AMDSMI_GPU_BLOCK_MCA, "MCA"}, + {AMDSMI_GPU_BLOCK_VCN, "VCN"}, + {AMDSMI_GPU_BLOCK_JPEG, "JPEG"}, + {AMDSMI_GPU_BLOCK_IH, "IH"}, + {AMDSMI_GPU_BLOCK_MPIO, "MPIO"}, }; -static_assert(AMDSMI_GPU_BLOCK_LAST == AMDSMI_GPU_BLOCK_FUSE, +static_assert(AMDSMI_GPU_BLOCK_LAST == AMDSMI_GPU_BLOCK_MPIO, "kBlockNameMap needs to be updated"); static const char * kRasErrStateStrings[] = {