Added new ecc blocks and adjusted metric --ecc-block filtering

Signed-off-by: Maisam Arif <maisarif@amd.com>
Change-Id: Ib2f69c7d59ee5108024794434fb202b5e4f58738


[ROCm/amdsmi commit: 1bd18c1a65]
Bu işleme şunda yer alıyor:
Maisam Arif
2024-04-12 12:43:32 -05:00
işlemeyi yapan: Maisam Arif
ebeveyn 9b4f0f1d2b
işleme cebb07e795
8 değiştirilmiş dosya ile 132 ekleme ve 50 silme
+39 -2
Dosyayı Görüntüle
@@ -12,10 +12,47 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/](
### Changed
- **Removed `amdsmi_get_gpu_process_info` from python library**
- **Updated `amd-smi metric --ecc-blocks` output**
The ecc blocks argument was outputting blocks without counters available; updated the filtering to show only the blocks for which counters are available:
``` shell
$ amd-smi metric --ecc-block
GPU: 0
ECC_BLOCKS:
UMC:
CORRECTABLE_COUNT: 0
UNCORRECTABLE_COUNT: 0
DEFERRED_COUNT: 0
SDMA:
CORRECTABLE_COUNT: 0
UNCORRECTABLE_COUNT: 0
DEFERRED_COUNT: 0
GFX:
CORRECTABLE_COUNT: 0
UNCORRECTABLE_COUNT: 0
DEFERRED_COUNT: 0
MMHUB:
CORRECTABLE_COUNT: 0
UNCORRECTABLE_COUNT: 0
DEFERRED_COUNT: 0
PCIE_BIF:
CORRECTABLE_COUNT: 0
UNCORRECTABLE_COUNT: 0
DEFERRED_COUNT: 0
HDP:
CORRECTABLE_COUNT: 0
UNCORRECTABLE_COUNT: 0
DEFERRED_COUNT: 0
XGMI_WAFL:
CORRECTABLE_COUNT: 0
UNCORRECTABLE_COUNT: 0
DEFERRED_COUNT: 0
```
- **Removed `amdsmi_get_gpu_process_info` from python library**
`amdsmi_get_gpu_process_info` was removed from the C library in an earlier build, but the API was still present in the Python interface.
- **Updated metrics --clocks**
- **Updated metrics --clocks**
Output for `amd-smi metric --clock` is updated to reflect each engine and includes bug fixes for the clock lock status and deep sleep status.
``` shell
+3 -2
Dosyayı Görüntüle
@@ -1692,14 +1692,15 @@ class AMDSMICommands():
if "ecc_blocks" in current_platform_args:
if args.ecc_blocks:
ecc_dict = {}
uncountable_blocks = ["ATHUB", "DF", "SMN", "SEM", "FUSE"]
sysfs_blocks = ["UMC", "SDMA", "GFX", "MMHUB", "PCIE_BIF", "HDP", "XGMI_WAFL"]
try:
ras_states = amdsmi_interface.amdsmi_get_gpu_ras_block_features_enabled(args.gpu)
for state in ras_states:
# Only add enabled blocks that are also in sysfs
if state['status'] == amdsmi_interface.AmdSmiRasErrState.ENABLED.name:
gpu_block = amdsmi_interface.AmdSmiGpuBlock[state['block']]
# if the blocks are uncountable do not add them at all.
if gpu_block.name not in uncountable_blocks:
if gpu_block.name in sysfs_blocks:
try:
ecc_count = amdsmi_interface.amdsmi_get_gpu_ecc_count(args.gpu, gpu_block)
ecc_dict[state['block']] = {'correctable_count' : ecc_count['correctable_count'],
+9 -4
Dosyayı Görüntüle
@@ -964,10 +964,10 @@ typedef enum {
*/
typedef enum {
AMDSMI_GPU_BLOCK_INVALID = 0x0000000000000000, //!< Used to indicate an
//!< invalid block
//!< invalid block
AMDSMI_GPU_BLOCK_FIRST = 0x0000000000000001,
AMDSMI_GPU_BLOCK_UMC = AMDSMI_GPU_BLOCK_FIRST, //!< UMC block
AMDSMI_GPU_BLOCK_UMC = AMDSMI_GPU_BLOCK_FIRST, //!< UMC block
AMDSMI_GPU_BLOCK_SDMA = 0x0000000000000002, //!< SDMA block
AMDSMI_GPU_BLOCK_GFX = 0x0000000000000004, //!< GFX block
AMDSMI_GPU_BLOCK_MMHUB = 0x0000000000000008, //!< MMHUB block
@@ -981,9 +981,14 @@ typedef enum {
AMDSMI_GPU_BLOCK_MP0 = 0x0000000000000800, //!< MP0 block
AMDSMI_GPU_BLOCK_MP1 = 0x0000000000001000, //!< MP1 block
AMDSMI_GPU_BLOCK_FUSE = 0x0000000000002000, //!< Fuse block
AMDSMI_GPU_BLOCK_MCA = 0x0000000000004000, //!< MCA block
AMDSMI_GPU_BLOCK_VCN = 0x0000000000008000, //!< VCN block
AMDSMI_GPU_BLOCK_JPEG = 0x0000000000010000, //!< JPEG block
AMDSMI_GPU_BLOCK_IH = 0x0000000000020000, //!< IH block
AMDSMI_GPU_BLOCK_MPIO = 0x0000000000040000, //!< MPIO block
AMDSMI_GPU_BLOCK_LAST = AMDSMI_GPU_BLOCK_FUSE, //!< The highest bit position
//!< for supported blocks
AMDSMI_GPU_BLOCK_LAST = AMDSMI_GPU_BLOCK_MPIO, //!< The highest bit position
//!< for supported blocks
AMDSMI_GPU_BLOCK_RESERVED = 0x8000000000000000
} amdsmi_gpu_block_t;
+7 -1
Dosyayı Görüntüle
@@ -300,6 +300,11 @@ class AmdSmiGpuBlock(IntEnum):
MP0 = amdsmi_wrapper.AMDSMI_GPU_BLOCK_MP0
MP1 = amdsmi_wrapper.AMDSMI_GPU_BLOCK_MP1
FUSE = amdsmi_wrapper.AMDSMI_GPU_BLOCK_FUSE
MCA = amdsmi_wrapper.AMDSMI_GPU_BLOCK_MCA
VCN = amdsmi_wrapper.AMDSMI_GPU_BLOCK_VCN
JPEG = amdsmi_wrapper.AMDSMI_GPU_BLOCK_JPEG
IH = amdsmi_wrapper.AMDSMI_GPU_BLOCK_IH
MPIO = amdsmi_wrapper.AMDSMI_GPU_BLOCK_MPIO
RESERVED = amdsmi_wrapper.AMDSMI_GPU_BLOCK_RESERVED
@@ -1906,7 +1911,7 @@ def amdsmi_get_gpu_ras_block_features_enabled(
if gpu_block.name == "RESERVED" or gpu_block.name == "INVALID":
continue
if gpu_block.name == "LAST":
gpu_block.name = "FUSE"
gpu_block.name = "MPIO"
_check_res(
amdsmi_wrapper.amdsmi_get_gpu_ras_block_features_enabled(
processor_handle,
@@ -1959,6 +1964,7 @@ def amdsmi_get_gpu_process_list(
"vram_mem": process_list[index].memory_usage.vram_mem,
},
})
print(result)
return result
+36 -24
Dosyayı Görüntüle
@@ -748,6 +748,19 @@ amdsmi_card_form_factor_t = ctypes.c_uint32 # enum
class struct_amdsmi_pcie_info_t(Structure):
pass
class struct_pcie_static_(Structure):
pass
struct_pcie_static_._pack_ = 1 # source:False
struct_pcie_static_._fields_ = [
('max_pcie_width', ctypes.c_uint16),
('PADDING_0', ctypes.c_ubyte * 2),
('max_pcie_speed', ctypes.c_uint32),
('pcie_interface_version', ctypes.c_uint32),
('slot_type', amdsmi_card_form_factor_t),
('reserved', ctypes.c_uint64 * 10),
]
class struct_pcie_metric_(Structure):
pass
@@ -766,19 +779,6 @@ struct_pcie_metric_._fields_ = [
('reserved', ctypes.c_uint64 * 13),
]
class struct_pcie_static_(Structure):
pass
struct_pcie_static_._pack_ = 1 # source:False
struct_pcie_static_._fields_ = [
('max_pcie_width', ctypes.c_uint16),
('PADDING_0', ctypes.c_ubyte * 2),
('max_pcie_speed', ctypes.c_uint32),
('pcie_interface_version', ctypes.c_uint32),
('slot_type', amdsmi_card_form_factor_t),
('reserved', ctypes.c_uint64 * 10),
]
struct_amdsmi_pcie_info_t._pack_ = 1 # source:False
struct_amdsmi_pcie_info_t._fields_ = [
('pcie_static', struct_pcie_static_),
@@ -1300,7 +1300,12 @@ amdsmi_gpu_block_t__enumvalues = {
2048: 'AMDSMI_GPU_BLOCK_MP0',
4096: 'AMDSMI_GPU_BLOCK_MP1',
8192: 'AMDSMI_GPU_BLOCK_FUSE',
8192: 'AMDSMI_GPU_BLOCK_LAST',
16384: 'AMDSMI_GPU_BLOCK_MCA',
32768: 'AMDSMI_GPU_BLOCK_VCN',
65536: 'AMDSMI_GPU_BLOCK_JPEG',
131072: 'AMDSMI_GPU_BLOCK_IH',
262144: 'AMDSMI_GPU_BLOCK_MPIO',
262144: 'AMDSMI_GPU_BLOCK_LAST',
9223372036854775808: 'AMDSMI_GPU_BLOCK_RESERVED',
}
AMDSMI_GPU_BLOCK_INVALID = 0
@@ -1319,7 +1324,12 @@ AMDSMI_GPU_BLOCK_SEM = 1024
AMDSMI_GPU_BLOCK_MP0 = 2048
AMDSMI_GPU_BLOCK_MP1 = 4096
AMDSMI_GPU_BLOCK_FUSE = 8192
AMDSMI_GPU_BLOCK_LAST = 8192
AMDSMI_GPU_BLOCK_MCA = 16384
AMDSMI_GPU_BLOCK_VCN = 32768
AMDSMI_GPU_BLOCK_JPEG = 65536
AMDSMI_GPU_BLOCK_IH = 131072
AMDSMI_GPU_BLOCK_MPIO = 262144
AMDSMI_GPU_BLOCK_LAST = 262144
AMDSMI_GPU_BLOCK_RESERVED = 9223372036854775808
amdsmi_gpu_block_t = ctypes.c_uint64 # enum
@@ -2380,17 +2390,19 @@ __all__ = \
'AMDSMI_GPU_BLOCK_ATHUB', 'AMDSMI_GPU_BLOCK_DF',
'AMDSMI_GPU_BLOCK_FIRST', 'AMDSMI_GPU_BLOCK_FUSE',
'AMDSMI_GPU_BLOCK_GFX', 'AMDSMI_GPU_BLOCK_HDP',
'AMDSMI_GPU_BLOCK_INVALID', 'AMDSMI_GPU_BLOCK_LAST',
'AMDSMI_GPU_BLOCK_MMHUB', 'AMDSMI_GPU_BLOCK_MP0',
'AMDSMI_GPU_BLOCK_MP1', 'AMDSMI_GPU_BLOCK_PCIE_BIF',
'AMDSMI_GPU_BLOCK_IH', 'AMDSMI_GPU_BLOCK_INVALID',
'AMDSMI_GPU_BLOCK_JPEG', 'AMDSMI_GPU_BLOCK_LAST',
'AMDSMI_GPU_BLOCK_MCA', 'AMDSMI_GPU_BLOCK_MMHUB',
'AMDSMI_GPU_BLOCK_MP0', 'AMDSMI_GPU_BLOCK_MP1',
'AMDSMI_GPU_BLOCK_MPIO', 'AMDSMI_GPU_BLOCK_PCIE_BIF',
'AMDSMI_GPU_BLOCK_RESERVED', 'AMDSMI_GPU_BLOCK_SDMA',
'AMDSMI_GPU_BLOCK_SEM', 'AMDSMI_GPU_BLOCK_SMN',
'AMDSMI_GPU_BLOCK_UMC', 'AMDSMI_GPU_BLOCK_XGMI_WAFL',
'AMDSMI_HSMP_TIMEOUT', 'AMDSMI_INIT_ALL_PROCESSORS',
'AMDSMI_INIT_AMD_APUS', 'AMDSMI_INIT_AMD_CPUS',
'AMDSMI_INIT_AMD_GPUS', 'AMDSMI_INIT_NON_AMD_CPUS',
'AMDSMI_INIT_NON_AMD_GPUS', 'AMDSMI_INVALID_POWER',
'AMDSMI_IOLINK_TYPE_NUMIOLINKTYPES',
'AMDSMI_GPU_BLOCK_UMC', 'AMDSMI_GPU_BLOCK_VCN',
'AMDSMI_GPU_BLOCK_XGMI_WAFL', 'AMDSMI_HSMP_TIMEOUT',
'AMDSMI_INIT_ALL_PROCESSORS', 'AMDSMI_INIT_AMD_APUS',
'AMDSMI_INIT_AMD_CPUS', 'AMDSMI_INIT_AMD_GPUS',
'AMDSMI_INIT_NON_AMD_CPUS', 'AMDSMI_INIT_NON_AMD_GPUS',
'AMDSMI_INVALID_POWER', 'AMDSMI_IOLINK_TYPE_NUMIOLINKTYPES',
'AMDSMI_IOLINK_TYPE_PCIEXPRESS', 'AMDSMI_IOLINK_TYPE_SIZE',
'AMDSMI_IOLINK_TYPE_UNDEFINED', 'AMDSMI_IOLINK_TYPE_XGMI',
'AMDSMI_LINK_TYPE_NOT_APPLICABLE', 'AMDSMI_LINK_TYPE_PCIE',
+6 -1
Dosyayı Görüntüle
@@ -608,8 +608,13 @@ typedef enum {
RSMI_GPU_BLOCK_MP0 = 0x0000000000000800, //!< MP0 block
RSMI_GPU_BLOCK_MP1 = 0x0000000000001000, //!< MP1 block
RSMI_GPU_BLOCK_FUSE = 0x0000000000002000, //!< Fuse block
RSMI_GPU_BLOCK_MCA = 0x0000000000004000, //!< MCA block
RSMI_GPU_BLOCK_VCN = 0x0000000000008000, //!< VCN block
RSMI_GPU_BLOCK_JPEG = 0x0000000000010000, //!< JPEG block
RSMI_GPU_BLOCK_IH = 0x0000000000020000, //!< IH block
RSMI_GPU_BLOCK_MPIO = 0x0000000000040000, //!< MPIO block
RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_FUSE, //!< The highest bit position
RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_MPIO, //!< The highest bit position
//!< for supported blocks
RSMI_GPU_BLOCK_RESERVED = 0x8000000000000000
} rsmi_gpu_block_t;
+26 -15
Dosyayı Görüntüle
@@ -331,7 +331,13 @@ class rsmi_gpu_block_t(c_int):
RSMI_GPU_BLOCK_MP0 = 0x0000000000000800
RSMI_GPU_BLOCK_MP1 = 0x0000000000001000
RSMI_GPU_BLOCK_FUSE = 0x0000000000002000
RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_FUSE
RSMI_GPU_BLOCK_MCA = 0x0000000000004000
RSMI_GPU_BLOCK_VCN = 0x0000000000008000
RSMI_GPU_BLOCK_JPEG = 0x0000000000010000
RSMI_GPU_BLOCK_IH = 0x0000000000020000
RSMI_GPU_BLOCK_MPIO = 0x0000000000040000
RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_MPIO
RSMI_GPU_BLOCK_RESERVED = 0x8000000000000000
@@ -340,20 +346,25 @@ rsmi_gpu_block = rsmi_gpu_block_t
# The following dictionary correlates with rsmi_gpu_block_t enum
rsmi_gpu_block_d = {
'UMC' : 0x0000000000000001,
'SDMA' : 0x0000000000000002,
'GFX' : 0x0000000000000004,
'MMHUB': 0x0000000000000008,
'ATHUB': 0x0000000000000010,
'PCIE_BIF': 0x0000000000000020,
'HDP': 0x0000000000000040,
'XGMI_WAFL': 0x0000000000000080,
'DF': 0x0000000000000100,
'SMN': 0x0000000000000200,
'SEM': 0x0000000000000400,
'MP0': 0x0000000000000800,
'MP1': 0x0000000000001000,
'FUSE': 0x0000000000002000
'UMC' : 0x0000000000000001,
'SDMA' : 0x0000000000000002,
'GFX' : 0x0000000000000004,
'MMHUB' : 0x0000000000000008,
'ATHUB' : 0x0000000000000010,
'PCIE_BIF' : 0x0000000000000020,
'HDP' : 0x0000000000000040,
'XGMI_WAFL' : 0x0000000000000080,
'DF' : 0x0000000000000100,
'SMN' : 0x0000000000000200,
'SEM' : 0x0000000000000400,
'MP0' : 0x0000000000000800,
'MP1' : 0x0000000000001000,
'FUSE' : 0x0000000000002000,
'MCA' : 0x0000000000004000,
'VCN' : 0x0000000000008000,
'JPEG' : 0x0000000000010000,
'IH' : 0x0000000000020000,
'MPIO' : 0x0000000000040000,
}
+6 -1
Dosyayı Görüntüle
@@ -91,8 +91,13 @@ static const std::map<amdsmi_gpu_block_t, const char *> kBlockNameMap = {
{AMDSMI_GPU_BLOCK_MP0, "MP0"},
{AMDSMI_GPU_BLOCK_MP1, "MP1"},
{AMDSMI_GPU_BLOCK_FUSE, "FUSE"},
{AMDSMI_GPU_BLOCK_MCA, "MCA"},
{AMDSMI_GPU_BLOCK_VCN, "VCN"},
{AMDSMI_GPU_BLOCK_JPEG, "JPEG"},
{AMDSMI_GPU_BLOCK_IH, "IH"},
{AMDSMI_GPU_BLOCK_MPIO, "MPIO"},
};
static_assert(AMDSMI_GPU_BLOCK_LAST == AMDSMI_GPU_BLOCK_FUSE,
static_assert(AMDSMI_GPU_BLOCK_LAST == AMDSMI_GPU_BLOCK_MPIO,
"kBlockNameMap needs to be updated");
static const char * kRasErrStateStrings[] = {