# Copyright (C) Advanced Micro Devices. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in # the Software without restriction, including without limitation the rights to # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of # the Software, and to permit persons to whom the Software is furnished to do so, # subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. import ctypes import math import os import re import sys from collections.abc import Iterable from ctypes import POINTER, c_void_p from enum import IntEnum, Enum from pathlib import Path from time import asctime, localtime, time from typing import Any, Dict, List, Tuple, Union from . import amdsmi_wrapper from .amdsmi_exception import * ### Non Library Specific Constants ### class MaxUIntegerTypes(IntEnum): UINT8_T = 0xFF UINT16_T = 0xFFFF UINT32_T = 0xFFFFFFFF UINT64_T = 0xFFFFFFFFFFFFFFFF NO_OF_32BITS = (sys.getsizeof(ctypes.c_uint32) * 8) NO_OF_64BITS = (sys.getsizeof(ctypes.c_uint64) * 8) KILO = math.pow(10, 3) socket_handle_t = c_void_p processor_handle_t = c_void_p ############################### MAX_NUM_PROCESSES = 1024 # gpu metrics macros defined in amdsmi.h AMDSMI_NUM_HBM_INSTANCES = 4 AMDSMI_MAX_NUM_VCN = 4 AMDSMI_MAX_NUM_CLKS = 4 AMDSMI_MAX_NUM_XGMI_LINKS = 8 AMDSMI_MAX_NUM_GFX_CLKS = 8 AMDSMI_MAX_AID = 4 AMDSMI_MAX_ENGINES = 8 AMDSMI_MAX_NUM_JPEG = 32 AMDSMI_MAX_NUM_XCC = 8 AMDSMI_MAX_NUM_XCP = 8 # max num afids per cper record MAX_NUMBER_OF_AFIDS_PER_RECORD = 12 # Max number of DPM policies AMDSMI_MAX_NUM_PM_POLICIES = 32 # Max supported frequencies AMDSMI_MAX_NUM_FREQUENCIES = 33 # Max Fan speed AMDSMI_MAX_FAN_SPEED = 255 # Max Votlage Curve Points AMDSMI_NUM_VOLTAGE_CURVE_POINTS = 3 # Max size definitions AMDSMI_MAX_MM_IP_COUNT = 8 AMDSMI_MAX_STRING_LENGTH = 256 AMDSMI_MAX_DEVICES = 32 AMDSMI_MAX_CONTAINER_TYPE = 2 AMDSMI_MAX_CACHE_TYPES = 10 AMDSMI_MAX_NUM_XGMI_PHYSICAL_LINK = 64 AMDSMI_GPU_UUID_SIZE = 38 _AMDSMI_STRING_LENGTH = 80 class AmdSmiStatus(IntEnum): SUCCESS = amdsmi_wrapper.AMDSMI_STATUS_SUCCESS INVAL = amdsmi_wrapper.AMDSMI_STATUS_INVAL NOT_SUPPORTED = amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED NOT_YET_IMPLEMENTED = amdsmi_wrapper.AMDSMI_STATUS_NOT_YET_IMPLEMENTED FAIL_LOAD_MODULE = amdsmi_wrapper.AMDSMI_STATUS_FAIL_LOAD_MODULE FAIL_LOAD_SYMBOL = amdsmi_wrapper.AMDSMI_STATUS_FAIL_LOAD_SYMBOL DRM_ERROR = amdsmi_wrapper.AMDSMI_STATUS_DRM_ERROR API_FAILED = amdsmi_wrapper.AMDSMI_STATUS_API_FAILED TIMEOUT = amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT RETRY = amdsmi_wrapper.AMDSMI_STATUS_RETRY NO_PERM = amdsmi_wrapper.AMDSMI_STATUS_NO_PERM INTERRUPT = amdsmi_wrapper.AMDSMI_STATUS_INTERRUPT IO = amdsmi_wrapper.AMDSMI_STATUS_IO ADDRESS_FAULT = amdsmi_wrapper.AMDSMI_STATUS_ADDRESS_FAULT FILE_ERROR = amdsmi_wrapper.AMDSMI_STATUS_FILE_ERROR OUT_OF_RESOURCES = amdsmi_wrapper.AMDSMI_STATUS_OUT_OF_RESOURCES INTERNAL_EXCEPTION = amdsmi_wrapper.AMDSMI_STATUS_INTERNAL_EXCEPTION INPUT_OUT_OF_BOUNDS = amdsmi_wrapper.AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS INIT_ERROR = amdsmi_wrapper.AMDSMI_STATUS_INIT_ERROR REFCOUNT_OVERFLOW = amdsmi_wrapper.AMDSMI_STATUS_REFCOUNT_OVERFLOW DIRECTORY_NOT_FOUND = amdsmi_wrapper.AMDSMI_STATUS_DIRECTORY_NOT_FOUND BUSY = amdsmi_wrapper.AMDSMI_STATUS_BUSY NOT_FOUND = amdsmi_wrapper.AMDSMI_STATUS_NOT_FOUND NOT_INIT = amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT NO_SLOT = amdsmi_wrapper.AMDSMI_STATUS_NO_SLOT DRIVER_NOT_LOADED = amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED MORE_DATA = amdsmi_wrapper.AMDSMI_STATUS_MORE_DATA NO_DATA = amdsmi_wrapper.AMDSMI_STATUS_NO_DATA INSUFFICIENT_SIZE = amdsmi_wrapper.AMDSMI_STATUS_INSUFFICIENT_SIZE UNEXPECTED_SIZE = amdsmi_wrapper.AMDSMI_STATUS_UNEXPECTED_SIZE UNEXPECTED_DATA = amdsmi_wrapper.AMDSMI_STATUS_UNEXPECTED_DATA NON_AMD_CPU = amdsmi_wrapper.AMDSMI_STATUS_NON_AMD_CPU NO_ENERGY_DRV = amdsmi_wrapper.AMDSMI_STATUS_NO_ENERGY_DRV NO_MSR_DRV = amdsmi_wrapper.AMDSMI_STATUS_NO_MSR_DRV NO_HSMP_DRV = amdsmi_wrapper.AMDSMI_STATUS_NO_HSMP_DRV NO_HSMP_SUP = amdsmi_wrapper.AMDSMI_STATUS_NO_HSMP_SUP NO_HSMP_MSG_SUP = amdsmi_wrapper.AMDSMI_STATUS_NO_HSMP_MSG_SUP HSMP_TIMEOUT = amdsmi_wrapper.AMDSMI_STATUS_HSMP_TIMEOUT NO_DRV = amdsmi_wrapper.AMDSMI_STATUS_NO_DRV FILE_NOT_FOUND = amdsmi_wrapper.AMDSMI_STATUS_FILE_NOT_FOUND ARG_PTR_NULL = amdsmi_wrapper.AMDSMI_STATUS_ARG_PTR_NULL AMDGPU_RESTART_ERR = amdsmi_wrapper.AMDSMI_STATUS_AMDGPU_RESTART_ERR SETTING_UNAVAILABLE = amdsmi_wrapper.AMDSMI_STATUS_SETTING_UNAVAILABLE CORRUPTED_EEPROM = amdsmi_wrapper.AMDSMI_STATUS_CORRUPTED_EEPROM MAP_ERROR = amdsmi_wrapper.AMDSMI_STATUS_MAP_ERROR UNKNOWN_ERROR = amdsmi_wrapper.AMDSMI_STATUS_UNKNOWN_ERROR class AmdSmiInitFlags(IntEnum): INIT_ALL_PROCESSORS = amdsmi_wrapper.AMDSMI_INIT_ALL_PROCESSORS INIT_AMD_CPUS = amdsmi_wrapper.AMDSMI_INIT_AMD_CPUS INIT_AMD_GPUS = amdsmi_wrapper.AMDSMI_INIT_AMD_GPUS INIT_AMD_APUS = amdsmi_wrapper.AMDSMI_INIT_AMD_APUS INIT_NON_AMD_CPUS = amdsmi_wrapper.AMDSMI_INIT_NON_AMD_CPUS INIT_NON_AMD_GPUS = amdsmi_wrapper.AMDSMI_INIT_NON_AMD_GPUS class AmdSmiContainerTypes(IntEnum): LXC = amdsmi_wrapper.AMDSMI_CONTAINER_LXC DOCKER = amdsmi_wrapper.AMDSMI_CONTAINER_DOCKER class AmdSmiDeviceType(IntEnum): UNKNOWN_DEVICE = amdsmi_wrapper.AMDSMI_PROCESSOR_TYPE_UNKNOWN AMD_GPU_DEVICE = amdsmi_wrapper.AMDSMI_PROCESSOR_TYPE_AMD_GPU AMD_CPU_DEVICE = amdsmi_wrapper.AMDSMI_PROCESSOR_TYPE_AMD_CPU NON_AMD_GPU_DEVICE = amdsmi_wrapper.AMDSMI_PROCESSOR_TYPE_NON_AMD_GPU NON_AMD_CPU_DEVICE = amdsmi_wrapper.AMDSMI_PROCESSOR_TYPE_NON_AMD_CPU class AmdSmiMmIp(IntEnum): UVD = amdsmi_wrapper.AMDSMI_MM_UVD VCE = amdsmi_wrapper.AMDSMI_MM_VCE VCN = amdsmi_wrapper.AMDSMI_MM_VCN class AmdSmiFwBlock(IntEnum): AMDSMI_FW_ID_SMU = amdsmi_wrapper.AMDSMI_FW_ID_SMU AMDSMI_FW_ID_CP_CE = amdsmi_wrapper.AMDSMI_FW_ID_CP_CE AMDSMI_FW_ID_CP_PFP = amdsmi_wrapper.AMDSMI_FW_ID_CP_PFP AMDSMI_FW_ID_CP_ME = amdsmi_wrapper.AMDSMI_FW_ID_CP_ME AMDSMI_FW_ID_CP_MEC_JT1 = amdsmi_wrapper.AMDSMI_FW_ID_CP_MEC_JT1 AMDSMI_FW_ID_CP_MEC_JT2 = amdsmi_wrapper.AMDSMI_FW_ID_CP_MEC_JT2 AMDSMI_FW_ID_CP_MEC1 = amdsmi_wrapper.AMDSMI_FW_ID_CP_MEC1 AMDSMI_FW_ID_CP_MEC2 = amdsmi_wrapper.AMDSMI_FW_ID_CP_MEC2 AMDSMI_FW_ID_RLC = amdsmi_wrapper.AMDSMI_FW_ID_RLC AMDSMI_FW_ID_SDMA0 = amdsmi_wrapper.AMDSMI_FW_ID_SDMA0 AMDSMI_FW_ID_SDMA1 = amdsmi_wrapper.AMDSMI_FW_ID_SDMA1 AMDSMI_FW_ID_SDMA2 = amdsmi_wrapper.AMDSMI_FW_ID_SDMA2 AMDSMI_FW_ID_SDMA3 = amdsmi_wrapper.AMDSMI_FW_ID_SDMA3 AMDSMI_FW_ID_SDMA4 = amdsmi_wrapper.AMDSMI_FW_ID_SDMA4 AMDSMI_FW_ID_SDMA5 = amdsmi_wrapper.AMDSMI_FW_ID_SDMA5 AMDSMI_FW_ID_SDMA6 = amdsmi_wrapper.AMDSMI_FW_ID_SDMA6 AMDSMI_FW_ID_SDMA7 = amdsmi_wrapper.AMDSMI_FW_ID_SDMA7 AMDSMI_FW_ID_VCN = amdsmi_wrapper.AMDSMI_FW_ID_VCN AMDSMI_FW_ID_UVD = amdsmi_wrapper.AMDSMI_FW_ID_UVD AMDSMI_FW_ID_VCE = amdsmi_wrapper.AMDSMI_FW_ID_VCE AMDSMI_FW_ID_ISP = amdsmi_wrapper.AMDSMI_FW_ID_ISP AMDSMI_FW_ID_DMCU_ERAM = amdsmi_wrapper.AMDSMI_FW_ID_DMCU_ERAM AMDSMI_FW_ID_DMCU_ISR = amdsmi_wrapper.AMDSMI_FW_ID_DMCU_ISR AMDSMI_FW_ID_RLC_RESTORE_LIST_GPM_MEM = amdsmi_wrapper.AMDSMI_FW_ID_RLC_RESTORE_LIST_GPM_MEM AMDSMI_FW_ID_RLC_RESTORE_LIST_SRM_MEM = amdsmi_wrapper.AMDSMI_FW_ID_RLC_RESTORE_LIST_SRM_MEM AMDSMI_FW_ID_RLC_RESTORE_LIST_CNTL = amdsmi_wrapper.AMDSMI_FW_ID_RLC_RESTORE_LIST_CNTL AMDSMI_FW_ID_RLC_V = amdsmi_wrapper.AMDSMI_FW_ID_RLC_V AMDSMI_FW_ID_MMSCH = amdsmi_wrapper.AMDSMI_FW_ID_MMSCH AMDSMI_FW_ID_PSP_SYSDRV = amdsmi_wrapper.AMDSMI_FW_ID_PSP_SYSDRV AMDSMI_FW_ID_PSP_SOSDRV = amdsmi_wrapper.AMDSMI_FW_ID_PSP_SOSDRV AMDSMI_FW_ID_PSP_TOC = amdsmi_wrapper.AMDSMI_FW_ID_PSP_TOC AMDSMI_FW_ID_PSP_KEYDB = amdsmi_wrapper.AMDSMI_FW_ID_PSP_KEYDB AMDSMI_FW_ID_DFC = amdsmi_wrapper.AMDSMI_FW_ID_DFC AMDSMI_FW_ID_PSP_SPL = amdsmi_wrapper.AMDSMI_FW_ID_PSP_SPL AMDSMI_FW_ID_DRV_CAP = amdsmi_wrapper.AMDSMI_FW_ID_DRV_CAP AMDSMI_FW_ID_MC = amdsmi_wrapper.AMDSMI_FW_ID_MC AMDSMI_FW_ID_PSP_BL = amdsmi_wrapper.AMDSMI_FW_ID_PSP_BL AMDSMI_FW_ID_CP_PM4 = amdsmi_wrapper.AMDSMI_FW_ID_CP_PM4 AMDSMI_FW_ID_RLC_P = amdsmi_wrapper.AMDSMI_FW_ID_RLC_P AMDSMI_FW_ID_SEC_POLICY_STAGE2 = amdsmi_wrapper.AMDSMI_FW_ID_SEC_POLICY_STAGE2 AMDSMI_FW_ID_REG_ACCESS_WHITELIST = amdsmi_wrapper.AMDSMI_FW_ID_REG_ACCESS_WHITELIST AMDSMI_FW_ID_IMU_DRAM = amdsmi_wrapper.AMDSMI_FW_ID_IMU_DRAM AMDSMI_FW_ID_IMU_IRAM = amdsmi_wrapper.AMDSMI_FW_ID_IMU_IRAM AMDSMI_FW_ID_SDMA_TH0 = amdsmi_wrapper.AMDSMI_FW_ID_SDMA_TH0 AMDSMI_FW_ID_SDMA_TH1 = amdsmi_wrapper.AMDSMI_FW_ID_SDMA_TH1 AMDSMI_FW_ID_CP_MES = amdsmi_wrapper.AMDSMI_FW_ID_CP_MES AMDSMI_FW_ID_MES_STACK = amdsmi_wrapper.AMDSMI_FW_ID_MES_STACK AMDSMI_FW_ID_MES_THREAD1 = amdsmi_wrapper.AMDSMI_FW_ID_MES_THREAD1 AMDSMI_FW_ID_MES_THREAD1_STACK = amdsmi_wrapper.AMDSMI_FW_ID_MES_THREAD1_STACK AMDSMI_FW_ID_RLX6 = amdsmi_wrapper.AMDSMI_FW_ID_RLX6 AMDSMI_FW_ID_RLX6_DRAM_BOOT = amdsmi_wrapper.AMDSMI_FW_ID_RLX6_DRAM_BOOT AMDSMI_FW_ID_RS64_ME = amdsmi_wrapper.AMDSMI_FW_ID_RS64_ME AMDSMI_FW_ID_RS64_ME_P0_DATA = amdsmi_wrapper.AMDSMI_FW_ID_RS64_ME_P0_DATA AMDSMI_FW_ID_RS64_ME_P1_DATA = amdsmi_wrapper.AMDSMI_FW_ID_RS64_ME_P1_DATA AMDSMI_FW_ID_RS64_PFP = amdsmi_wrapper.AMDSMI_FW_ID_RS64_PFP AMDSMI_FW_ID_RS64_PFP_P0_DATA = amdsmi_wrapper.AMDSMI_FW_ID_RS64_PFP_P0_DATA AMDSMI_FW_ID_RS64_PFP_P1_DATA = amdsmi_wrapper.AMDSMI_FW_ID_RS64_PFP_P1_DATA AMDSMI_FW_ID_RS64_MEC = amdsmi_wrapper.AMDSMI_FW_ID_RS64_MEC AMDSMI_FW_ID_RS64_MEC_P0_DATA = amdsmi_wrapper.AMDSMI_FW_ID_RS64_MEC_P0_DATA AMDSMI_FW_ID_RS64_MEC_P1_DATA = amdsmi_wrapper.AMDSMI_FW_ID_RS64_MEC_P1_DATA AMDSMI_FW_ID_RS64_MEC_P2_DATA = amdsmi_wrapper.AMDSMI_FW_ID_RS64_MEC_P2_DATA AMDSMI_FW_ID_RS64_MEC_P3_DATA = amdsmi_wrapper.AMDSMI_FW_ID_RS64_MEC_P3_DATA AMDSMI_FW_ID_PPTABLE = amdsmi_wrapper.AMDSMI_FW_ID_PPTABLE AMDSMI_FW_ID_PSP_SOC = amdsmi_wrapper.AMDSMI_FW_ID_PSP_SOC AMDSMI_FW_ID_PSP_DBG = amdsmi_wrapper.AMDSMI_FW_ID_PSP_DBG AMDSMI_FW_ID_PSP_INTF = amdsmi_wrapper.AMDSMI_FW_ID_PSP_INTF AMDSMI_FW_ID_RLX6_CORE1 = amdsmi_wrapper.AMDSMI_FW_ID_RLX6_CORE1 AMDSMI_FW_ID_RLX6_DRAM_BOOT_CORE1 = amdsmi_wrapper.AMDSMI_FW_ID_RLX6_DRAM_BOOT_CORE1 AMDSMI_FW_ID_RLCV_LX7 = amdsmi_wrapper.AMDSMI_FW_ID_RLCV_LX7 AMDSMI_FW_ID_RLC_SAVE_RESTORE_LIST = amdsmi_wrapper.AMDSMI_FW_ID_RLC_SAVE_RESTORE_LIST AMDSMI_FW_ID_ASD = amdsmi_wrapper.AMDSMI_FW_ID_ASD AMDSMI_FW_ID_TA_RAS = amdsmi_wrapper.AMDSMI_FW_ID_TA_RAS AMDSMI_FW_ID_TA_XGMI = amdsmi_wrapper.AMDSMI_FW_ID_TA_XGMI AMDSMI_FW_ID_RLC_SRLG = amdsmi_wrapper.AMDSMI_FW_ID_RLC_SRLG AMDSMI_FW_ID_RLC_SRLS = amdsmi_wrapper.AMDSMI_FW_ID_RLC_SRLS AMDSMI_FW_ID_PM = amdsmi_wrapper.AMDSMI_FW_ID_PM AMDSMI_FW_ID_DMCU = amdsmi_wrapper.AMDSMI_FW_ID_DMCU AMDSMI_FW_ID_PLDM_BUNDLE = amdsmi_wrapper.AMDSMI_FW_ID_PLDM_BUNDLE class AmdSmiClkType(IntEnum): SYS = amdsmi_wrapper.AMDSMI_CLK_TYPE_SYS GFX = amdsmi_wrapper.AMDSMI_CLK_TYPE_GFX DF = amdsmi_wrapper.AMDSMI_CLK_TYPE_DF DCEF = amdsmi_wrapper.AMDSMI_CLK_TYPE_DCEF SOC = amdsmi_wrapper.AMDSMI_CLK_TYPE_SOC MEM = amdsmi_wrapper.AMDSMI_CLK_TYPE_MEM PCIE = amdsmi_wrapper.AMDSMI_CLK_TYPE_PCIE VCLK0 = amdsmi_wrapper.AMDSMI_CLK_TYPE_VCLK0 VCLK1 = amdsmi_wrapper.AMDSMI_CLK_TYPE_VCLK1 DCLK0 = amdsmi_wrapper.AMDSMI_CLK_TYPE_DCLK0 DCLK1 = amdsmi_wrapper.AMDSMI_CLK_TYPE_DCLK1 class AmdSmiClkLimitType(IntEnum): MIN = amdsmi_wrapper.CLK_LIMIT_MIN MAX = amdsmi_wrapper.CLK_LIMIT_MAX class AmdSmiTemperatureType(IntEnum): EDGE = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_EDGE HOTSPOT = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_HOTSPOT JUNCTION = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_JUNCTION VRAM = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_VRAM HBM_0 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_HBM_0 HBM_1 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_HBM_1 HBM_2 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_HBM_2 HBM_3 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_HBM_3 PLX = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_PLX # GPU Board Node temperature GPUBOARD_NODE_RETIMER_X = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_RETIMER_X # Retimer X temperature GPUBOARD_NODE_OAM_X_IBC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC # OAM X IBC temperature GPUBOARD_NODE_OAM_X_IBC_2 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC_2 # OAM X IBC 2 temperature GPUBOARD_NODE_OAM_X_VDD18_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_VDD18_VR # OAM X VDD 1.8V voltage regulator temperature GPUBOARD_NODE_OAM_X_04_HBM_B_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_B_VR # OAM X 0.4V HBM B voltage regulator temperature GPUBOARD_NODE_OAM_X_04_HBM_D_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR # OAM X 0.4V HBM D voltage regulator temperature GPUBOARD_NODE_LAST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_LAST # GPU Board VR (Voltage Regulator) temperature GPUBOARD_VDDCR_VDD0 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD0 # VDDCR VDD0 voltage regulator temperature GPUBOARD_VDDCR_VDD1 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD1 # VDDCR VDD1 voltage regulator temperature GPUBOARD_VDDCR_VDD2 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD2 # VDDCR VDD2 voltage regulator temperature GPUBOARD_VDDCR_VDD3 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD3 # VDDCR VDD3 voltage regulator temperature GPUBOARD_VDDCR_SOC_A = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_A # VDDCR SOC A voltage regulator temperature GPUBOARD_VDDCR_SOC_C = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_C # VDDCR SOC C voltage regulator temperature GPUBOARD_VDDCR_SOCIO_A = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_A # VDDCR SOCIO A voltage regulator temperature GPUBOARD_VDDCR_SOCIO_C = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_C # VDDCR SOCIO C voltage regulator temperature GPUBOARD_VDD_085_HBM = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_085_HBM # VDD 0.85V HBM voltage regulator temperature GPUBOARD_VDDCR_11_HBM_B = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_B # VDDCR 1.1V HBM B voltage regulator temperature GPUBOARD_VDDCR_11_HBM_D = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_D # VDDCR 1.1V HBM D voltage regulator temperature GPUBOARD_VDD_USR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_USR # VDD USR voltage regulator temperature GPUBOARD_VDDIO_11_E32 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDIO_11_E32 # VDDIO 1.1V E32 voltage regulator temperature GPUBOARD_VR_LAST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_LAST # Baseboard System temperature BASEBOARD_UBB_FPGA = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA # UBB FPGA temperature BASEBOARD_UBB_FRONT = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FRONT # UBB front temperature BASEBOARD_UBB_BACK = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_BACK # UBB back temperature BASEBOARD_UBB_OAM7 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM7 # UBB OAM7 temperature BASEBOARD_UBB_IBC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_IBC # UBB IBC temperature BASEBOARD_UBB_UFPGA = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_UFPGA # UBB UFPGA temperature BASEBOARD_UBB_OAM1 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM1 # UBB OAM1 temperature BASEBOARD_OAM_0_1_HSC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_HSC # OAM 0-1 HSC temperature BASEBOARD_OAM_2_3_HSC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_2_3_HSC # OAM 2-3 HSC temperature BASEBOARD_OAM_4_5_HSC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_HSC # OAM 4-5 HSC temperature BASEBOARD_OAM_6_7_HSC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_6_7_HSC # OAM 6-7 HSC temperature BASEBOARD_UBB_FPGA_0V72_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_0V72_VR # UBB FPGA 0.72V voltage regulator temperature BASEBOARD_UBB_FPGA_3V3_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_3V3_VR # UBB FPGA 3.3V voltage regulator temperature BASEBOARD_RETIMER_0_1_2_3_1V2_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_2_3_1V2_VR # Retimer 0-1-2-3 1.2V voltage regulator temperature BASEBOARD_RETIMER_4_5_6_7_1V2_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_6_7_1V2_VR # Retimer 4-5-6-7 1.2V voltage regulator temperature BASEBOARD_RETIMER_0_1_0V9_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_0V9_VR # Retimer 0-1 0.9V voltage regulator temperature BASEBOARD_RETIMER_4_5_0V9_VR= amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_0V9_VR # Retimer 4-5 0.9V voltage regulator temperature BASEBOARD_RETIMER_2_3_0V9_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_2_3_0V9_VR # Retimer 2-3 0.9V voltage regulator temperature BASEBOARD_RETIMER_6_7_0V9_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_6_7_0V9_VR # Retimer 6-7 0.9V voltage regulator temperature BASEBOARD_OAM_0_1_2_3_3V3_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_2_3_3V3_VR # OAM 0-1-2-3 3.3V voltage regulator temperature BASEBOARD_OAM_4_5_6_7_3V3_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_6_7_3V3_VR # OAM 4-5-6-7 3.3V voltage regulator temperature BASEBOARD_IBC_HSC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC_HSC # IBC HSC temperature BASEBOARD_IBC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC # IBC temperature BASEBOARD_LAST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_LAST BASEBOARD__MAX = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE__MAX # Maximum per GPU temperature type class AmdSmiDevPerfLevel(IntEnum): AUTO = amdsmi_wrapper.AMDSMI_DEV_PERF_LEVEL_AUTO LOW = amdsmi_wrapper.AMDSMI_DEV_PERF_LEVEL_LOW HIGH = amdsmi_wrapper.AMDSMI_DEV_PERF_LEVEL_HIGH MANUAL = amdsmi_wrapper.AMDSMI_DEV_PERF_LEVEL_MANUAL STABLE_STD = amdsmi_wrapper.AMDSMI_DEV_PERF_LEVEL_STABLE_STD STABLE_PEAK = amdsmi_wrapper.AMDSMI_DEV_PERF_LEVEL_STABLE_PEAK STABLE_MIN_MCLK = amdsmi_wrapper.AMDSMI_DEV_PERF_LEVEL_STABLE_MIN_MCLK STABLE_MIN_SCLK = amdsmi_wrapper.AMDSMI_DEV_PERF_LEVEL_STABLE_MIN_SCLK DETERMINISM = amdsmi_wrapper.AMDSMI_DEV_PERF_LEVEL_DETERMINISM UNKNOWN = amdsmi_wrapper.AMDSMI_DEV_PERF_LEVEL_UNKNOWN class AmdSmiEventGroup(IntEnum): XGMI = amdsmi_wrapper.AMDSMI_EVNT_GRP_XGMI XGMI_DATA_OUT = amdsmi_wrapper.AMDSMI_EVNT_GRP_XGMI_DATA_OUT GRP_INVALID = amdsmi_wrapper.AMDSMI_EVNT_GRP_INVALID class AmdSmiEventType(IntEnum): XGMI_0_NOP_TX = amdsmi_wrapper.AMDSMI_EVNT_XGMI_0_NOP_TX XGMI_0_REQUEST_TX = amdsmi_wrapper.AMDSMI_EVNT_XGMI_0_REQUEST_TX XGMI_0_RESPONSE_TX = amdsmi_wrapper.AMDSMI_EVNT_XGMI_0_RESPONSE_TX XGMI_0_BEATS_TX = amdsmi_wrapper.AMDSMI_EVNT_XGMI_0_BEATS_TX XGMI_1_NOP_TX = amdsmi_wrapper.AMDSMI_EVNT_XGMI_1_NOP_TX XGMI_1_REQUEST_TX = amdsmi_wrapper.AMDSMI_EVNT_XGMI_1_REQUEST_TX XGMI_1_RESPONSE_TX = amdsmi_wrapper.AMDSMI_EVNT_XGMI_1_RESPONSE_TX XGMI_1_BEATS_TX = amdsmi_wrapper.AMDSMI_EVNT_XGMI_1_BEATS_TX XGMI_DATA_OUT_0 = amdsmi_wrapper.AMDSMI_EVNT_XGMI_DATA_OUT_0 XGMI_DATA_OUT_1 = amdsmi_wrapper.AMDSMI_EVNT_XGMI_DATA_OUT_1 XGMI_DATA_OUT_2 = amdsmi_wrapper.AMDSMI_EVNT_XGMI_DATA_OUT_2 XGMI_DATA_OUT_3 = amdsmi_wrapper.AMDSMI_EVNT_XGMI_DATA_OUT_3 XGMI_DATA_OUT_4 = amdsmi_wrapper.AMDSMI_EVNT_XGMI_DATA_OUT_4 XGMI_DATA_OUT_5 = amdsmi_wrapper.AMDSMI_EVNT_XGMI_DATA_OUT_5 class AmdSmiCounterCommand(IntEnum): CMD_START = amdsmi_wrapper.AMDSMI_CNTR_CMD_START CMD_STOP = amdsmi_wrapper.AMDSMI_CNTR_CMD_STOP class AmdSmiEvtNotificationType(IntEnum): NONE = amdsmi_wrapper.AMDSMI_EVT_NOTIF_NONE VMFAULT = amdsmi_wrapper.AMDSMI_EVT_NOTIF_VMFAULT THERMAL_THROTTLE = amdsmi_wrapper.AMDSMI_EVT_NOTIF_THERMAL_THROTTLE GPU_PRE_RESET = amdsmi_wrapper.AMDSMI_EVT_NOTIF_GPU_PRE_RESET GPU_POST_RESET = amdsmi_wrapper.AMDSMI_EVT_NOTIF_GPU_POST_RESET MIGRATE_START = amdsmi_wrapper.AMDSMI_EVT_NOTIF_MIGRATE_START MIGRATE_END = amdsmi_wrapper.AMDSMI_EVT_NOTIF_MIGRATE_END PAGE_FAULT_START = amdsmi_wrapper.AMDSMI_EVT_NOTIF_PAGE_FAULT_END PAGE_FAULT_END = amdsmi_wrapper.AMDSMI_EVT_NOTIF_PAGE_FAULT_END QUEUE_EVICTION = amdsmi_wrapper.AMDSMI_EVT_NOTIF_QUEUE_EVICTION QUEUE_RESTORE = amdsmi_wrapper.AMDSMI_EVT_NOTIF_QUEUE_RESTORE UNMAP_FROM_GPU = amdsmi_wrapper.AMDSMI_EVT_NOTIF_UNMAP_FROM_GPU PROCESS_START = amdsmi_wrapper.AMDSMI_EVT_NOTIF_PROCESS_START PROCESS_END = amdsmi_wrapper.AMDSMI_EVT_NOTIF_PROCESS_END class AmdSmiTemperatureMetric(IntEnum): CURRENT = amdsmi_wrapper.AMDSMI_TEMP_CURRENT MAX = amdsmi_wrapper.AMDSMI_TEMP_MAX MIN = amdsmi_wrapper.AMDSMI_TEMP_MIN MAX_HYST = amdsmi_wrapper.AMDSMI_TEMP_MAX_HYST MIN_HYST = amdsmi_wrapper.AMDSMI_TEMP_MIN_HYST CRITICAL = amdsmi_wrapper.AMDSMI_TEMP_CRITICAL CRITICAL_HYST = amdsmi_wrapper.AMDSMI_TEMP_CRITICAL_HYST EMERGENCY = amdsmi_wrapper.AMDSMI_TEMP_EMERGENCY EMERGENCY_HYST = amdsmi_wrapper.AMDSMI_TEMP_EMERGENCY_HYST CRIT_MIN = amdsmi_wrapper.AMDSMI_TEMP_CRIT_MIN CRIT_MIN_HYST = amdsmi_wrapper.AMDSMI_TEMP_CRIT_MIN_HYST OFFSET = amdsmi_wrapper.AMDSMI_TEMP_OFFSET LOWEST = amdsmi_wrapper.AMDSMI_TEMP_LOWEST HIGHEST = amdsmi_wrapper.AMDSMI_TEMP_HIGHEST class AmdSmiVoltageMetric(IntEnum): CURRENT = amdsmi_wrapper.AMDSMI_VOLT_CURRENT MAX = amdsmi_wrapper.AMDSMI_VOLT_MAX MIN_CRIT = amdsmi_wrapper.AMDSMI_VOLT_MIN_CRIT MIN = amdsmi_wrapper.AMDSMI_VOLT_MIN MAX_CRIT = amdsmi_wrapper.AMDSMI_VOLT_MAX_CRIT AVERAGE = amdsmi_wrapper.AMDSMI_VOLT_AVERAGE LOWEST = amdsmi_wrapper.AMDSMI_VOLT_LOWEST HIGHEST = amdsmi_wrapper.AMDSMI_VOLT_HIGHEST class AmdSmiVoltageType(IntEnum): VDDGFX = amdsmi_wrapper.AMDSMI_VOLT_TYPE_VDDGFX VDDBOARD = amdsmi_wrapper.AMDSMI_VOLT_TYPE_VDDBOARD INVALID = amdsmi_wrapper.AMDSMI_VOLT_TYPE_INVALID class AmdSmiAcceleratorPartitionResourceType(IntEnum): XCC = amdsmi_wrapper.AMDSMI_ACCELERATOR_XCC ENCODER = amdsmi_wrapper.AMDSMI_ACCELERATOR_ENCODER DECODER = amdsmi_wrapper.AMDSMI_ACCELERATOR_DECODER DMA = amdsmi_wrapper.AMDSMI_ACCELERATOR_DMA JPEG = amdsmi_wrapper.AMDSMI_ACCELERATOR_JPEG MAX = amdsmi_wrapper.AMDSMI_ACCELERATOR_MAX class AmdSmiAcceleratorPartitionType(IntEnum): SPX = amdsmi_wrapper.AMDSMI_ACCELERATOR_PARTITION_SPX DPX = amdsmi_wrapper.AMDSMI_ACCELERATOR_PARTITION_DPX TPX = amdsmi_wrapper.AMDSMI_ACCELERATOR_PARTITION_TPX QPX = amdsmi_wrapper.AMDSMI_ACCELERATOR_PARTITION_QPX CPX = amdsmi_wrapper.AMDSMI_ACCELERATOR_PARTITION_CPX INVALID = amdsmi_wrapper.AMDSMI_ACCELERATOR_PARTITION_INVALID class AmdSmiComputePartitionType(IntEnum): SPX = amdsmi_wrapper.AMDSMI_COMPUTE_PARTITION_SPX DPX = amdsmi_wrapper.AMDSMI_COMPUTE_PARTITION_DPX TPX = amdsmi_wrapper.AMDSMI_COMPUTE_PARTITION_TPX QPX = amdsmi_wrapper.AMDSMI_COMPUTE_PARTITION_QPX CPX = amdsmi_wrapper.AMDSMI_COMPUTE_PARTITION_CPX INVALID = amdsmi_wrapper.AMDSMI_COMPUTE_PARTITION_INVALID class AmdSmiMemoryPartitionType(IntEnum): NPS1 = amdsmi_wrapper.AMDSMI_MEMORY_PARTITION_NPS1 NPS2 = amdsmi_wrapper.AMDSMI_MEMORY_PARTITION_NPS2 NPS4 = amdsmi_wrapper.AMDSMI_MEMORY_PARTITION_NPS4 NPS8 = amdsmi_wrapper.AMDSMI_MEMORY_PARTITION_NPS8 UNKNOWN = amdsmi_wrapper.AMDSMI_MEMORY_PARTITION_UNKNOWN class AmdSmiPowerProfilePresetMasks(IntEnum): CUSTOM_MASK = amdsmi_wrapper.AMDSMI_PWR_PROF_PRST_CUSTOM_MASK VIDEO_MASK = amdsmi_wrapper.AMDSMI_PWR_PROF_PRST_VIDEO_MASK POWER_SAVING_MASK = amdsmi_wrapper.AMDSMI_PWR_PROF_PRST_POWER_SAVING_MASK COMPUTE_MASK = amdsmi_wrapper.AMDSMI_PWR_PROF_PRST_COMPUTE_MASK VR_MASK = amdsmi_wrapper.AMDSMI_PWR_PROF_PRST_VR_MASK THREE_D_FULL_SCR_MASK = amdsmi_wrapper.AMDSMI_PWR_PROF_PRST_3D_FULL_SCR_MASK BOOTUP_DEFAULT = amdsmi_wrapper.AMDSMI_PWR_PROF_PRST_BOOTUP_DEFAULT INVALID = amdsmi_wrapper.AMDSMI_PWR_PROF_PRST_INVALID class AmdSmiGpuBlock(IntEnum): INVALID = amdsmi_wrapper.AMDSMI_GPU_BLOCK_INVALID UMC = amdsmi_wrapper.AMDSMI_GPU_BLOCK_UMC SDMA = amdsmi_wrapper.AMDSMI_GPU_BLOCK_SDMA GFX = amdsmi_wrapper.AMDSMI_GPU_BLOCK_GFX MMHUB = amdsmi_wrapper.AMDSMI_GPU_BLOCK_MMHUB ATHUB = amdsmi_wrapper.AMDSMI_GPU_BLOCK_ATHUB PCIE_BIF = amdsmi_wrapper.AMDSMI_GPU_BLOCK_PCIE_BIF HDP = amdsmi_wrapper.AMDSMI_GPU_BLOCK_HDP XGMI_WAFL = amdsmi_wrapper.AMDSMI_GPU_BLOCK_XGMI_WAFL DF = amdsmi_wrapper.AMDSMI_GPU_BLOCK_DF SMN = amdsmi_wrapper.AMDSMI_GPU_BLOCK_SMN SEM = amdsmi_wrapper.AMDSMI_GPU_BLOCK_SEM MP0 = amdsmi_wrapper.AMDSMI_GPU_BLOCK_MP0 MP1 = amdsmi_wrapper.AMDSMI_GPU_BLOCK_MP1 FUSE = amdsmi_wrapper.AMDSMI_GPU_BLOCK_FUSE MCA = amdsmi_wrapper.AMDSMI_GPU_BLOCK_MCA VCN = amdsmi_wrapper.AMDSMI_GPU_BLOCK_VCN JPEG = amdsmi_wrapper.AMDSMI_GPU_BLOCK_JPEG IH = amdsmi_wrapper.AMDSMI_GPU_BLOCK_IH MPIO = amdsmi_wrapper.AMDSMI_GPU_BLOCK_MPIO RESERVED = amdsmi_wrapper.AMDSMI_GPU_BLOCK_RESERVED class AmdSmiRasErrState(IntEnum): NONE = amdsmi_wrapper.AMDSMI_RAS_ERR_STATE_NONE DISABLED = amdsmi_wrapper.AMDSMI_RAS_ERR_STATE_DISABLED PARITY = amdsmi_wrapper.AMDSMI_RAS_ERR_STATE_PARITY SING_C = amdsmi_wrapper.AMDSMI_RAS_ERR_STATE_SING_C MULT_UC = amdsmi_wrapper.AMDSMI_RAS_ERR_STATE_MULT_UC POISON = amdsmi_wrapper.AMDSMI_RAS_ERR_STATE_POISON ENABLED = amdsmi_wrapper.AMDSMI_RAS_ERR_STATE_ENABLED INVALID = amdsmi_wrapper.AMDSMI_RAS_ERR_STATE_INVALID class AmdSmiCperNotifyType(Enum): CMC = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_CMC CPE = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_CPE MCE = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_MCE PCIE = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_PCIE INIT = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_INIT NMI = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_NMI BOOT = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_BOOT DMAr = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_DMAR SEA = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_SEA SEI = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_SEI PEI = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_PEI CXL_COMPONENT = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_CXL_COMPONENT class AmdSmiMemoryType(IntEnum): VRAM = amdsmi_wrapper.AMDSMI_MEM_TYPE_VRAM VIS_VRAM = amdsmi_wrapper.AMDSMI_MEM_TYPE_VIS_VRAM GTT = amdsmi_wrapper.AMDSMI_MEM_TYPE_GTT class AmdSmiFreqInd(IntEnum): MIN = amdsmi_wrapper.AMDSMI_FREQ_IND_MIN MAX = amdsmi_wrapper.AMDSMI_FREQ_IND_MAX INVALID = amdsmi_wrapper.AMDSMI_FREQ_IND_INVALID class AmdSmiXgmiStatus(IntEnum): NO_ERRORS = amdsmi_wrapper.AMDSMI_XGMI_STATUS_NO_ERRORS ERROR = amdsmi_wrapper.AMDSMI_XGMI_STATUS_ERROR MULTIPLE_ERRORS = amdsmi_wrapper.AMDSMI_XGMI_STATUS_MULTIPLE_ERRORS class AmdSmiMemoryPageStatus(IntEnum): RESERVED = amdsmi_wrapper.AMDSMI_MEM_PAGE_STATUS_RESERVED PENDING = amdsmi_wrapper.AMDSMI_MEM_PAGE_STATUS_PENDING UNRESERVABLE = amdsmi_wrapper.AMDSMI_MEM_PAGE_STATUS_UNRESERVABLE class AmdSmiLinkType(IntEnum): AMDSMI_LINK_TYPE_INTERNAL = amdsmi_wrapper.AMDSMI_LINK_TYPE_INTERNAL AMDSMI_LINK_TYPE_XGMI = amdsmi_wrapper.AMDSMI_LINK_TYPE_XGMI AMDSMI_LINK_TYPE_PCIE = amdsmi_wrapper.AMDSMI_LINK_TYPE_PCIE AMDSMI_LINK_TYPE_NOT_APPLICABLE = amdsmi_wrapper.AMDSMI_LINK_TYPE_NOT_APPLICABLE AMDSMI_LINK_TYPE_UNKNOWN = amdsmi_wrapper.AMDSMI_LINK_TYPE_UNKNOWN class AmdSmiUtilizationCounterType(IntEnum): COARSE_GRAIN_GFX_ACTIVITY = amdsmi_wrapper.AMDSMI_COARSE_GRAIN_GFX_ACTIVITY COARSE_GRAIN_MEM_ACTIVITY = amdsmi_wrapper.AMDSMI_COARSE_GRAIN_MEM_ACTIVITY COARSE_DECODER_ACTIVITY = amdsmi_wrapper.AMDSMI_COARSE_DECODER_ACTIVITY FINE_GRAIN_GFX_ACTIVITY = amdsmi_wrapper.AMDSMI_FINE_GRAIN_GFX_ACTIVITY FINE_GRAIN_MEM_ACTIVITY = amdsmi_wrapper.AMDSMI_FINE_GRAIN_MEM_ACTIVITY FINE_DECODER_ACTIVITY = amdsmi_wrapper.AMDSMI_FINE_DECODER_ACTIVITY UTILIZATION_COUNTER_FIRST = amdsmi_wrapper.AMDSMI_UTILIZATION_COUNTER_FIRST UTILIZATION_COUNTER_LAST = amdsmi_wrapper.AMDSMI_UTILIZATION_COUNTER_LAST class AmdSmiProcessorType(IntEnum): UNKNOWN = amdsmi_wrapper.AMDSMI_PROCESSOR_TYPE_UNKNOWN AMD_GPU = amdsmi_wrapper.AMDSMI_PROCESSOR_TYPE_AMD_GPU AMD_CPU = amdsmi_wrapper.AMDSMI_PROCESSOR_TYPE_AMD_CPU NON_AMD_GPU = amdsmi_wrapper.AMDSMI_PROCESSOR_TYPE_NON_AMD_GPU NON_AMD_CPU = amdsmi_wrapper.AMDSMI_PROCESSOR_TYPE_NON_AMD_CPU AMD_CPU_CORE = amdsmi_wrapper.AMDSMI_PROCESSOR_TYPE_AMD_CPU_CORE AMD_APU = amdsmi_wrapper.AMDSMI_PROCESSOR_TYPE_AMD_APU class AmdSmiRegType(IntEnum): XGMI = amdsmi_wrapper.AMDSMI_REG_XGMI WAFL = amdsmi_wrapper.AMDSMI_REG_WAFL PCIE = amdsmi_wrapper.AMDSMI_REG_PCIE USR = amdsmi_wrapper.AMDSMI_REG_USR USR1 = amdsmi_wrapper.AMDSMI_REG_USR1 class AmdSmiVirtualizationMode(IntEnum): UNKNOWN = amdsmi_wrapper.AMDSMI_VIRTUALIZATION_MODE_UNKNOWN BAREMETAL = amdsmi_wrapper.AMDSMI_VIRTUALIZATION_MODE_BAREMETAL HOST = amdsmi_wrapper.AMDSMI_VIRTUALIZATION_MODE_HOST GUEST = amdsmi_wrapper.AMDSMI_VIRTUALIZATION_MODE_GUEST PASSTHROUGH = amdsmi_wrapper.AMDSMI_VIRTUALIZATION_MODE_PASSTHROUGH class AmdSmiVramType(IntEnum): UNKNOWN = amdsmi_wrapper.AMDSMI_VRAM_TYPE_UNKNOWN HBM = amdsmi_wrapper.AMDSMI_VRAM_TYPE_HBM HBM2 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_HBM2 HBM2E = amdsmi_wrapper.AMDSMI_VRAM_TYPE_HBM2E HBM3 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_HBM3 DDR2 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_DDR2 DDR3 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_DDR3 DDR4 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_DDR4 GDDR1 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_GDDR1 GDDR2 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_GDDR2 GDDR3 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_GDDR3 GDDR4 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_GDDR4 GDDR5 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_GDDR5 GDDR6 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_GDDR6 GDDR7 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_GDDR7 MAX = amdsmi_wrapper.AMDSMI_VRAM_TYPE__MAX class AmdSmiAffinityScope(IntEnum): NUMA_SCOPE = amdsmi_wrapper.AMDSMI_AFFINITY_SCOPE_NODE SOCKET_SCOPE = amdsmi_wrapper.AMDSMI_AFFINITY_SCOPE_SOCKET class AmdSmiPtlData(IntEnum): I8 = amdsmi_wrapper.AMDSMI_PTL_DATA_FORMAT_I8 F16 = amdsmi_wrapper.AMDSMI_PTL_DATA_FORMAT_F16 BF16 = amdsmi_wrapper.AMDSMI_PTL_DATA_FORMAT_BF16 F32 = amdsmi_wrapper.AMDSMI_PTL_DATA_FORMAT_F32 F64 = amdsmi_wrapper.AMDSMI_PTL_DATA_FORMAT_F64 INVALID = amdsmi_wrapper.AMDSMI_PTL_DATA_FORMAT_INVALID class AmdSmiPowerCapType(IntEnum): PPT0 = amdsmi_wrapper.AMDSMI_POWER_CAP_TYPE_PPT0 PPT1 = amdsmi_wrapper.AMDSMI_POWER_CAP_TYPE_PPT1 class AmdSmiEventReader: def __init__( self, processor_handle: processor_handle_t, event_types: List[AmdSmiEvtNotificationType] ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(event_types, Iterable): raise AmdSmiParameterException( event_types, Iterable ) for event_type in event_types: if not isinstance(event_type, AmdSmiEvtNotificationType): raise AmdSmiParameterException( event_type, AmdSmiEvtNotificationType ) self.processor_handle = processor_handle mask = 0 for event_type in event_types: if event_type != AmdSmiEvtNotificationType.NONE: mask |= (1 << (int(event_type) - 1)) _check_res(amdsmi_wrapper.amdsmi_init_gpu_event_notification(processor_handle)) _check_res(amdsmi_wrapper.amdsmi_set_gpu_event_notification_mask( processor_handle, ctypes.c_uint64(mask))) def read(self, timestamp, num_elem=10): c_count = ctypes.c_uint32(num_elem) event_info = (amdsmi_wrapper.amdsmi_evt_notification_data_t * num_elem)() _check_res( amdsmi_wrapper.amdsmi_get_gpu_event_notification( ctypes.c_int(timestamp), ctypes.byref(c_count), event_info, ) ) ret = [] for i in range(c_count.value): unique_event_values = set(event.value for event in AmdSmiEvtNotificationType) if event_info[i].event in unique_event_values: if AmdSmiEvtNotificationType(event_info[i].event).name != "NONE": processor_handle = amdsmi_wrapper.amdsmi_processor_handle(event_info[i].processor_handle) ret.append( { "processor_handle": processor_handle, "event": AmdSmiEvtNotificationType(event_info[i].event).name, "message": event_info[i].message.decode("utf-8"), } ) return ret def stop(self): _check_res(amdsmi_wrapper.amdsmi_stop_gpu_event_notification( self.processor_handle)) def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): self.stop() def _format_bad_page_info(bad_page_info, bad_page_count: ctypes.c_uint32) -> List[Dict]: """ Format bad page info data retrieved. Parameters: bad_page_info(`amdsmi_retired_page_record_t`): A populated list of amdsmi_retired_page_record_t(s) retrieved. Ex: (amdsmi_wrapper.amdsmi_retired_page_record_t * #)() bad_page_count(`c_uint32`): Bad page count. Returns: `list`: List containing formatted bad pages. Can be empty """ if bad_page_count == 0: return [] # Check if each struct within bad_page_info is valid for bad_page in bad_page_info: if not isinstance(bad_page, amdsmi_wrapper.amdsmi_retired_page_record_t): raise AmdSmiParameterException( bad_page, amdsmi_wrapper.amdsmi_retired_page_record_t ) table_records = [] for i in range(bad_page_count.value): table_records.append( { "value": i, "page_address": bad_page_info[i].page_address, "page_size": bad_page_info[i].page_size, "status": bad_page_info[i].status, } ) return table_records def _format_bdf(amdsmi_bdf: amdsmi_wrapper.amdsmi_bdf_t) -> str: """ Format BDF struct to readable data. Parameters: amdsmi_bdf(`amdsmi_bdf_t`): Struct containing BDF data that will be formatted. Returns: `str`: String containing BDF data in a readable format. """ domain = hex(amdsmi_bdf.struct_amdsmi_bdf_t.domain_number)[2:].zfill(4) bus = hex(amdsmi_bdf.struct_amdsmi_bdf_t.bus_number)[2:].zfill(2) device = hex(amdsmi_bdf.struct_amdsmi_bdf_t.device_number)[2:].zfill(2) function = hex(amdsmi_bdf.struct_amdsmi_bdf_t.function_number)[2:] return domain + ":" + bus + ":" + device + "." + function def _check_res(ret_code) -> None: """ Wrapper for amdsmi function calls. Checks the status returned by the call. Raises exceptions if the status was inappropriate. Parameters: ret_code(`amdsmi_status_t`): Status code returned by function call. Returns: `None`. """ if ret_code == amdsmi_wrapper.AMDSMI_STATUS_RETRY: raise AmdSmiRetryException() if ret_code == amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT: raise AmdSmiTimeoutException() if ret_code != amdsmi_wrapper.AMDSMI_STATUS_SUCCESS: raise AmdSmiLibraryException(ret_code) def _parse_bdf(bdf): if bdf is None: return None extended_regex = re.compile( r'^([0-9a-fA-F]{4}):([0-9a-fA-F]{2}):([0-1][0-9a-fA-F])\.([0-7])$') if extended_regex.match(bdf) is None: simple_regex = re.compile( r'^([0-9a-fA-F]{2}):([0-1][0-9a-fA-F])\.([0-7])$') if simple_regex.match(bdf) is None: return None else: match = simple_regex.match(bdf) if match: return [0] + [int(x, 16) for x in match.groups()] else: return None else: match = extended_regex.match(bdf) if match: return [int(x, 16) for x in match.groups()] return None def _make_amdsmi_bdf_from_list(bdf): if len(bdf) != 4: return None amdsmi_bdf = amdsmi_wrapper.amdsmi_bdf_t() amdsmi_bdf.struct_amdsmi_bdf_t.function_number = bdf[3] amdsmi_bdf.struct_amdsmi_bdf_t.device_number = bdf[2] amdsmi_bdf.struct_amdsmi_bdf_t.bus_number = bdf[1] amdsmi_bdf.struct_amdsmi_bdf_t.domain_number = bdf[0] return amdsmi_bdf def _pad_hex_value(value, length) -> str: """ Pad a hexadecimal value with a given length of zeros :param value: A hexadecimal value to be padded with zeros :param length: Number of zeros to pad the hexadecimal value :param return original string string or padded hex of confirmed hex output (using length provided) """ # Ensure value entered meets the minimum length and is hexadecimal if len(value) > 2 and length > 1 and value[:2].lower() == '0x' \ and all(c in '0123456789abcdefABCDEF' for c in value[2:]): # Pad with zeros after '0x' prefix return '0x' + value[2:].zfill(length) return value def _validate_if_max_uint(value, uint_type: MaxUIntegerTypes, isActivity=False, isBool=False) -> Union[str, bool, int, list]: return_val = "N/A" if not isinstance(value, list): if (value == uint_type) or (isActivity and value > 100): return return_val if isBool: return bool(value) return value else: return_val = [] for _, v in enumerate(value): if (v == uint_type) or (isActivity and v > 100): return_val.append("N/A") else: return_val.append(v) if isBool: return bool(return_val) return return_val def _notifyTypeToString(notify_type_b): guid = [] # Iterate over only the first 8 bytes, but backwards for i in notify_type_b[7::-1]: guid.append(format(i, '02x')) hex_string = "".join(guid) hex_value = int(hex_string, 16) if hex_value in AmdSmiCperNotifyType._value2member_map_: # Convert to the corresponding enum name return AmdSmiCperNotifyType(hex_value).name else: return "Unknown" def _NA_amdsmi_get_gpu_metrics_info() -> Dict[str, str]: """ Get 'N/A' metric values for gpu_metric, used for exception handling. Parameters: None Returns: Dict[str, str]: A dictionary with keys as metric names and values as 'N/A'. This is used to indicate that the metric is not available or applicable. Raises: N/A """ na_gpu_metrics_info = { "common_header.structure_size": "N/A", "common_header.format_revision": "N/A", "common_header.content_revision": "N/A", "temperature_edge": "N/A", "temperature_hotspot": "N/A", "temperature_mem": "N/A", "temperature_vrgfx": "N/A", "temperature_vrsoc": "N/A", "temperature_vrmem": "N/A", "average_gfx_activity": "N/A", "average_umc_activity": "N/A", "average_mm_activity": "N/A", "average_socket_power": "N/A", "energy_accumulator": "N/A", "system_clock_counter": "N/A", "average_gfxclk_frequency": "N/A", "average_socclk_frequency": "N/A", "average_uclk_frequency": "N/A", "average_vclk0_frequency": "N/A", "average_dclk0_frequency": "N/A", "average_vclk1_frequency": "N/A", "average_dclk1_frequency": "N/A", "current_gfxclk": "N/A", "current_socclk": "N/A", "current_uclk": "N/A", "current_vclk0": "N/A", "current_dclk0": "N/A", "current_vclk1": "N/A", "current_dclk1": "N/A", "throttle_status": "N/A", "current_fan_speed": "N/A", "pcie_link_width": "N/A", "pcie_link_speed": "N/A", "gfx_activity_acc": "N/A", "mem_activity_acc": "N/A", "temperature_hbm": "N/A", "firmware_timestamp": "N/A", "voltage_soc": "N/A", "voltage_gfx": "N/A", "voltage_mem": "N/A", "indep_throttle_status": "N/A", "current_socket_power": "N/A", "vcn_activity": "N/A", "gfxclk_lock_status": "N/A", "xgmi_link_width": "N/A", "xgmi_link_speed": "N/A", "pcie_bandwidth_acc": "N/A", "pcie_bandwidth_inst": "N/A", "pcie_l0_to_recov_count_acc": "N/A", "pcie_replay_count_acc": "N/A", "pcie_replay_rover_count_acc": "N/A", "xgmi_read_data_acc": "N/A", "xgmi_write_data_acc": "N/A", "current_gfxclks": "N/A", "current_socclks": "N/A", "current_vclk0s": "N/A", "current_dclk0s": "N/A", "jpeg_activity": "N/A", "pcie_nak_sent_count_acc": "N/A", "pcie_nak_rcvd_count_acc": "N/A", "accumulation_counter": "N/A", "prochot_residency_acc": "N/A", "ppt_residency_acc": "N/A", "socket_thm_residency_acc": "N/A", "vr_thm_residency_acc": "N/A", "hbm_thm_residency_acc": "N/A", "num_partition": "N/A", "xcp_stats.gfx_busy_inst": "N/A", "xcp_stats.jpeg_busy": "N/A", "xcp_stats.vcn_busy": "N/A", "xcp_stats.gfx_busy_acc": "N/A", "xcp_stats.gfx_below_host_limit_acc": "N/A", "xcp_stats.gfx_below_host_limit_ppt_acc": "N/A", "xcp_stats.gfx_below_host_limit_thm_acc": "N/A", "xcp_stats.gfx_low_utilization_acc": "N/A", "xcp_stats.gfx_below_host_limit_total_acc": "N/A", "pcie_lc_perf_other_end_recovery": "N/A", "vram_max_bandwidth": "N/A", "xgmi_link_status": "N/A" } return na_gpu_metrics_info def amdsmi_get_socket_handles() -> List[c_void_p]: """ Function that gets socket handles. Wraps the same named function call. Parameters: `None`. Returns: `List`: List containing all of the found socket handles. """ socket_count = ctypes.c_uint32(0) null_ptr = POINTER(amdsmi_wrapper.amdsmi_socket_handle)() _check_res( amdsmi_wrapper.amdsmi_get_socket_handles( ctypes.byref(socket_count), null_ptr) ) socket_handles = (amdsmi_wrapper.amdsmi_socket_handle * socket_count.value)() _check_res( amdsmi_wrapper.amdsmi_get_socket_handles( ctypes.byref(socket_count), socket_handles) ) sockets = [ amdsmi_wrapper.amdsmi_socket_handle(socket_handles[sock_idx]) for sock_idx in range(socket_count.value) ] return sockets def amdsmi_get_cpusocket_handles() -> List[c_void_p]: """ Function that gets cpu socket handles. Wraps the same named function call. Parameters: `None`. Returns: `List`: List containing all of the found cpu socket handles. """ cpu_count = ctypes.c_uint32(0) null_ptr = POINTER(amdsmi_wrapper.amdsmi_processor_handle)() _check_res( amdsmi_wrapper.amdsmi_get_cpu_handles( ctypes.byref(cpu_count), null_ptr) ) proc_handles = (amdsmi_wrapper.amdsmi_processor_handle * cpu_count.value)() _check_res( amdsmi_wrapper.amdsmi_get_cpu_handles( ctypes.byref(cpu_count), proc_handles) ) cpu_handles = [ amdsmi_wrapper.amdsmi_processor_handle(proc_handles[sock_idx]) for sock_idx in range(cpu_count.value) ] return cpu_handles def amdsmi_get_socket_info(socket_handle): if not isinstance(socket_handle, amdsmi_wrapper.amdsmi_socket_handle): raise AmdSmiParameterException( socket_handle, amdsmi_wrapper.amdsmi_socket_handle) socket_info = ctypes.create_string_buffer(128) _check_res( amdsmi_wrapper.amdsmi_get_socket_info( socket_handle, ctypes.c_size_t(128), socket_info) ) return socket_info.value.decode() def amdsmi_get_processor_info(processor_handle): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle) processor_info = ctypes.create_string_buffer(128) _check_res( amdsmi_wrapper.amdsmi_get_processor_info( processor_handle, ctypes.c_size_t(128), processor_info) ) return processor_info.value.decode() def amdsmi_get_processor_handles() -> List[c_void_p]: socket_handles = amdsmi_get_socket_handles() devices = [] for socket in socket_handles: device_count = ctypes.c_uint32() null_ptr = POINTER(amdsmi_wrapper.amdsmi_processor_handle)() _check_res( amdsmi_wrapper.amdsmi_get_processor_handles( socket, ctypes.byref(device_count), null_ptr, ) ) processor_handles = ( amdsmi_wrapper.amdsmi_processor_handle * device_count.value)() _check_res( amdsmi_wrapper.amdsmi_get_processor_handles( socket, ctypes.byref(device_count), processor_handles, ) ) devices.extend( [ amdsmi_wrapper.amdsmi_processor_handle(processor_handles[dev_idx]) for dev_idx in range(device_count.value) ] ) return devices def amdsmi_get_cpucore_handles() -> List[c_void_p]: cores_count = ctypes.c_uint32(0) null_ptr = POINTER(amdsmi_wrapper.amdsmi_processor_handle)() _check_res( amdsmi_wrapper.amdsmi_get_cpucore_handles( ctypes.byref(cores_count), null_ptr) ) proc_handles = (amdsmi_wrapper.amdsmi_processor_handle * cores_count.value)() _check_res( amdsmi_wrapper.amdsmi_get_cpucore_handles( ctypes.byref(cores_count), proc_handles) ) core_handles = [ amdsmi_wrapper.amdsmi_processor_handle(proc_handles[sock_idx]) for sock_idx in range(cores_count.value) ] return core_handles def amdsmi_get_cpu_hsmp_proto_ver(processor_handle: processor_handle_t) -> int: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) proto_ver = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_cpu_hsmp_proto_ver( processor_handle, ctypes.byref(proto_ver) ) ) return proto_ver.value def amdsmi_get_cpu_smu_fw_version( processor_handle: processor_handle_t) -> Dict[str, int]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) smu_fw = amdsmi_wrapper.amdsmi_smu_fw_version_t() _check_res(amdsmi_wrapper.amdsmi_get_cpu_smu_fw_version(processor_handle, smu_fw)) return { "smu_fw_debug_ver_num": smu_fw.debug, "smu_fw_minor_ver_num": smu_fw.minor, "smu_fw_major_ver_num": smu_fw.major } def amdsmi_get_cpu_hsmp_driver_version( processor_handle: processor_handle_t) -> Dict[str, int]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) hsmp_driver_version = amdsmi_wrapper.amdsmi_hsmp_driver_version_t() _check_res(amdsmi_wrapper.amdsmi_get_cpu_hsmp_driver_version(processor_handle, hsmp_driver_version)) return { "hsmp_driver_major_ver_num": hsmp_driver_version.major, "hsmp_driver_minor_ver_num": hsmp_driver_version.minor, } def amdsmi_get_cpu_core_energy( processor_handle: processor_handle_t ) -> str: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) penergy = ctypes.c_uint64() _check_res( amdsmi_wrapper.amdsmi_get_cpu_core_energy( processor_handle, ctypes.byref(penergy) ) ) return f"{float(penergy.value * pow(10, -6))} J" def amdsmi_get_cpu_socket_energy( processor_handle: processor_handle_t ) -> str: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) penergy = ctypes.c_uint64() _check_res( amdsmi_wrapper.amdsmi_get_cpu_socket_energy( processor_handle, ctypes.byref(penergy) ) ) return f"{float(penergy.value * pow(10, -6))} J" def amdsmi_get_threads_per_core(): threads_per_core = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_threads_per_core( ctypes.byref(threads_per_core) ) ) return threads_per_core.value def amdsmi_get_cpu_prochot_status( processor_handle: processor_handle_t ) -> int: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) prochot = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_cpu_prochot_status( processor_handle, ctypes.byref(prochot) ) ) return prochot.value def amdsmi_get_cpu_fclk_mclk( processor_handle: processor_handle_t ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) fclk = ctypes.c_uint32() mclk = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_cpu_fclk_mclk( processor_handle, ctypes.byref(fclk), ctypes.byref(mclk) ) ) return { "fclk": f"{fclk.value} MHz", "mclk": f"{mclk.value} MHz" } def amdsmi_get_cpu_cclk_limit( processor_handle: processor_handle_t ) -> str: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) cclk = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_cpu_cclk_limit( processor_handle, ctypes.byref(cclk) ) ) return f"{cclk.value} MHz" def amdsmi_get_cpu_socket_current_active_freq_limit( processor_handle: processor_handle_t ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) amdsmi_wrapper.amdsmi_get_cpu_socket_current_active_freq_limit.argtypes = [amdsmi_wrapper.amdsmi_processor_handle, POINTER(ctypes.c_uint16), POINTER(ctypes.c_char_p * len(amdsmi_wrapper.amdsmi_hsmp_freqlimit_src_names))] freq = ctypes.c_uint16() src_type = (ctypes.c_char_p * len(amdsmi_wrapper.amdsmi_hsmp_freqlimit_src_names))() _check_res( amdsmi_wrapper.amdsmi_get_cpu_socket_current_active_freq_limit( processor_handle, ctypes.byref(freq), src_type ) ) freq_src = [] for names in src_type: if names is not None: freq_src.append(names.decode('utf-8')) return { "freq": f"{freq.value} MHz", "freq_src": f"{freq_src}" } def amdsmi_get_cpu_socket_freq_range( processor_handle: processor_handle_t ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) freq_max = ctypes.c_uint16() freq_min = ctypes.c_uint16() _check_res( amdsmi_wrapper.amdsmi_get_cpu_socket_freq_range( processor_handle, ctypes.byref(freq_max), ctypes.byref(freq_min) ) ) return { "max_socket_freq": f"{freq_max.value} MHz", "min_socket_freq": f"{freq_min.value} MHz" } def amdsmi_get_cpu_core_current_freq_limit( processor_handle: processor_handle_t ) -> str: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) freq = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_cpu_core_current_freq_limit( processor_handle, ctypes.byref(freq) ) ) return f"{freq.value} MHz" def amdsmi_get_cpu_socket_power( processor_handle: processor_handle_t ) -> str: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) ppower = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_cpu_socket_power( processor_handle, ctypes.byref(ppower) ) ) return f"{ppower.value} mW" def amdsmi_get_cpu_socket_power_cap( processor_handle: processor_handle_t ) -> int: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) pcap = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_cpu_socket_power_cap( processor_handle, ctypes.byref(pcap) ) ) # in mW return pcap.value def amdsmi_get_cpu_socket_power_cap_max( processor_handle: processor_handle_t ) -> str: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) pmax = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_cpu_socket_power_cap_max( processor_handle, ctypes.byref(pmax) ) ) return f"{pmax.value} mW" def amdsmi_get_cpu_pwr_svi_telemetry_all_rails( processor_handle: processor_handle_t ) -> str: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) power = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_cpu_pwr_svi_telemetry_all_rails( processor_handle, ctypes.byref(power) ) ) return f"{power.value} mW" def amdsmi_set_cpu_socket_power_cap( processor_handle: processor_handle_t, power_cap: int ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(power_cap, int): raise AmdSmiParameterException(power_cap, int) power_cap_32 = ctypes.c_uint32(power_cap) _check_res( amdsmi_wrapper.amdsmi_set_cpu_socket_power_cap( processor_handle, power_cap_32) ) def amdsmi_set_cpu_pwr_efficiency_mode( processor_handle: processor_handle_t, mode: int ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(mode, int): raise AmdSmiParameterException(mode, int) mode_8 = ctypes.c_uint8(mode) _check_res( amdsmi_wrapper.amdsmi_set_cpu_pwr_efficiency_mode( processor_handle, mode_8) ) def amdsmi_get_cpu_core_boostlimit( processor_handle: processor_handle_t ) -> int: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) boostlimit = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_cpu_core_boostlimit( processor_handle, ctypes.byref(boostlimit) ) ) # In MHz" return boostlimit.value def amdsmi_get_cpu_socket_c0_residency( processor_handle: processor_handle_t ) -> str: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) c0_residency = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_cpu_socket_c0_residency( processor_handle, ctypes.byref(c0_residency) ) ) return f"{c0_residency.value} %" def amdsmi_set_cpu_core_boostlimit( processor_handle: processor_handle_t, boostlimit: int ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(boostlimit, int): raise AmdSmiParameterException(boostlimit, int) boostlimit_32 = ctypes.c_uint32(boostlimit) _check_res( amdsmi_wrapper.amdsmi_set_cpu_core_boostlimit( processor_handle, boostlimit_32) ) def amdsmi_set_cpu_socket_boostlimit( processor_handle: processor_handle_t, boostlimit: int ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(boostlimit, int): raise AmdSmiParameterException(boostlimit, int) boostlimit_32 = ctypes.c_uint32(boostlimit) _check_res( amdsmi_wrapper.amdsmi_set_cpu_socket_boostlimit( processor_handle, boostlimit_32) ) def amdsmi_get_cpu_ddr_bw(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) ddr_bw = amdsmi_wrapper.amdsmi_ddr_bw_metrics_t() _check_res(amdsmi_wrapper.amdsmi_get_cpu_ddr_bw(processor_handle, ddr_bw)) return { "ddr_bw_max_bw": f"{ddr_bw.max_bw} Gbps", "ddr_bw_utilized_bw": f"{ddr_bw.utilized_bw} Gbps", "ddr_bw_utilized_pct": f"{ddr_bw.utilized_pct} %" } def amdsmi_get_cpu_socket_temperature( processor_handle: processor_handle_t ) -> str: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) ptmon = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_cpu_socket_temperature( processor_handle, ctypes.byref(ptmon) ) ) return f"{ptmon.value} Degrees C" def amdsmi_get_cpu_dimm_temp_range_and_refresh_rate( processor_handle: processor_handle_t, dimm_addr: int): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(dimm_addr, int): raise AmdSmiParameterException(dimm_addr, int) dimm_addr_8 = ctypes.c_uint8(dimm_addr) dimm = amdsmi_wrapper.amdsmi_temp_range_refresh_rate_t() _check_res(amdsmi_wrapper.amdsmi_get_cpu_dimm_temp_range_and_refresh_rate(processor_handle, dimm_addr_8, ctypes.byref(dimm))) return { "dimm_temperature_range": dimm.range, "dimm_refresh_rate": dimm.ref_rate } def amdsmi_get_cpu_dimm_power_consumption( processor_handle: processor_handle_t, dimm_addr: int): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(dimm_addr, int): raise AmdSmiParameterException(dimm_addr, int) dimm_addr_8 = ctypes.c_uint8(dimm_addr) dimm = amdsmi_wrapper.amdsmi_dimm_power_t() _check_res(amdsmi_wrapper.amdsmi_get_cpu_dimm_power_consumption(processor_handle, dimm_addr_8, ctypes.byref(dimm))) return { "dimm_power_consumed": f"{dimm.power} mW", "dimm_power_update_rate": f"{dimm.update_rate} ms", "dimm_dimm_addr": dimm.dimm_addr } def amdsmi_get_cpu_dimm_thermal_sensor( processor_handle: processor_handle_t, dimm_addr: int): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(dimm_addr, int): raise AmdSmiParameterException(dimm_addr, int) dimm_addr_8 = ctypes.c_uint8(dimm_addr) dimm_thermal = amdsmi_wrapper.amdsmi_dimm_thermal_t() _check_res(amdsmi_wrapper.amdsmi_get_cpu_dimm_thermal_sensor(processor_handle, dimm_addr_8, ctypes.byref(dimm_thermal))) return { "dimm_thermal_sensor_value": dimm_thermal.sensor, "dimm_thermal_update_rate": f"{dimm_thermal.update_rate} ms", "dimm_thermal_dimm_addr": dimm_thermal.dimm_addr, "dimm_thermal_temperature": f"{dimm_thermal.temp} Degrees C" } def amdsmi_set_cpu_xgmi_width( processor_handle: processor_handle_t, min_width: int, max_width: int ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(min_width, int): raise AmdSmiParameterException(min_width, int) if not isinstance(max_width, int): raise AmdSmiParameterException(max_width, int) min_width_8 = ctypes.c_uint8(min_width) max_width_8 = ctypes.c_uint8(max_width) _check_res( amdsmi_wrapper.amdsmi_set_cpu_xgmi_width( processor_handle, min_width_8, max_width_8) ) def amdsmi_set_cpu_gmi3_link_width_range( processor_handle: processor_handle_t, min_link_width: int, max_link_width: int ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(min_link_width, int): raise AmdSmiParameterException(min_link_width, int) if not isinstance(max_link_width, int): raise AmdSmiParameterException(max_link_width, int) min_link_width_8 = ctypes.c_uint8(min_link_width) max_link_width_8 = ctypes.c_uint8(max_link_width) _check_res( amdsmi_wrapper.amdsmi_set_cpu_gmi3_link_width_range( processor_handle, min_link_width_8, max_link_width_8) ) def amdsmi_cpu_apb_enable( processor_handle: processor_handle_t ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) _check_res( amdsmi_wrapper.amdsmi_cpu_apb_enable(processor_handle) ) def amdsmi_cpu_apb_disable( processor_handle: processor_handle_t, pstate: int ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(pstate, int): raise AmdSmiParameterException(pstate, int) pstate_8 = ctypes.c_uint8(pstate) _check_res( amdsmi_wrapper.amdsmi_cpu_apb_disable( processor_handle, pstate_8) ) def amdsmi_set_cpu_socket_lclk_dpm_level( processor_handle: processor_handle_t, nbio_id: int, min_val: int, max_val: int ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(nbio_id, int): raise AmdSmiParameterException(nbio_id, int) if not isinstance(min_val, int): raise AmdSmiParameterException(min_val, int) if not isinstance(max_val, int): raise AmdSmiParameterException(max_val, int) nbio_id_8 = ctypes.c_uint8(nbio_id) min_val_8 = ctypes.c_uint8(min_val) max_val_8 = ctypes.c_uint8(max_val) _check_res( amdsmi_wrapper.amdsmi_set_cpu_socket_lclk_dpm_level( processor_handle, nbio_id_8, min_val_8, max_val_8) ) def amdsmi_get_cpu_socket_lclk_dpm_level( processor_handle: processor_handle_t, nbio_id: int ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(nbio_id, int): raise AmdSmiParameterException(nbio_id, int) nbio_id_8 = ctypes.c_uint8(nbio_id) dpm_level = amdsmi_wrapper.amdsmi_dpm_level_t() _check_res(amdsmi_wrapper.amdsmi_get_cpu_socket_lclk_dpm_level(processor_handle, nbio_id_8, dpm_level)) return { "nbio_max_dpm_level": dpm_level.max_dpm_level, "nbio_min_dpm_level": dpm_level.min_dpm_level } def amdsmi_set_cpu_pcie_link_rate( processor_handle: processor_handle_t, rate_ctrl: int ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(rate_ctrl, int): raise AmdSmiParameterException(rate_ctrl, int) rate_ctrl_8 = ctypes.c_uint8(rate_ctrl) prev_mode_8 = ctypes.c_uint8() _check_res( amdsmi_wrapper.amdsmi_set_cpu_pcie_link_rate( processor_handle, rate_ctrl_8, ctypes.byref(prev_mode_8)) ) return f"{prev_mode_8.value}" def amdsmi_set_cpu_df_pstate_range( processor_handle: processor_handle_t, max_pstate: int, min_pstate: int ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(max_pstate, int): raise AmdSmiParameterException(max_pstate, int) if not isinstance(min_pstate, int): raise AmdSmiParameterException(min_pstate, int) max_pstate_8 = ctypes.c_uint8(max_pstate) min_pstate_8 = ctypes.c_uint8(min_pstate) _check_res( amdsmi_wrapper.amdsmi_set_cpu_df_pstate_range( processor_handle, max_pstate_8, min_pstate_8)) def amdsmi_get_cpu_current_io_bandwidth( processor_handle: processor_handle_t, encoding: int, link_name: str ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(encoding, int): raise AmdSmiParameterException(encoding, int) if not isinstance(link_name, str): raise AmdSmiParameterException(link_name, str) link = amdsmi_wrapper.amdsmi_link_id_bw_type_t() link.bw_type = ctypes.c_uint32(encoding) link.link_name = ctypes.create_string_buffer(link_name.encode('utf-8')) io_bw = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_cpu_current_io_bandwidth( processor_handle, link, ctypes.byref(io_bw)) ) return f"{io_bw.value} Mbps" def amdsmi_get_cpu_current_xgmi_bw( processor_handle: processor_handle_t, encoding: int, link_name: str ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(encoding, int): raise AmdSmiParameterException(encoding, int) if not isinstance(link_name, str): raise AmdSmiParameterException(link_name, str) link = amdsmi_wrapper.amdsmi_link_id_bw_type_t() link.bw_type = ctypes.c_uint32(encoding) link.link_name = ctypes.create_string_buffer(link_name.encode('utf-8')) xgmi_bw = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_cpu_current_xgmi_bw( processor_handle, link, ctypes.byref(xgmi_bw)) ) return f"{xgmi_bw.value} Mbps" def amdsmi_get_hsmp_metrics_table_version( processor_handle: processor_handle_t ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) metric_tbl_version = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_hsmp_metrics_table_version( processor_handle, ctypes.byref(metric_tbl_version)) ) return metric_tbl_version.value def amdsmi_set_cpu_rail_isofreq_policy( processor_handle: processor_handle_t, value: int): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) _check_res( amdsmi_wrapper.amdsmi_set_cpu_rail_isofreq_policy(processor_handle, value) ) def amdsmi_get_cpu_rail_isofreq_policy( processor_handle: processor_handle_t, ) -> int: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) cpurailiso = ctypes.c_uint8() _check_res( amdsmi_wrapper.amdsmi_get_cpu_rail_isofreq_policy( processor_handle, ctypes.byref(cpurailiso) ) ) return cpurailiso.value def amdsmi_get_dfc_ctrl( processor_handle: processor_handle_t, ) -> int: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) dfc_ctrl = ctypes.c_uint8() _check_res( amdsmi_wrapper.amdsmi_get_dfc_ctrl( processor_handle, ctypes.byref(dfc_ctrl) ) ) return dfc_ctrl.value def amdsmi_set_dfc_ctrl( processor_handle: processor_handle_t, value: int): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) _check_res( amdsmi_wrapper.amdsmi_set_dfc_ctrl(processor_handle, value) ) # Get 2's complement of 32 bit unsigned integer def check_msb_32(num): msb = 1 << (NO_OF_32BITS - 1) # If msb = 1 , then take 2's complement of the number if num & msb: num = ~num + 1 return num # Get 2's complement of 64 bit unsigned integer def check_msb_64(num): msb = 1 << (NO_OF_64BITS - 1) # If msb = 1 , then take 2's complement of the number if num & msb: num = ~num + 1 return num def amdsmi_get_hsmp_metrics_table( processor_handle: processor_handle_t ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) mtbl = amdsmi_wrapper.amdsmi_hsmp_metrics_table_t() # Encodings for the metric table defined for hsmp fraction_q10 = 1 / math.pow(2, 10) fraction_uq10 = fraction_q10 fraction_uq16 = 1 / math.pow(2, 16) _check_res( amdsmi_wrapper.amdsmi_get_hsmp_metrics_table( processor_handle, mtbl ) ) rawtime = int(mtbl.timestamp) rawtime = time() timeinfo = localtime(rawtime) return { "mtbl_accumulation_counter": mtbl.accumulation_counter, "mtbl_max_socket_temperature": f"{round(check_msb_32(mtbl.max_socket_temperature) * fraction_q10 ,3)} °C", "mtbl_max_vr_temperature": f"{round(check_msb_32(mtbl.max_vr_temperature) * fraction_q10 ,3)} °C", "mtbl_max_hbm_temperature": f"{round(check_msb_32(mtbl.max_hbm_temperature) * fraction_q10 ,3)} °C", "mtbl_max_socket_temperature_acc": f"{round(check_msb_64(mtbl.max_socket_temperature_acc) * fraction_q10 ,3)} °C", "mtbl_max_vr_temperature_acc": f"{round(check_msb_64(mtbl.max_vr_temperature_acc) * fraction_q10 ,3)} °C", "mtbl_max_hbm_temperature_acc": f"{round(check_msb_64(mtbl.max_hbm_temperature_acc) * fraction_q10 ,3)} °C", "mtbl_socket_power_limit": f"{round(mtbl.socket_power_limit * fraction_uq10 ,3)} W", "mtbl_max_socket_power_limit": f"{round(mtbl.max_socket_power_limit * fraction_uq10 ,3)} W", "mtbl_socket_power": f"{round(mtbl.socket_power * fraction_uq10 ,3)} W", "mtbl_timestamp_raw": mtbl.timestamp, "mtbl_timestamp_readable": f"{asctime(timeinfo)}", "mtbl_socket_energy_acc": f"{round((mtbl.socket_energy_acc * fraction_uq16)/KILO ,3)} kJ", "mtbl_ccd_energy_acc": f"{round((mtbl.ccd_energy_acc * fraction_uq16)/KILO ,3)} kJ", "mtbl_xcd_energy_acc": f"{round((mtbl.xcd_energy_acc * fraction_uq16)/KILO ,3)} kJ", "mtbl_aid_energy_acc": f"{round((mtbl.aid_energy_acc * fraction_uq16)/KILO ,3)} kJ", "mtbl_hbm_energy_acc": f"{round((mtbl.hbm_energy_acc * fraction_uq16)/KILO ,3)} kJ", "mtbl_cclk_frequency_limit": f"{round(mtbl.cclk_frequency_limit * fraction_uq10 ,3)} GHz", "mtbl_gfxclk_frequency_limit": f"{round(mtbl.gfxclk_frequency_limit * fraction_uq10 ,3)} MHz", "mtbl_fclk_frequency": f"{round(mtbl.fclk_frequency * fraction_uq10 ,3)} MHz", "mtbl_uclk_frequency": f"{round(mtbl.uclk_frequency * fraction_uq10 ,3)} MHz", "mtbl_socclk_frequency": f"{[round(x*fraction_uq10 ,3) for x in list(mtbl.socclk_frequency)]} MHz", "mtbl_vclk_frequency": f"{[round(x*fraction_uq10 ,3) for x in list(mtbl.vclk_frequency)]} MHz", "mtbl_dclk_frequency": f"{[round(x*fraction_uq10 ,3) for x in list(mtbl.dclk_frequency)]} MHz", "mtbl_lclk_frequency": f"{[round(x*fraction_uq10 ,3) for x in list(mtbl.lclk_frequency)]} MHz", "mtbl_fclk_frequency_table": f"{[round(x*fraction_uq10 ,3) for x in list(mtbl.fclk_frequency_table)]} MHz", "mtbl_uclk_frequency_table": f"{[round(x*fraction_uq10 ,3) for x in list(mtbl.uclk_frequency_table)]} MHz", "mtbl_socclk_frequency_table": f"{[round(x*fraction_uq10 ,3) for x in list(mtbl.socclk_frequency_table)]} MHz", "mtbl_vclk_frequency_table": f"{[round(x*fraction_uq10 ,3) for x in list(mtbl.vclk_frequency_table)]} MHz", "mtbl_dclk_frequency_table": f"{[round(x*fraction_uq10 ,3) for x in list(mtbl.dclk_frequency_table)]} MHz", "mtbl_lclk_frequency_table": f"{[round(x*fraction_uq10 ,3) for x in list(mtbl.lclk_frequency_table)]} MHz", "mtbl_cclk_frequency_acc": f"{[round(x*fraction_uq10 ,3) for x in list(mtbl.cclk_frequency_acc)]} GHz", "mtbl_gfxclk_frequency_acc": f"{[round(x*fraction_uq10 ,3) for x in list(mtbl.gfxclk_frequency_acc)]} MHz", "mtbl_gfxclk_frequency": f"{[round(x*fraction_uq10 ,3) for x in list(mtbl.gfxclk_frequency)]} MHz", "mtbl_max_cclk_frequency": f"{round(mtbl.max_cclk_frequency * fraction_uq10 ,3)} GHz", "mtbl_min_cclk_frequency": f"{round(mtbl.min_cclk_frequency * fraction_uq10 ,3)} GHz", "mtbl_max_gfxclk_frequency": f"{round(mtbl.max_gfxclk_frequency * fraction_uq10 ,3)} MHz", "mtbl_min_gfxclk_frequency": f"{round(mtbl.min_gfxclk_frequency * fraction_uq10 ,3)} MHz", "mtbl_max_lclk_dpm_range": mtbl.max_lclk_dpm_range, "mtbl_min_lclk_dpm_range": mtbl.min_lclk_dpm_range, "mtbl_xgmi_width": round(mtbl.xgmi_width * fraction_uq10 ,3), "mtbl_xgmi_bitrate": f"{round(mtbl.xgmi_bitrate * fraction_uq10 ,3)} Gbps", "mtbl_xgmi_read_bandwidth_acc": f"{[round(x*fraction_uq10 ,3) for x in list(mtbl.xgmi_read_bandwidth_acc)]} Gbps", "mtbl_xgmi_write_bandwidth_acc": f"{[round(x*fraction_uq10 ,3) for x in list(mtbl.xgmi_write_bandwidth_acc)]} Gbps", "mtbl_socket_c0_residency": f"{round(mtbl.socket_c0_residency * fraction_uq10 ,3)} %", "mtbl_socket_gfx_busy": f"{round(mtbl.socket_gfx_busy * fraction_uq10 ,3)} %", "mtbl_hbm_bandwidth_utilization": f"{round(mtbl.dram_bandwidth_utilization * fraction_uq10 ,3)} %", "mtbl_socket_c0_residency_acc": round(mtbl.socket_c0_residency_acc * fraction_uq10 ,3), "mtbl_socket_gfx_busy_acc": round(mtbl.socket_gfx_busy_acc * fraction_uq10 ,3), "mtbl_hbm_bandwidth_acc": f"{round(mtbl.dram_bandwidth_acc * fraction_uq10 ,3)} Gbps", "mtbl_max_hbm_bandwidth": f"{round(mtbl.max_dram_bandwidth * fraction_uq10 ,3)} Gbps", "mtbl_dram_bandwidth_utilization_acc": round(mtbl.dram_bandwidth_utilization_acc * fraction_uq10 ,3), "mtbl_pcie_bandwidth_acc": f"{[round(x*fraction_uq10 ,3) for x in list(mtbl.pcie_bandwidth_acc)]} Gbps", "mtbl_prochot_residency_acc": mtbl.prochot_residency_acc, "mtbl_ppt_residency_acc": mtbl.ppt_residency_acc, "mtbl_socket_thm_residency_acc": mtbl.socket_thm_residency_acc, "mtbl_vr_thm_residency_acc": mtbl.vr_thm_residency_acc, "mtbl_hbm_thm_residency_acc": mtbl.hbm_thm_residency_acc, } def amdsmi_first_online_core_on_cpu_socket( processor_handle: processor_handle_t ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) pcore_ind = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_first_online_core_on_cpu_socket( processor_handle, ctypes.byref(pcore_ind)) ) return pcore_ind.value def amdsmi_get_cpu_family(): family = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_cpu_family(ctypes.byref(family)) ) return family.value def amdsmi_get_cpu_model(): model = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_cpu_model(ctypes.byref(model)) ) return model.value def amdsmi_get_cpu_model_name( processor_handle: processor_handle_t ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) cpu_info = amdsmi_wrapper.amdsmi_cpu_info_t() _check_res( amdsmi_wrapper.amdsmi_get_cpu_model_name( processor_handle, cpu_info ) ) return f"{cpu_info.model_name}" def amdsmi_get_cpu_cores_per_socket(sock_count: ctypes.c_uint32): cps = amdsmi_wrapper.amdsmi_sock_info_t() _check_res( amdsmi_wrapper.amdsmi_get_cpu_cores_per_socket(sock_count, cps) ) return {"socket_id": cps.socket_id, "cores_per_socket": cps.cores_per_socket } def amdsmi_get_cpu_socket_count(): sock_count = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_cpu_socket_count(ctypes.byref(sock_count)) ) return sock_count.value def amdsmi_init(flag=AmdSmiInitFlags.INIT_AMD_GPUS): if not isinstance(flag, AmdSmiInitFlags): raise AmdSmiParameterException(flag, AmdSmiInitFlags) _check_res(amdsmi_wrapper.amdsmi_init(flag)) def amdsmi_shut_down(): _check_res(amdsmi_wrapper.amdsmi_shut_down()) def amdsmi_get_processor_type( processor_handle: processor_handle_t, ) -> Dict[str, str]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) dev_type = amdsmi_wrapper.processor_type_t() _check_res( amdsmi_wrapper.amdsmi_get_processor_type( processor_handle, ctypes.byref(dev_type)) ) return { "processor_type": AmdSmiProcessorType(dev_type.value).name } def amdsmi_get_gpu_device_bdf(processor_handle: processor_handle_t) -> str: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) bdf_info = amdsmi_wrapper.amdsmi_bdf_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_device_bdf( processor_handle, ctypes.byref(bdf_info)) ) return _format_bdf(bdf_info) def amdsmi_get_gpu_device_uuid(processor_handle: processor_handle_t) -> str: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) uuid = ctypes.create_string_buffer(AMDSMI_GPU_UUID_SIZE) uuid_length = ctypes.c_uint32() uuid_length.value = AMDSMI_GPU_UUID_SIZE _check_res( amdsmi_wrapper.amdsmi_get_gpu_device_uuid( processor_handle, ctypes.byref(uuid_length), uuid ) ) return uuid.value.decode("utf-8") def amdsmi_get_gpu_enumeration_info(processor_handle: processor_handle_t) -> Dict[str, Any]: """ Retrieves GPU enumeration information including DRM card ID, DRM render ID, HIP ID, and HIP UUID. Parameters: processor_handle (amdsmi_processor_handle_t): The processor handle. Returns: Dict[str, Any]: A dictionary containing the retrieved enumeration information. Raises: AmdSmiParameterException: If the input parameters are invalid. """ # Validate the processor handle if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) # Create an instance of the enumeration info struct enumeration_info = amdsmi_wrapper.amdsmi_enumeration_info_t() # Call the C function to populate the struct status = amdsmi_wrapper.amdsmi_get_gpu_enumeration_info(processor_handle, ctypes.byref(enumeration_info)) # Validate the status result _check_res(status) # Convert the struct fields into a dictionary and return enumeration_info = { "drm_render": _validate_if_max_uint(enumeration_info.drm_render, MaxUIntegerTypes.UINT32_T), "drm_card": _validate_if_max_uint(enumeration_info.drm_card, MaxUIntegerTypes.UINT32_T), "hsa_id": _validate_if_max_uint(enumeration_info.hsa_id, MaxUIntegerTypes.UINT32_T), "hip_id": _validate_if_max_uint(enumeration_info.hip_id, MaxUIntegerTypes.UINT32_T), "hip_uuid": enumeration_info.hip_uuid.decode('utf-8') } return enumeration_info def amdsmi_get_cpu_affinity_with_scope( processor_handle: processor_handle_t, scope: AmdSmiAffinityScope ) -> List[int]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(scope, AmdSmiAffinityScope): raise AmdSmiParameterException(scope, AmdSmiAffinityScope) socket_count = amdsmi_get_cpu_socket_count() sock_info = amdsmi_get_cpu_cores_per_socket(socket_count) core_count = sock_info['cores_per_socket'] size = ctypes.c_uint32(0) size = (socket_count * core_count)/ (ctypes.sizeof(ctypes.c_uint64) * 8) size = int(math.ceil(size)) size = ctypes.c_uint32(size) cpu_set = (ctypes.c_uint64 * size.value)() _check_res( amdsmi_wrapper.amdsmi_get_cpu_affinity_with_scope( processor_handle, size, cpu_set, scope) ) return cpu_set def amdsmi_get_gpu_asic_info( processor_handle: processor_handle_t, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) asic_info_struct = amdsmi_wrapper.amdsmi_asic_info_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_asic_info( processor_handle, ctypes.byref(asic_info_struct)) ) market_name = _pad_hex_value(asic_info_struct.market_name.decode("utf-8"), 4) target_graphics_version = hex(asic_info_struct.target_graphics_version)[2:] subsystem_id = _validate_if_max_uint(asic_info_struct.subsystem_id, MaxUIntegerTypes.UINT32_T) subvendor_id = _validate_if_max_uint(asic_info_struct.subvendor_id, MaxUIntegerTypes.UINT32_T) if isinstance(subsystem_id, int): subsystem_id = _pad_hex_value(hex(subsystem_id), 4) if isinstance(subvendor_id, int): subvendor_id = _pad_hex_value(hex(subvendor_id), 4) asic_info = { "market_name": market_name, "vendor_id": asic_info_struct.vendor_id, "vendor_name": asic_info_struct.vendor_name.decode("utf-8"), "subvendor_id": subvendor_id, "device_id": asic_info_struct.device_id, "rev_id": _pad_hex_value(hex(asic_info_struct.rev_id), 2), "asic_serial": asic_info_struct.asic_serial.decode("utf-8"), "oam_id": _validate_if_max_uint(asic_info_struct.oam_id, MaxUIntegerTypes.UINT32_T), "num_compute_units": _validate_if_max_uint(asic_info_struct.num_of_compute_units, MaxUIntegerTypes.UINT32_T), "target_graphics_version": "gfx" + target_graphics_version, "subsystem_id": subsystem_id, "flags": asic_info_struct.flags } string_values = ["market_name", "vendor_name"] for value in string_values: if not asic_info[value]: asic_info[value] = "N/A" hex_values = ["vendor_id", "device_id"] for value in hex_values: if asic_info[value]: asic_info[value] = hex(asic_info[value]) else: asic_info[value] = "N/A" # Convert asic serial (hex string) to hex output format if asic_info["asic_serial"]: asic_serial_string = asic_info["asic_serial"] asic_serial_hex = int(asic_serial_string, base=16) asic_info["asic_serial"] = str.format("0x{:016X}", asic_serial_hex) else: asic_info["asic_serial"] = "N/A" # Remove commas from vendor name for clean output asic_info["vendor_name"] = asic_info["vendor_name"].replace(',', '') return asic_info def amdsmi_get_gpu_kfd_info( processor_handle: processor_handle_t, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) kfd_info_struct = amdsmi_wrapper.amdsmi_kfd_info_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_kfd_info( processor_handle, ctypes.byref(kfd_info_struct)) ) kfd_info = { "kfd_id": _validate_if_max_uint(kfd_info_struct.kfd_id, MaxUIntegerTypes.UINT64_T), "node_id": _validate_if_max_uint(kfd_info_struct.node_id, MaxUIntegerTypes.UINT32_T), "current_partition_id": _validate_if_max_uint(kfd_info_struct.current_partition_id, MaxUIntegerTypes.UINT32_T) } return kfd_info def amdsmi_get_supported_power_cap( processor_handle: processor_handle_t) ->Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) CONST_AMDSMI_MAX_POWER_SENSORS = 2 sensor_count = ctypes.c_uint32() sensor_ind = (ctypes.c_uint32 * CONST_AMDSMI_MAX_POWER_SENSORS)() sensor_types = (amdsmi_wrapper.amdsmi_power_cap_type_t * CONST_AMDSMI_MAX_POWER_SENSORS)() _check_res( amdsmi_wrapper.amdsmi_get_supported_power_cap( processor_handle, ctypes.byref(sensor_count), sensor_ind, sensor_types ) ) return { "sensor_inds": [sensor_ind[i] for i in range(sensor_count.value)], "sensor_types": [AmdSmiPowerCapType(sensor_types[i]) for i in range(sensor_count.value)] } def amdsmi_get_power_cap_info( processor_handle: processor_handle_t, sensor_ind: int = AmdSmiPowerCapType.PPT0 ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) power_cap_info = amdsmi_wrapper.amdsmi_power_cap_info_t() _check_res( amdsmi_wrapper.amdsmi_get_power_cap_info( processor_handle, sensor_ind, ctypes.byref(power_cap_info) ) ) return {"power_cap": power_cap_info.power_cap, "default_power_cap": power_cap_info.default_power_cap, "dpm_cap": power_cap_info.dpm_cap, "min_power_cap": power_cap_info.min_power_cap, "max_power_cap": power_cap_info.max_power_cap} def _get_name_value(num, data) -> List[Dict[str, int]]: """ Extracts a list of name-value pairs from a ctypes array buffer. This function works around a ctypes array issue where direct field access to the `amdsmi_name_value_t` structure is unreliable. Instead, it uses memory operations to extract the 'name' (a 64-byte char array) and 'value' (a uint64) from each structure in the array. Parameters: num (ctypes.c_uint32): Number of elements in the array. data (ctypes.c_void_p): Pointer to the start of the array buffer containing `amdsmi_name_value_t` structures. Returns: List[Dict[str, int]]: A list of dictionaries, each with keys 'name' (str) and 'value' (int) extracted from the buffer. Workaround: Direct access to the fields of the ctypes array is broken, so the function uses memory alignment and pointer arithmetic to extract the fields manually. """ # Work around ctypes array issue by using memory access # Use 4 byte alignment for amdsmi_name_value_t.name char array, 64=256/4 # Use 8 bytes for amdsmi_name_value_t.value uint64 aligned_name_size = int(AMDSMI_MAX_STRING_LENGTH / 4) value_size_bytes = 8 struct_alignment = aligned_name_size + value_size_bytes # Access name,value field using memory operations since direct access is broken struct_ptr = ctypes.cast(data, ctypes.POINTER(ctypes.c_char * struct_alignment)) results = [] for i in range(num.value): # Offset into structure array current_struct = struct_ptr[i] # Cast address for name member with max chars to read name_ptr = ctypes.cast(ctypes.addressof(current_struct), ctypes.POINTER(ctypes.c_char * AMDSMI_MAX_STRING_LENGTH)) # Data buffer in bytes name_bytes = ctypes.string_at(name_ptr.contents) # Get string name_str = name_bytes.rstrip(b'\x00').decode('utf-8', errors='replace') # Address for value member addr_value = ctypes.addressof(current_struct) + struct_alignment # Cast data buffer to a uint64 int64_ptr = ctypes.cast(addr_value, ctypes.POINTER(ctypes.c_uint64)) # Get value value = int64_ptr.contents.value item = { 'name': name_str, 'value': value } results.append(item) return results def amdsmi_get_gpu_pm_metrics_info( processor_handle: processor_handle_t, ) -> List[Dict[str, Any]]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) pm_metrics = POINTER(amdsmi_wrapper.amdsmi_name_value_t)() num_mets = ctypes.c_uint32(0) _check_res( amdsmi_wrapper.amdsmi_get_gpu_pm_metrics_info( processor_handle, ctypes.byref(pm_metrics), ctypes.byref(num_mets) ) ) results = _get_name_value(num_mets, pm_metrics) # Free the allocated memory amdsmi_wrapper.amdsmi_free_name_value_pairs(pm_metrics) return results def amdsmi_get_gpu_reg_table_info( processor_handle: processor_handle_t, reg_type: AmdSmiRegType ) -> List[Dict[str, Any]]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(reg_type, AmdSmiRegType): raise AmdSmiParameterException(reg_type, AmdSmiRegType) reg_metrics = POINTER(amdsmi_wrapper.amdsmi_name_value_t)() num_regs = ctypes.c_uint32(0) _check_res( amdsmi_wrapper.amdsmi_get_gpu_reg_table_info( processor_handle, reg_type, ctypes.byref(reg_metrics), ctypes.byref(num_regs) ) ) results = _get_name_value(num_regs, reg_metrics) # Free the allocated memory amdsmi_wrapper.amdsmi_free_name_value_pairs(reg_metrics) return results def amdsmi_get_gpu_vram_info( processor_handle: processor_handle_t, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) vram_info = amdsmi_wrapper.amdsmi_vram_info_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_vram_info( processor_handle, ctypes.byref(vram_info)) ) return { "vram_type": vram_info.vram_type, "vram_vendor": vram_info.vram_vendor.decode("utf-8"), "vram_size": vram_info.vram_size, "vram_bit_width": _validate_if_max_uint(vram_info.vram_bit_width, MaxUIntegerTypes.UINT32_T), "vram_max_bandwidth": _validate_if_max_uint(vram_info.vram_max_bandwidth, MaxUIntegerTypes.UINT64_T), } def amdsmi_get_gpu_xgmi_link_status( processor_handle: processor_handle_t, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) status_info = amdsmi_wrapper.amdsmi_xgmi_link_status_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_xgmi_link_status( processor_handle, ctypes.byref(status_info)) ) link_status = [] count = 0 for link in status_info.status: if count == status_info.total_links: break if amdsmi_wrapper.amdsmi_xgmi_link_status_type_t__enumvalues[link] == 'AMDSMI_XGMI_LINK_DISABLE': # XGMI link is disabled link_status.append("X") elif amdsmi_wrapper.amdsmi_xgmi_link_status_type_t__enumvalues[link] == 'AMDSMI_XGMI_LINK_UP': # XGMI Link is up link_status.append("U") elif amdsmi_wrapper.amdsmi_xgmi_link_status_type_t__enumvalues[link] == 'AMDSMI_XGMI_LINK_DOWN': # XGMI Link is down link_status.append("D") else: link_status.append("N/A") count += 1 return_dict = { "status" : link_status, "total_links": status_info.total_links, } return return_dict def amdsmi_get_gpu_cache_info( processor_handle: processor_handle_t, ) -> Dict[str, List]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) cache_info_struct = amdsmi_wrapper.amdsmi_gpu_cache_info_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_cache_info( processor_handle, ctypes.byref(cache_info_struct)) ) cache_info_list = [] for cache_index in range(cache_info_struct.num_cache_types): # Put cache_properties at the start of the dictionary for readability cache_dict = { "cache_properties": [], # This will be a list of strings "cache_size": cache_info_struct.cache[cache_index].cache_size, "cache_level": cache_info_struct.cache[cache_index].cache_level, "max_num_cu_shared": cache_info_struct.cache[cache_index].max_num_cu_shared, "num_cache_instance": cache_info_struct.cache[cache_index].num_cache_instance } # Check against cache properties bitmask cache_properties = cache_info_struct.cache[cache_index].cache_properties data_cache = cache_properties & amdsmi_wrapper.AMDSMI_CACHE_PROPERTY_DATA_CACHE inst_cache = cache_properties & amdsmi_wrapper.AMDSMI_CACHE_PROPERTY_INST_CACHE cpu_cache = cache_properties & amdsmi_wrapper.AMDSMI_CACHE_PROPERTY_CPU_CACHE simd_cache = cache_properties & amdsmi_wrapper.AMDSMI_CACHE_PROPERTY_SIMD_CACHE cache_properties_status = [data_cache, inst_cache, cpu_cache, simd_cache] cache_property_list = [] for cache_property in cache_properties_status: if cache_property: property_name = amdsmi_wrapper.amdsmi_cache_property_type_t__enumvalues[cache_property] property_name = property_name.replace("AMDSMI_CACHE_PROPERTY_", "") cache_property_list.append(property_name) cache_dict["cache_properties"] = cache_property_list cache_info_list.append(cache_dict) if not cache_info_list: raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NO_DATA) return { "cache": cache_info_list } def amdsmi_get_gpu_vbios_info( processor_handle: processor_handle_t, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) vbios_info = amdsmi_wrapper.amdsmi_vbios_info_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_vbios_info( processor_handle, ctypes.byref(vbios_info)) ) boot_firmware = vbios_info.boot_firmware.decode("utf-8") if boot_firmware == "": boot_firmware = "N/A" return { "name": vbios_info.name.decode("utf-8"), "build_date": vbios_info.build_date.decode("utf-8"), "part_number": vbios_info.part_number.decode("utf-8"), "version": vbios_info.version.decode("utf-8"), "boot_firmware": boot_firmware, } def amdsmi_get_gpu_activity( processor_handle: processor_handle_t, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) engine_usage = amdsmi_wrapper.amdsmi_engine_usage_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_activity( processor_handle, ctypes.byref(engine_usage) ) ) activity_dict = { "gfx_activity": engine_usage.gfx_activity, "umc_activity": engine_usage.umc_activity, "mm_activity": engine_usage.mm_activity, } for key, value in activity_dict.items(): if value == 0xFFFF: activity_dict[key] = "N/A" return activity_dict def amdsmi_get_clock_info( processor_handle: processor_handle_t, clock_type: AmdSmiClkType, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(clock_type, AmdSmiClkType): raise AmdSmiParameterException(clock_type, AmdSmiClkType) clock_measure = amdsmi_wrapper.amdsmi_clk_info_t() _check_res( amdsmi_wrapper.amdsmi_get_clock_info( processor_handle, clock_type, ctypes.byref(clock_measure), ) ) dict_ret = { "clk": _validate_if_max_uint(clock_measure.clk, MaxUIntegerTypes.UINT32_T), "min_clk": _validate_if_max_uint(clock_measure.min_clk, MaxUIntegerTypes.UINT32_T), "max_clk": _validate_if_max_uint(clock_measure.max_clk, MaxUIntegerTypes.UINT32_T), "clk_locked": _validate_if_max_uint(clock_measure.clk_locked, MaxUIntegerTypes.UINT8_T, isBool=True), "clk_deep_sleep" : _validate_if_max_uint(clock_measure.clk_deep_sleep, MaxUIntegerTypes.UINT8_T), } return dict_ret def amdsmi_get_gpu_bad_page_info( processor_handle: processor_handle_t, ) -> List[Dict[str, Any]]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) num_pages = ctypes.c_uint32() nullptr = POINTER(amdsmi_wrapper.amdsmi_retired_page_record_t)() _check_res( amdsmi_wrapper.amdsmi_get_gpu_bad_page_info( processor_handle, ctypes.byref(num_pages), nullptr ) ) if num_pages.value == 0: return [] bad_pages_array_type = amdsmi_wrapper.amdsmi_retired_page_record_t * num_pages.value bad_pages = bad_pages_array_type() _check_res( amdsmi_wrapper.amdsmi_get_gpu_bad_page_info( processor_handle, ctypes.byref(num_pages), bad_pages ) ) return _format_bad_page_info(bad_pages, num_pages) def amdsmi_get_gpu_bad_page_threshold( processor_handle: processor_handle_t, ) -> int: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) threshold = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_gpu_bad_page_threshold( processor_handle, ctypes.byref(threshold) ) ) return threshold.value def amdsmi_get_violation_status( processor_handle: processor_handle_t, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) violation_status = amdsmi_wrapper.amdsmi_violation_status_t() _check_res( amdsmi_wrapper.amdsmi_get_violation_status( processor_handle, ctypes.byref(violation_status)) ) dict_return = { "reference_timestamp": _validate_if_max_uint(violation_status.reference_timestamp, MaxUIntegerTypes.UINT64_T), "violation_timestamp": _validate_if_max_uint(violation_status.violation_timestamp, MaxUIntegerTypes.UINT64_T), "acc_counter": _validate_if_max_uint(violation_status.acc_counter, MaxUIntegerTypes.UINT64_T), "acc_prochot_thrm": _validate_if_max_uint(violation_status.acc_prochot_thrm, MaxUIntegerTypes.UINT64_T), "acc_ppt_pwr": _validate_if_max_uint(violation_status.acc_ppt_pwr, MaxUIntegerTypes.UINT64_T), #PVIOL "acc_socket_thrm": _validate_if_max_uint(violation_status.acc_socket_thrm, MaxUIntegerTypes.UINT64_T), #TVIOL "acc_vr_thrm": _validate_if_max_uint(violation_status.acc_vr_thrm, MaxUIntegerTypes.UINT64_T), "acc_hbm_thrm": _validate_if_max_uint(violation_status.acc_hbm_thrm, MaxUIntegerTypes.UINT64_T), "acc_gfx_clk_below_host_limit": _validate_if_max_uint(violation_status.acc_gfx_clk_below_host_limit, MaxUIntegerTypes.UINT64_T), "acc_gfx_clk_below_host_limit_pwr": list(violation_status.acc_gfx_clk_below_host_limit_pwr), "acc_gfx_clk_below_host_limit_thm": list(violation_status.acc_gfx_clk_below_host_limit_thm), "acc_gfx_clk_below_host_limit_total": list(violation_status.acc_gfx_clk_below_host_limit_total), "acc_low_utilization": list(violation_status.acc_low_utilization), "per_prochot_thrm": _validate_if_max_uint(violation_status.per_prochot_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True), "per_ppt_pwr": _validate_if_max_uint(violation_status.per_ppt_pwr, MaxUIntegerTypes.UINT64_T, isActivity=True), #PVIOL "per_socket_thrm": _validate_if_max_uint(violation_status.per_socket_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True), #TVIOL "per_vr_thrm": _validate_if_max_uint(violation_status.per_vr_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True), "per_hbm_thrm": _validate_if_max_uint(violation_status.per_hbm_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True), "per_gfx_clk_below_host_limit": _validate_if_max_uint(violation_status.per_gfx_clk_below_host_limit, MaxUIntegerTypes.UINT64_T, isActivity=True), "per_gfx_clk_below_host_limit_pwr": list(violation_status.per_gfx_clk_below_host_limit_pwr), "per_gfx_clk_below_host_limit_thm": list(violation_status.per_gfx_clk_below_host_limit_thm), "per_gfx_clk_below_host_limit_total": list(violation_status.per_gfx_clk_below_host_limit_total), "per_low_utilization": list(violation_status.per_low_utilization), "active_prochot_thrm": _validate_if_max_uint(violation_status.active_prochot_thrm, MaxUIntegerTypes.UINT8_T, isBool=True), "active_ppt_pwr": _validate_if_max_uint(violation_status.active_ppt_pwr, MaxUIntegerTypes.UINT8_T, isBool=True), #PVIOL "active_socket_thrm": _validate_if_max_uint(violation_status.active_socket_thrm, MaxUIntegerTypes.UINT8_T, isBool=True), #TVIOL "active_vr_thrm": _validate_if_max_uint(violation_status.active_vr_thrm, MaxUIntegerTypes.UINT8_T, isBool=True), "active_hbm_thrm": _validate_if_max_uint(violation_status.active_hbm_thrm, MaxUIntegerTypes.UINT8_T, isBool=True), "active_gfx_clk_below_host_limit": _validate_if_max_uint(violation_status.active_gfx_clk_below_host_limit, MaxUIntegerTypes.UINT8_T, isBool=True), "active_gfx_clk_below_host_limit_pwr": list(violation_status.active_gfx_clk_below_host_limit_pwr), "active_gfx_clk_below_host_limit_thm": list(violation_status.active_gfx_clk_below_host_limit_thm), "active_gfx_clk_below_host_limit_total": list(violation_status.active_gfx_clk_below_host_limit_total), "active_low_utilization": list(violation_status.active_low_utilization), } # Create 2d array with each XCD's stats if 'acc_gfx_clk_below_host_limit_pwr' in dict_return: for xcp_index, xcp_metrics in enumerate(dict_return['acc_gfx_clk_below_host_limit_pwr']): xcp_detail = [] for val in xcp_metrics: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) dict_return['acc_gfx_clk_below_host_limit_pwr'][xcp_index] = xcp_detail if 'acc_gfx_clk_below_host_limit_thm' in dict_return: for xcp_index, xcp_metrics in enumerate(dict_return['acc_gfx_clk_below_host_limit_thm']): xcp_detail = [] for val in xcp_metrics: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) dict_return['acc_gfx_clk_below_host_limit_thm'][xcp_index] = xcp_detail if 'acc_low_utilization' in dict_return: for xcp_index, xcp_metrics in enumerate(dict_return['acc_low_utilization']): xcp_detail = [] for val in xcp_metrics: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) dict_return['acc_low_utilization'][xcp_index] = xcp_detail if 'acc_gfx_clk_below_host_limit_total' in dict_return: for xcp_index, xcp_metrics in enumerate(dict_return['acc_gfx_clk_below_host_limit_total']): xcp_detail = [] for val in xcp_metrics: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) dict_return['acc_gfx_clk_below_host_limit_total'][xcp_index] = xcp_detail if 'per_gfx_clk_below_host_limit_pwr' in dict_return: for xcp_index, xcp_metrics in enumerate(dict_return['per_gfx_clk_below_host_limit_pwr']): xcp_detail = [] for val in xcp_metrics: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T, isActivity=True)) dict_return['per_gfx_clk_below_host_limit_pwr'][xcp_index] = xcp_detail if 'per_gfx_clk_below_host_limit_thm' in dict_return: for xcp_index, xcp_metrics in enumerate(dict_return['per_gfx_clk_below_host_limit_thm']): xcp_detail = [] for val in xcp_metrics: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T, isActivity=True)) dict_return['per_gfx_clk_below_host_limit_thm'][xcp_index] = xcp_detail if 'per_low_utilization' in dict_return: for xcp_index, xcp_metrics in enumerate(dict_return['per_low_utilization']): xcp_detail = [] for val in xcp_metrics: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T, isActivity=True)) dict_return['per_low_utilization'][xcp_index] = xcp_detail if 'per_gfx_clk_below_host_limit_total' in dict_return: for xcp_index, xcp_metrics in enumerate(dict_return['per_gfx_clk_below_host_limit_total']): xcp_detail = [] for val in xcp_metrics: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T, isActivity=True)) dict_return['per_gfx_clk_below_host_limit_total'][xcp_index] = xcp_detail if 'active_gfx_clk_below_host_limit_pwr' in dict_return: for xcp_index, xcp_metrics in enumerate(dict_return['active_gfx_clk_below_host_limit_pwr']): xcp_detail = [] for val in xcp_metrics: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT8_T, isBool=True)) dict_return['active_gfx_clk_below_host_limit_pwr'][xcp_index] = xcp_detail if 'active_gfx_clk_below_host_limit_thm' in dict_return: for xcp_index, xcp_metrics in enumerate(dict_return['active_gfx_clk_below_host_limit_thm']): xcp_detail = [] for val in xcp_metrics: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT8_T, isBool=True)) dict_return['active_gfx_clk_below_host_limit_thm'][xcp_index] = xcp_detail if 'active_low_utilization' in dict_return: for xcp_index, xcp_metrics in enumerate(dict_return['active_low_utilization']): xcp_detail = [] for val in xcp_metrics: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT8_T, isBool=True)) dict_return['active_low_utilization'][xcp_index] = xcp_detail if 'active_gfx_clk_below_host_limit_total' in dict_return: for xcp_index, xcp_metrics in enumerate(dict_return['active_gfx_clk_below_host_limit_total']): xcp_detail = [] for val in xcp_metrics: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT8_T, isBool=True)) dict_return['active_gfx_clk_below_host_limit_total'][xcp_index] = xcp_detail return dict_return def amdsmi_get_gpu_total_ecc_count( processor_handle: processor_handle_t, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) ec = amdsmi_wrapper.amdsmi_error_count_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_total_ecc_count( processor_handle, ctypes.byref(ec) ) ) return { "correctable_count": ec.correctable_count, "uncorrectable_count": ec.uncorrectable_count, "deferred_count": ec.deferred_count, } def amdsmi_get_gpu_cper_entries( processor_handle: processor_handle_t, severity_mask: int, buffer_size: int = 4 * 1048576, cursor: int = 0 ) -> Tuple[Dict[str, Any], int, List[Dict[str, Any]], int]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(severity_mask, int): raise AmdSmiParameterException(severity_mask, int) if not isinstance(buffer_size, int): raise AmdSmiParameterException(buffer_size, int) if not isinstance(cursor, int): raise AmdSmiParameterException(cursor, int) # Allocate a buffer for CPER data. buf = ctypes.create_string_buffer(buffer_size) buf_size = ctypes.c_uint64(buffer_size) num_cper_hdrs = 20 entry_count = ctypes.c_uint64(num_cper_hdrs) cur = ctypes.c_uint64(cursor) # Allocate a pointer for the CPER header array. cper_hdrs_array = (ctypes.POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t) * num_cper_hdrs)() cper_hdrs = ctypes.cast(cper_hdrs_array, ctypes.POINTER(ctypes.POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t))) # Call the underlying AMD-SMI API. status_code = amdsmi_wrapper.amdsmi_get_gpu_cper_entries( processor_handle, ctypes.c_uint32(severity_mask), buf, ctypes.byref(buf_size), cper_hdrs, ctypes.byref(entry_count), ctypes.byref(cur) ) if status_code not in {amdsmi_wrapper.AMDSMI_STATUS_SUCCESS, amdsmi_wrapper.AMDSMI_STATUS_MORE_DATA}: raise AmdSmiLibraryException(status_code) entries = {} cper_data = [] offset = 0 # Iterate over each entry using its variable record_length. for i in range(entry_count.value): entry_address = ctypes.addressof(buf) + offset entry_ptr = ctypes.cast(entry_address, POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t)) # Extract the raw bytes and size of the entry. cper_data.append({ "bytes": list((entry_ptr.contents.record_length * ctypes.c_byte).from_address(entry_address)), "size": entry_ptr.contents.record_length }) # Extract the timestamp fields. year = entry_ptr.contents.timestamp.year if year < 100: # Adjust the year if it's less than 100. year += 2000 formatted_timestamp = ( f"{year:04d}/" f"{entry_ptr.contents.timestamp.month:02d}/" f"{entry_ptr.contents.timestamp.day:02d} " f"{entry_ptr.contents.timestamp.hours:02d}:" f"{entry_ptr.contents.timestamp.minutes:02d}:" f"{entry_ptr.contents.timestamp.seconds:02d}" ) serial_number = "" if isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): try: board_info = amdsmi_get_gpu_board_info(processor_handle) serial_number = board_info.get('product_serial', "") except Exception: serial_number = "" # Create a dictionary for the CPER entry. cper_entry = { "error_severity": amdsmi_wrapper.amdsmi_cper_sev_t__enumvalues.get( entry_ptr.contents.error_severity, "AMDSMI_CPER_SEV_UNUSED" ).replace("AMDSMI_CPER_SEV_", "").lower(), "notify_type": _notifyTypeToString(entry_ptr.contents.notify_type.b), "timestamp": formatted_timestamp, "signature": entry_ptr.contents.signature, "revision": entry_ptr.contents.revision, "signature_end": hex(entry_ptr.contents.signature_end), "sec_cnt": entry_ptr.contents.sec_cnt, "record_length": entry_ptr.contents.record_length, "serial_number": serial_number, "platform_id": entry_ptr.contents.platform_id, "creator_id": entry_ptr.contents.creator_id, "record_id": entry_ptr.contents.record_id, "flags": entry_ptr.contents.flags, "persistence_info": entry_ptr.contents.persistence_info, #"reserved" : entry_ptr.contents.reserved #"cper_valid_bit" : entry_ptr.contents.cper_valid_bits, #"partition_id" : entry_ptr.contents.partition_id, } entries[i] = cper_entry.copy() offset += entry_ptr.contents.record_length # Use the actual record length to advance the offset. return entries, cur.value, cper_data, status_code def amdsmi_get_afids_from_cper( cper_afid_data: bytes ) -> Tuple[List[int], int]: """ Extract AFIDs from a CPER blob. Args: cper_afid_data: raw bytes of a single CPER record. Returns: Tuple[List[int], int]: A tuple containing: - A list of extracted AFIDs. - The total count of AFIDs. """ cper_records = [] # Normalize single blob into a list of records if isinstance(cper_afid_data, bytes): cper_records = [{ "bytes": list(cper_afid_data), "size": len(cper_afid_data) }] elif isinstance(cper_afid_data, List[Dict[str, Any]]): cper_records = cper_afid_data else: raise AmdSmiParameterException(cper_afid_data, bytes) all_afids: List[int] = [] for record in cper_records: if isinstance(record, dict) and "bytes" in record and "size" in record: raw_bytes = bytes(record["bytes"]) record_size = record["size"] else: raise AmdSmiParameterException(record, "dict with keys 'bytes' and 'size' or bytes/bytearray") # Wrap as char* buf = ctypes.create_string_buffer(raw_bytes, record_size) buf_ptr = ctypes.cast(buf, POINTER(ctypes.c_char)) afid_array = (ctypes.c_uint64 * MAX_NUMBER_OF_AFIDS_PER_RECORD)() num_afids_ct = ctypes.c_uint32(MAX_NUMBER_OF_AFIDS_PER_RECORD) # Call the wrapper function status = amdsmi_wrapper.amdsmi_get_afids_from_cper( buf_ptr, ctypes.c_uint32(record_size), afid_array, ctypes.byref(num_afids_ct) ) if status != amdsmi_wrapper.AMDSMI_STATUS_SUCCESS: raise AmdSmiLibraryException(status) # Collect exactly the decoded AFIDs count = num_afids_ct.value all_afids.extend(afid_array[i] for i in range(count)) return all_afids, len(all_afids) def amdsmi_get_gpu_board_info( processor_handle: processor_handle_t, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) board_info = amdsmi_wrapper.amdsmi_board_info_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_board_info( processor_handle, ctypes.byref(board_info)) ) board_info_dict = { "model_number": _pad_hex_value(board_info.model_number.decode("utf-8").strip(), 4), "product_serial": board_info.product_serial.decode("utf-8").strip(), "fru_id": board_info.fru_id.decode("utf-8").strip(), "product_name": _pad_hex_value(board_info.product_name.decode("utf-8").strip(), 4), "manufacturer_name": board_info.manufacturer_name.decode("utf-8").strip() } for key, value in board_info_dict.items(): if value == "": board_info_dict[key] = "N/A" return board_info_dict def amdsmi_get_gpu_ras_feature_info( processor_handle: processor_handle_t, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) ras_feature = amdsmi_wrapper.amdsmi_ras_feature_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_ras_feature_info( processor_handle, ctypes.byref(ras_feature) ) ) return { "eeprom_version": hex(ras_feature.ras_eeprom_version), "parity_schema" : bool(ras_feature.ecc_correction_schema_flag & 1), "single_bit_schema" : bool(ras_feature.ecc_correction_schema_flag & 2), "double_bit_schema" : bool(ras_feature.ecc_correction_schema_flag & 4), "poison_schema" : bool(ras_feature.ecc_correction_schema_flag & 8) } def amdsmi_get_gpu_ras_block_features_enabled( processor_handle: processor_handle_t, ) -> List[Dict[str, Any]]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) ras_state = amdsmi_wrapper.amdsmi_ras_err_state_t() ras_states = [] for gpu_block in AmdSmiGpuBlock: if gpu_block.name == "RESERVED" or gpu_block.name == "INVALID": continue gpu_block_name = gpu_block.name if gpu_block.name == "LAST": gpu_block_name = "MPIO" _check_res( amdsmi_wrapper.amdsmi_get_gpu_ras_block_features_enabled( processor_handle, amdsmi_wrapper.amdsmi_gpu_block_t(gpu_block.value), ctypes.byref(ras_state), ) ) ras_states.append( { "block": gpu_block_name, "status": AmdSmiRasErrState(ras_state.value).name, } ) return ras_states def amdsmi_get_gpu_process_list( processor_handle: processor_handle_t, ) -> List[amdsmi_wrapper.amdsmi_proc_info_t]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) # This will get populated with the number of processes found max_processes = ctypes.c_uint32(MAX_NUM_PROCESSES) process_list = (amdsmi_wrapper.amdsmi_proc_info_t * max_processes.value)() _check_res( amdsmi_wrapper.amdsmi_get_gpu_process_list( processor_handle, ctypes.byref(max_processes), process_list ) ) result = [] for index in range(max_processes.value): process_name = process_list[index].name.decode("utf-8").strip() if process_name == "": process_name = "N/A" result.append({ "name": process_name, "pid": process_list[index].pid, "mem": process_list[index].mem, "engine_usage": { "gfx": process_list[index].engine_usage.gfx, "enc": process_list[index].engine_usage.enc }, "memory_usage": { "gtt_mem": process_list[index].memory_usage.gtt_mem, "cpu_mem": process_list[index].memory_usage.cpu_mem, "vram_mem": process_list[index].memory_usage.vram_mem, }, "cu_occupancy": _validate_if_max_uint(process_list[index].cu_occupancy, MaxUIntegerTypes.UINT32_T), "evicted_time": _validate_if_max_uint(process_list[index].evicted_time, MaxUIntegerTypes.UINT32_T) }) return result def amdsmi_get_gpu_driver_info( processor_handle: processor_handle_t, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) info = amdsmi_wrapper.amdsmi_driver_info_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_driver_info( processor_handle, ctypes.byref(info) ) ) # Not including os_kernel_version here due to it just being os.uname().release driver_info = { "driver_name": info.driver_name.decode("utf-8"), "driver_version": info.driver_version.decode("utf-8"), "driver_date": info.driver_date.decode("utf-8") } for key, value in driver_info.items(): if value == "": driver_info[key] = "N/A" return driver_info def amdsmi_get_power_info( processor_handle: processor_handle_t ) -> Dict[str, ctypes.c_uint32]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) power_info = amdsmi_wrapper.amdsmi_power_info_t() _check_res( amdsmi_wrapper.amdsmi_get_power_info( processor_handle, ctypes.byref(power_info) ) ) power_info_dict = { "socket_power": power_info.socket_power, "current_socket_power": power_info.current_socket_power, "average_socket_power": power_info.average_socket_power, "gfx_voltage": power_info.gfx_voltage, "soc_voltage": power_info.soc_voltage, "mem_voltage": power_info.mem_voltage, "power_limit" : power_info.power_limit, } for key, value in power_info_dict.items(): if value in (MaxUIntegerTypes.UINT8_T, MaxUIntegerTypes.UINT16_T, MaxUIntegerTypes.UINT32_T, MaxUIntegerTypes.UINT64_T): power_info_dict[key] = "N/A" return power_info_dict def amdsmi_is_gpu_power_management_enabled( processor_handle: processor_handle_t ) -> bool: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException(processor_handle, amdsmi_wrapper.amdsmi_processor_handle) is_power_management_enabled = ctypes.c_bool() _check_res( amdsmi_wrapper.amdsmi_is_gpu_power_management_enabled( processor_handle, ctypes.byref(is_power_management_enabled) ) ) return is_power_management_enabled.value def amdsmi_get_fw_info( processor_handle: processor_handle_t ) -> Dict[str, List[Dict[str, str]]]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle) fw_info = amdsmi_wrapper.amdsmi_fw_info_t() _check_res( amdsmi_wrapper.amdsmi_get_fw_info( processor_handle, ctypes.byref(fw_info) ) ) # Certain FW blocks are padded with 0s in the front intentionally # But the C library converts the hex to an integer which trims the leading 0s # Nor do we have a flag that defines the expected format for each FW block # We can expect the following blocks to have a padded value and a specified format hex_format_fw = [AmdSmiFwBlock.AMDSMI_FW_ID_PSP_SOSDRV, AmdSmiFwBlock.AMDSMI_FW_ID_TA_RAS, AmdSmiFwBlock.AMDSMI_FW_ID_TA_XGMI, AmdSmiFwBlock.AMDSMI_FW_ID_UVD, AmdSmiFwBlock.AMDSMI_FW_ID_VCE, AmdSmiFwBlock.AMDSMI_FW_ID_VCN] # PM(AKA: SMC) firmware's hex value looks like 0x12345678 # However, they are parsed as: int(0x12).int(0x34).int(0x56).int(0x78) # Which results in the following: 12.34.56.78 dec_format_fw = [AmdSmiFwBlock.AMDSMI_FW_ID_PM, AmdSmiFwBlock.AMDSMI_FW_ID_PLDM_BUNDLE] firmwares = [] for i in range(0, fw_info.num_fw_info): fw_name = AmdSmiFwBlock(fw_info.fw_info_list[i].fw_id) fw_version = fw_info.fw_info_list[i].fw_version # This is in int format (base 10) if fw_name in hex_format_fw: # Convert the fw_version from a int to a hex string padded leading 0s fw_version_string = hex(fw_version)[2:].zfill(8) # Join every two hex digits with a dot fw_version_string = ".".join(re.findall('..?', fw_version_string)) elif fw_name in dec_format_fw: # Convert the fw_version from a int to a hex string padded leading 0s fw_version_string = hex(fw_version)[2:].zfill(8) # Convert every two hex digits to decimal and join them with a dot dec_version_string = '' for index, _ in enumerate(fw_version_string): if index % 2 != 0: continue hex_digits = f"0x{fw_version_string[index:index+2]}" dec_version_string += str(int(hex_digits, 16)).zfill(2) + "." fw_version_string = dec_version_string.strip('.') else: fw_version_string = str(fw_version) firmwares.append({ 'fw_name': fw_name, 'fw_version': fw_version_string.upper(), }) return {'fw_list': firmwares} def amdsmi_get_gpu_vram_usage( processor_handle: processor_handle_t, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) vram_usage = amdsmi_wrapper.amdsmi_vram_usage_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_vram_usage( processor_handle, ctypes.byref(vram_usage)) ) return {"vram_total": vram_usage.vram_total, "vram_used": vram_usage.vram_used} def amdsmi_get_pcie_info( processor_handle: processor_handle_t, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) pcie_info = amdsmi_wrapper.amdsmi_pcie_info_t() _check_res( amdsmi_wrapper.amdsmi_get_pcie_info( processor_handle, ctypes.byref(pcie_info) ) ) pcie_info_dict = { "pcie_static": { "max_pcie_width": _validate_if_max_uint(pcie_info.pcie_static.max_pcie_width, MaxUIntegerTypes.UINT16_T), "max_pcie_speed": _validate_if_max_uint(pcie_info.pcie_static.max_pcie_speed, MaxUIntegerTypes.UINT32_T), "pcie_interface_version": _validate_if_max_uint(pcie_info.pcie_static.pcie_interface_version, MaxUIntegerTypes.UINT32_T), "slot_type": pcie_info.pcie_static.slot_type, }, "pcie_metric": { "pcie_width": _validate_if_max_uint(pcie_info.pcie_metric.pcie_width, MaxUIntegerTypes.UINT16_T), "pcie_speed": _validate_if_max_uint(pcie_info.pcie_metric.pcie_speed, MaxUIntegerTypes.UINT32_T), "pcie_bandwidth": _validate_if_max_uint(pcie_info.pcie_metric.pcie_bandwidth, MaxUIntegerTypes.UINT32_T), "pcie_replay_count": _validate_if_max_uint(pcie_info.pcie_metric.pcie_replay_count, MaxUIntegerTypes.UINT64_T), "pcie_l0_to_recovery_count": _validate_if_max_uint(pcie_info.pcie_metric.pcie_l0_to_recovery_count, MaxUIntegerTypes.UINT64_T), "pcie_replay_roll_over_count": _validate_if_max_uint(pcie_info.pcie_metric.pcie_replay_roll_over_count, MaxUIntegerTypes.UINT64_T), "pcie_nak_sent_count": _validate_if_max_uint(pcie_info.pcie_metric.pcie_nak_sent_count, MaxUIntegerTypes.UINT64_T), "pcie_nak_received_count": _validate_if_max_uint(pcie_info.pcie_metric.pcie_nak_received_count, MaxUIntegerTypes.UINT64_T), "pcie_lc_perf_other_end_recovery_count": _validate_if_max_uint(pcie_info.pcie_metric.pcie_lc_perf_other_end_recovery_count, MaxUIntegerTypes.UINT32_T) } } slot_type = pcie_info_dict['pcie_static']['slot_type'] if isinstance(slot_type, int): slot_types = amdsmi_wrapper.amdsmi_card_form_factor_t__enumvalues if slot_type in slot_types: pcie_info_dict['pcie_static']['slot_type'] = slot_types[slot_type].replace("AMDSMI_CARD_FORM_FACTOR_", "") else: pcie_info_dict['pcie_static']['slot_type'] = "Unknown" else: pcie_info_dict['pcie_static']['slot_type'] = "N/A" return pcie_info_dict def amdsmi_get_gpu_xcd_counter(processor_handle: processor_handle_t) -> int: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException(processor_handle, amdsmi_wrapper.amdsmi_processor_handle) xcd_counter = ctypes.c_uint16() _check_res( amdsmi_wrapper.amdsmi_get_gpu_xcd_counter( processor_handle, ctypes.byref(xcd_counter) ) ) return xcd_counter.value def amdsmi_get_processor_handle_from_bdf(bdf): bdf = _parse_bdf(bdf) if bdf is None: raise AmdSmiBdfFormatException(bdf) amdsmi_bdf = _make_amdsmi_bdf_from_list(bdf) processor_handle = amdsmi_wrapper.amdsmi_processor_handle() _check_res(amdsmi_wrapper.amdsmi_get_processor_handle_from_bdf( amdsmi_bdf, ctypes.byref(processor_handle))) return processor_handle def amdsmi_get_gpu_vendor_name( processor_handle: processor_handle_t, ) -> str: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) length = ctypes.c_uint64() length.value = _AMDSMI_STRING_LENGTH vendor_name = ctypes.create_string_buffer(_AMDSMI_STRING_LENGTH) _check_res( amdsmi_wrapper.amdsmi_get_gpu_vendor_name( processor_handle, vendor_name, length) ) return vendor_name.value.decode("utf-8") def amdsmi_get_gpu_id(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) gpu_id_16 = ctypes.c_uint16() _check_res(amdsmi_wrapper.amdsmi_get_gpu_id( processor_handle, ctypes.byref(gpu_id_16))) return gpu_id_16.value def amdsmi_get_gpu_vram_vendor(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) length = ctypes.c_uint32() length.value = _AMDSMI_STRING_LENGTH vram_vendor = ctypes.create_string_buffer(_AMDSMI_STRING_LENGTH) _check_res( amdsmi_wrapper.amdsmi_get_gpu_vram_vendor( processor_handle, vram_vendor, length) ) return vram_vendor.value.decode("utf-8") def amdsmi_get_gpu_subsystem_id(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) subsystem_id_16 = ctypes.c_uint16() _check_res( amdsmi_wrapper.amdsmi_get_gpu_subsystem_id( processor_handle, ctypes.byref(subsystem_id_16)) ) return _pad_hex_value(hex(subsystem_id_16.value), 4) def amdsmi_get_gpu_subsystem_name(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) length = ctypes.c_uint64() length.value = _AMDSMI_STRING_LENGTH name = ctypes.create_string_buffer(_AMDSMI_STRING_LENGTH) _check_res( amdsmi_wrapper.amdsmi_get_gpu_subsystem_name( processor_handle, name, length) ) return name.value.decode("utf-8") def amdsmi_get_lib_version(): version = amdsmi_wrapper.amdsmi_version_t() _check_res(amdsmi_wrapper.amdsmi_get_lib_version(ctypes.byref(version))) return { "major": version.major, "minor": version.minor, "release": version.release, "build": version.build.contents.value.decode("utf-8") } def amdsmi_topo_get_numa_node_number( processor_handle: processor_handle_t, ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) numa_node_number = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_topo_get_numa_node_number( processor_handle, ctypes.byref(numa_node_number) ) ) return numa_node_number.value def amdsmi_topo_get_link_weight( processor_handle_src: processor_handle_t, processor_handle_dst: processor_handle_t ): if not isinstance(processor_handle_src, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle_src, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(processor_handle_dst, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle_dst, amdsmi_wrapper.amdsmi_processor_handle ) weight = ctypes.c_uint64() _check_res( amdsmi_wrapper.amdsmi_topo_get_link_weight( processor_handle_src, processor_handle_dst, ctypes.byref(weight) ) ) return weight.value def amdsmi_get_minmax_bandwidth_between_processors( processor_handle_src: processor_handle_t, processor_handle_dst: processor_handle_t, ): if not isinstance(processor_handle_src, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle_src, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(processor_handle_dst, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle_dst, amdsmi_wrapper.amdsmi_processor_handle ) min_bandwidth = ctypes.c_uint64() max_bandwidth = ctypes.c_uint64() _check_res( amdsmi_wrapper.amdsmi_get_minmax_bandwidth_between_processors( processor_handle_src, processor_handle_dst, ctypes.byref(min_bandwidth), ctypes.byref(max_bandwidth), ) ) return {"min_bandwidth": min_bandwidth.value, "max_bandwidth": max_bandwidth.value} def amdsmi_get_link_metrics(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) link_metrics = amdsmi_wrapper.amdsmi_link_metrics_t() _check_res( amdsmi_wrapper.amdsmi_get_link_metrics( processor_handle, ctypes.byref(link_metrics) ) ) links = [] for i in range(AMDSMI_MAX_NUM_XGMI_LINKS): link = link_metrics.links[i] links.append({ "bdf": _format_bdf(link.bdf), "bit_rate": link.bit_rate, "max_bandwidth": link.max_bandwidth, "link_type": link.link_type, "read": link.read, "write": link.write, }) return { "num_links": link_metrics.num_links, "links": links } def amdsmi_topo_get_link_type( processor_handle_src: processor_handle_t, processor_handle_dst: processor_handle_t, ): if not isinstance(processor_handle_src, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle_src, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(processor_handle_dst, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle_dst, amdsmi_wrapper.amdsmi_processor_handle ) hops_64 = ctypes.c_uint64() type_32 = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_topo_get_link_type( processor_handle_src, processor_handle_dst, ctypes.byref(hops_64), ctypes.byref(type_32) ) ) return {"hops": hops_64.value, "type": type_32.value} def amdsmi_topo_get_p2p_status( processor_handle_src: processor_handle_t, processor_handle_dst: processor_handle_t, ): if not isinstance(processor_handle_src, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle_src, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(processor_handle_dst, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle_dst, amdsmi_wrapper.amdsmi_processor_handle ) type_32 = ctypes.c_uint32() cap = amdsmi_wrapper.struct_amdsmi_p2p_capability_t() _check_res( amdsmi_wrapper.amdsmi_topo_get_p2p_status( processor_handle_src, processor_handle_dst, ctypes.byref(type_32), ctypes.byref(cap) ) ) return { 'type' : type, 'cap': { 'is_iolink_coherent': cap.is_iolink_coherent, 'is_iolink_atomics_32bit': cap.is_iolink_atomics_32bit, 'is_iolink_atomics_64bit': cap.is_iolink_atomics_64bit, 'is_iolink_dma': cap.is_iolink_dma, 'is_iolink_bi_directional': cap.is_iolink_bi_directional } } def amdsmi_is_P2P_accessible( processor_handle_src: processor_handle_t, processor_handle_dst: processor_handle_t, ): if not isinstance(processor_handle_src, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle_src, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(processor_handle_dst, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle_dst, amdsmi_wrapper.amdsmi_processor_handle ) accessible = ctypes.c_bool() _check_res( amdsmi_wrapper.amdsmi_is_P2P_accessible( processor_handle_src, processor_handle_dst, ctypes.byref(accessible) ) ) return accessible.value def amdsmi_get_gpu_compute_partition(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) length = ctypes.c_uint32() length.value = _AMDSMI_STRING_LENGTH compute_partition = ctypes.create_string_buffer(_AMDSMI_STRING_LENGTH) _check_res( amdsmi_wrapper.amdsmi_get_gpu_compute_partition( processor_handle, compute_partition, length ) ) return compute_partition.value.decode("utf-8") def amdsmi_set_gpu_compute_partition(processor_handle: processor_handle_t, compute_partition: AmdSmiComputePartitionType): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(compute_partition, AmdSmiComputePartitionType): raise AmdSmiParameterException(compute_partition, AmdSmiComputePartitionType) _check_res( amdsmi_wrapper.amdsmi_set_gpu_compute_partition( processor_handle, compute_partition ) ) def amdsmi_set_gpu_accelerator_partition_profile(processor_handle: processor_handle_t, profile_index: int): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(profile_index, int): raise AmdSmiParameterException(profile_index, int) _check_res( amdsmi_wrapper.amdsmi_set_gpu_accelerator_partition_profile( processor_handle, profile_index ) ) def amdsmi_get_gpu_memory_partition(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) length = ctypes.c_uint32() length.value = _AMDSMI_STRING_LENGTH memory_partition = ctypes.create_string_buffer(_AMDSMI_STRING_LENGTH) _check_res( amdsmi_wrapper.amdsmi_get_gpu_memory_partition( processor_handle, memory_partition, length ) ) return memory_partition.value.decode("utf-8") def amdsmi_get_gpu_memory_partition_config(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) config = amdsmi_wrapper.amdsmi_memory_partition_config_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_memory_partition_config( processor_handle, config ) ) mem_caps_list = [] if config.partition_caps.nps_flags.nps1_cap == 1: mem_caps_list.append("NPS1") if config.partition_caps.nps_flags.nps2_cap == 1: mem_caps_list.append("NPS2") if config.partition_caps.nps_flags.nps4_cap == 1: mem_caps_list.append("NPS4") if config.partition_caps.nps_flags.nps8_cap == 1: mem_caps_list.append("NPS8") if (config.partition_caps.nps_flags.nps1_cap == 0 and config.partition_caps.nps_flags.nps2_cap == 0 and config.partition_caps.nps_flags.nps4_cap == 0 and config.partition_caps.nps_flags.nps8_cap == 0): mem_caps_list.append("N/A") return_dict = { "partition_caps": mem_caps_list, "mp_mode": amdsmi_wrapper.amdsmi_memory_partition_type_t__enumvalues[ config.mp_mode].replace("AMDSMI_MEMORY_PARTITION_", "").replace("UNKNOWN", "N/A"), "num_numa_ranges": "N/A", "numa_range": "N/A", } return return_dict def amdsmi_set_gpu_memory_partition(processor_handle: processor_handle_t, memory_partition: AmdSmiMemoryPartitionType): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(memory_partition, AmdSmiMemoryPartitionType): raise AmdSmiParameterException(memory_partition, AmdSmiMemoryPartitionType) _check_res( amdsmi_wrapper.amdsmi_set_gpu_memory_partition( processor_handle, memory_partition ) ) def amdsmi_set_gpu_memory_partition_mode(processor_handle: processor_handle_t, memory_partition: AmdSmiMemoryPartitionType): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(memory_partition, AmdSmiMemoryPartitionType): raise AmdSmiParameterException(memory_partition, AmdSmiMemoryPartitionType) _check_res( amdsmi_wrapper.amdsmi_set_gpu_memory_partition( processor_handle, memory_partition ) ) def amdsmi_get_gpu_accelerator_partition_profile( processor_handle: processor_handle_t ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) exception_caught = False return_dictionary = {} length = 8 partition_id = [0, 0, 0, 0, 0, 0, 0, 0] partition_id_list = (ctypes.c_uint32 * length)(*partition_id) profile = amdsmi_wrapper.amdsmi_accelerator_partition_profile_t() partition_ids = [] kPOSITION_OF_PARTITION_ID = 0 ret = amdsmi_wrapper.amdsmi_get_gpu_accelerator_partition_profile(processor_handle, ctypes.byref(profile), partition_id_list) if ret == amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED: #partition_id[0] will contain the partition id of each device #BM/Guest will include this logic. Host will only display primary partition ids. partition_ids.append(partition_id_list[kPOSITION_OF_PARTITION_ID]) try: _check_res(ret) except AmdSmiException as e: partition_profile_dict = { "profile_type" : "N/A", "num_partitions" : "N/A", "profile_index" : "N/A", "memory_caps": "N/A", "num_resources" : "N/A", "resources" : "N/A" } return_dictionary = { "partition_id" : partition_ids, "partition_profile" : partition_profile_dict } if ret == amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED: exception_caught = True else: _check_res(ret) # re-raise the exception if error is anything other than AMDSMI_STATUS_NOT_SUPPORTED # this ensures we can get partition ID even if the profile is not supported. finally: if not exception_caught: profile_type_ret = amdsmi_wrapper.amdsmi_accelerator_partition_type_t__enumvalues[profile.profile_type].replace("AMDSMI_ACCELERATOR_PARTITION_", "") profile_type_ret = profile_type_ret.replace("INVALID", "N/A") length = profile.num_partitions #partition_id[0] will contain the partition id of each device #BM/Guest will include this logic. Host will only display primary partition ids. partition_ids.append(partition_id_list[kPOSITION_OF_PARTITION_ID]) mem_caps_list = [] if profile.memory_caps.nps_flags.nps1_cap == 1: mem_caps_list.append("NPS1") if profile.memory_caps.nps_flags.nps2_cap == 1: mem_caps_list.append("NPS2") if profile.memory_caps.nps_flags.nps4_cap == 1: mem_caps_list.append("NPS4") if profile.memory_caps.nps_flags.nps8_cap == 1: mem_caps_list.append("NPS8") if (profile.memory_caps.nps_flags.nps1_cap == 0 and profile.memory_caps.nps_flags.nps2_cap == 0 and profile.memory_caps.nps_flags.nps4_cap == 0 and profile.memory_caps.nps_flags.nps8_cap == 0): mem_caps_list.append("N/A") partition_profile_dict = { "profile_type" : profile_type_ret, "num_partitions" : profile.num_partitions, "profile_index" : profile.profile_index, "memory_caps": mem_caps_list, "num_resources" : profile.num_resources, "resources" : "N/A" } return_dictionary = { "partition_id" : partition_ids, "partition_profile" : partition_profile_dict } return return_dictionary def amdsmi_get_gpu_accelerator_partition_profile_config(processor_handle: processor_handle_t) -> Dict: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) config = amdsmi_wrapper.amdsmi_accelerator_partition_profile_config_t() _check_res(amdsmi_wrapper.amdsmi_get_gpu_accelerator_partition_profile_config(processor_handle, ctypes.byref(config))) profiles = [] resources = [] resource_idx = 0 for i in range(config.num_profiles): profile = config.profiles[i] profile_type_ret = amdsmi_wrapper.amdsmi_accelerator_partition_type_t__enumvalues[ config.profiles[i].profile_type].replace("AMDSMI_ACCELERATOR_PARTITION_", "") profile_type_ret = profile_type_ret.replace("INVALID", "N/A") mem_caps_list = [] if profile.memory_caps.nps_flags.nps1_cap == 1: mem_caps_list.append("NPS1") if profile.memory_caps.nps_flags.nps2_cap == 1: mem_caps_list.append("NPS2") if profile.memory_caps.nps_flags.nps4_cap == 1: mem_caps_list.append("NPS4") if profile.memory_caps.nps_flags.nps8_cap == 1: mem_caps_list.append("NPS8") if (profile.memory_caps.nps_flags.nps1_cap == 0 and profile.memory_caps.nps_flags.nps2_cap == 0 and profile.memory_caps.nps_flags.nps4_cap == 0 and profile.memory_caps.nps_flags.nps8_cap == 0): mem_caps_list.append("N/A") resources = [] for _ in range(config.num_resource_profiles): res_profile = config.resource_profiles[resource_idx] resource_profiles_ret = amdsmi_wrapper.amdsmi_accelerator_partition_resource_type_t__enumvalues[ res_profile.resource_type].replace("AMDSMI_ACCELERATOR_", "") resource_profile_dict = { "profile_index": res_profile.profile_index, "resource_type": resource_profiles_ret, "partition_resource": res_profile.partition_resource, "num_partitions_share_resource": res_profile.num_partitions_share_resource, } resources.append(resource_profile_dict) resource_idx += 1 profile_dict = { "profile_type": profile_type_ret, "num_partitions": profile.num_partitions, "profile_index": profile.profile_index, "memory_caps": mem_caps_list, "num_resources": profile.num_resources, "resources": resources } profiles.append(profile_dict) config_dict = { "num_profiles": config.num_profiles, "num_resource_profiles": config.num_resource_profiles, "resource_profiles": resources, "default_profile_index": config.default_profile_index, "profiles": profiles, } return config_dict def amdsmi_get_xgmi_info(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) xgmi_info = amdsmi_wrapper.amdsmi_xgmi_info_t() _check_res(amdsmi_wrapper.amdsmi_get_xgmi_info(processor_handle, xgmi_info)) return { "xgmi_lanes": xgmi_info.xgmi_lanes, "xgmi_hive_id": xgmi_info.xgmi_hive_id, "xgmi_node_id": xgmi_info.xgmi_node_id, "index": xgmi_info.index, } def amdsmi_gpu_counter_group_supported( processor_handle: processor_handle_t, event_group: AmdSmiEventGroup, ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(event_group, AmdSmiEventGroup): raise AmdSmiParameterException(event_group, AmdSmiEventGroup) _check_res( amdsmi_wrapper.amdsmi_gpu_counter_group_supported( processor_handle, event_group) ) def amdsmi_gpu_create_counter( processor_handle: processor_handle_t, event_type: AmdSmiEventType, ) -> amdsmi_wrapper.amdsmi_event_handle_t: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(event_type, AmdSmiEventType): raise AmdSmiParameterException(event_type, AmdSmiEventType) event_handle = amdsmi_wrapper.amdsmi_event_handle_t() _check_res( amdsmi_wrapper.amdsmi_gpu_create_counter( processor_handle, event_type, ctypes.byref(event_handle) ) ) return event_handle def amdsmi_gpu_destroy_counter(event_handle: amdsmi_wrapper.amdsmi_event_handle_t) -> None: if not isinstance(event_handle, amdsmi_wrapper.amdsmi_event_handle_t): raise AmdSmiParameterException(event_handle, amdsmi_wrapper.amdsmi_event_handle_t) _check_res(amdsmi_wrapper.amdsmi_gpu_destroy_counter(event_handle)) def amdsmi_gpu_control_counter( event_handle: amdsmi_wrapper.amdsmi_event_handle_t, counter_command: AmdSmiCounterCommand, ): if not isinstance(event_handle, amdsmi_wrapper.amdsmi_event_handle_t): raise AmdSmiParameterException(event_handle, amdsmi_wrapper.amdsmi_event_handle_t) if not isinstance(counter_command, AmdSmiCounterCommand): raise AmdSmiParameterException(counter_command, AmdSmiCounterCommand) event_handle_value = amdsmi_wrapper.amdsmi_event_handle_t(event_handle.value) command_args = ctypes.c_void_p() _check_res( amdsmi_wrapper.amdsmi_gpu_control_counter( event_handle_value, counter_command, command_args ) ) def amdsmi_gpu_read_counter( event_handle: amdsmi_wrapper.amdsmi_event_handle_t, ) -> Dict[str, Any]: if not isinstance(event_handle, amdsmi_wrapper.amdsmi_event_handle_t): raise AmdSmiParameterException(event_handle, amdsmi_wrapper.amdsmi_event_handle_t) counter_value = amdsmi_wrapper.amdsmi_counter_value_t() _check_res( amdsmi_wrapper.amdsmi_gpu_read_counter( event_handle, ctypes.byref(counter_value)) ) return { "value": counter_value.value, "time_enabled": counter_value.time_enabled, "time_running": counter_value.time_running, } def amdsmi_get_gpu_available_counters( processor_handle: processor_handle_t, event_group: AmdSmiEventGroup, ) -> int: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(event_group, AmdSmiEventGroup): raise AmdSmiParameterException(event_group, AmdSmiEventGroup) available = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_gpu_available_counters( processor_handle, event_group, ctypes.byref(available) ) ) return available.value def amdsmi_set_gpu_perf_level( processor_handle: processor_handle_t, perf_level: AmdSmiDevPerfLevel, ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(perf_level, AmdSmiDevPerfLevel): raise AmdSmiParameterException(perf_level, AmdSmiDevPerfLevel) _check_res(amdsmi_wrapper.amdsmi_set_gpu_perf_level( processor_handle, perf_level)) def amdsmi_reset_gpu(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) _check_res(amdsmi_wrapper.amdsmi_reset_gpu(processor_handle)) def amdsmi_gpu_driver_reload(): _check_res(amdsmi_wrapper.amdsmi_gpu_driver_reload()) def amdsmi_set_gpu_fan_speed( processor_handle: processor_handle_t, sensor_idx: int, fan_speed: int ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(sensor_idx, int): raise AmdSmiParameterException(sensor_idx, int) if not isinstance(fan_speed, int): raise AmdSmiParameterException(fan_speed, int) sensor_idx_32 = ctypes.c_uint32(sensor_idx) fan_speed_64 = ctypes.c_uint64(fan_speed) _check_res( amdsmi_wrapper.amdsmi_set_gpu_fan_speed( processor_handle, sensor_idx_32, fan_speed_64) ) def amdsmi_reset_gpu_fan( processor_handle: processor_handle_t, sensor_idx: int ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(sensor_idx, int): raise AmdSmiParameterException(sensor_idx, int) sensor_idx_32 = ctypes.c_uint32(sensor_idx) _check_res(amdsmi_wrapper.amdsmi_reset_gpu_fan(processor_handle, sensor_idx_32)) def amdsmi_set_clk_freq( processor_handle: processor_handle_t, clk_type: str, freq_bitmask: int, ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(clk_type, str): raise AmdSmiParameterException(clk_type, str) if not isinstance(freq_bitmask, int): raise AmdSmiParameterException(freq_bitmask, int) if clk_type.lower() == "sclk": clk_type_conversion = AmdSmiClkType.SYS elif clk_type.lower() == "mclk": clk_type_conversion = AmdSmiClkType.MEM elif clk_type.lower() == "fclk": clk_type_conversion = AmdSmiClkType.DF elif clk_type.lower() == "socclk": clk_type_conversion = AmdSmiClkType.SOC else: clk_type_conversion = "N/A" if not isinstance(clk_type_conversion, AmdSmiClkType): raise AmdSmiParameterException(clk_type_conversion, AmdSmiClkType) freq_bitmask_64 = ctypes.c_uint64(freq_bitmask) _check_res( amdsmi_wrapper.amdsmi_set_clk_freq( processor_handle, clk_type_conversion, freq_bitmask_64 ) ) def amdsmi_set_soc_pstate( processor_handle: processor_handle_t, policy_id: int, ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(policy_id, int): raise AmdSmiParameterException(policy_id, int) _check_res( amdsmi_wrapper.amdsmi_set_soc_pstate( processor_handle, policy_id ) ) def amdsmi_set_xgmi_plpd( processor_handle: processor_handle_t, policy_id: int, ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(policy_id, int): raise AmdSmiParameterException(policy_id, int) _check_res( amdsmi_wrapper.amdsmi_set_xgmi_plpd( processor_handle, policy_id ) ) return def amdsmi_set_gpu_process_isolation( processor_handle: processor_handle_t, pisolate: int, ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(pisolate, int): raise AmdSmiParameterException(pisolate, int) _check_res( amdsmi_wrapper.amdsmi_set_gpu_process_isolation( processor_handle, pisolate ) ) def amdsmi_clean_gpu_local_data( processor_handle: processor_handle_t, ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) _check_res( amdsmi_wrapper.amdsmi_clean_gpu_local_data( processor_handle ) ) def amdsmi_set_gpu_overdrive_level( processor_handle: processor_handle_t, overdrive_value: int ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(overdrive_value, int): raise AmdSmiParameterException(overdrive_value, int) overdrive_value_32 = ctypes.c_uint32(overdrive_value) _check_res( amdsmi_wrapper.amdsmi_set_gpu_overdrive_level( processor_handle, overdrive_value_32) ) def amdsmi_get_gpu_bdf_id(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) bdfid = ctypes.c_uint64() _check_res( amdsmi_wrapper.amdsmi_get_gpu_bdf_id( processor_handle, ctypes.byref(bdfid)) ) return bdfid.value def amdsmi_set_gpu_pci_bandwidth( processor_handle: processor_handle_t, bitmask: int ) -> None: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(bitmask, int): raise AmdSmiParameterException(bitmask, int) _check_res( amdsmi_wrapper.amdsmi_set_gpu_pci_bandwidth( processor_handle, ctypes.c_uint64(bitmask) ) ) def _format_transfer_rate(transfer_rate): return { 'num_supported': transfer_rate.num_supported, 'current': transfer_rate.current, 'frequency': list(transfer_rate.frequency) } def amdsmi_get_gpu_pci_bandwidth(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) bandwidth = amdsmi_wrapper.amdsmi_pcie_bandwidth_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_pci_bandwidth( processor_handle, ctypes.byref(bandwidth)) ) transfer_rate = _format_transfer_rate(bandwidth.transfer_rate) return { 'transfer_rate': transfer_rate, 'lanes': list(bandwidth.lanes) } def amdsmi_get_gpu_pci_throughput(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) sent = ctypes.c_uint64() received = ctypes.c_uint64() max_pkt_sz = ctypes.c_uint64() _check_res( amdsmi_wrapper.amdsmi_get_gpu_pci_throughput(processor_handle, ctypes.byref( sent), ctypes.byref(received), ctypes.byref(max_pkt_sz)) ) return { 'sent': sent.value, 'received': received.value, 'max_pkt_sz': max_pkt_sz.value } def amdsmi_get_gpu_pci_replay_counter(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) counter = ctypes.c_uint64() _check_res( amdsmi_wrapper.amdsmi_get_gpu_pci_replay_counter( processor_handle, ctypes.byref(counter)) ) return counter.value def amdsmi_get_gpu_topo_numa_affinity(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) numa_node = ctypes.c_int32() _check_res( amdsmi_wrapper.amdsmi_get_gpu_topo_numa_affinity( processor_handle, ctypes.byref(numa_node)) ) return numa_node.value def amdsmi_set_power_cap( processor_handle: processor_handle_t, sensor_ind: int, cap: int ) -> None: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(sensor_ind, int): raise AmdSmiParameterException(sensor_ind, int) if not isinstance(cap, int): raise AmdSmiParameterException(cap, int) _check_res( amdsmi_wrapper.amdsmi_set_power_cap( processor_handle, ctypes.c_uint32(sensor_ind), ctypes.c_uint64(cap) ) ) def amdsmi_set_gpu_power_profile( processor_handle: processor_handle_t, reserved: int, profile: AmdSmiPowerProfilePresetMasks, ) -> None: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(reserved, int): raise AmdSmiParameterException(reserved, int) if not isinstance(profile, AmdSmiPowerProfilePresetMasks): raise AmdSmiParameterException(profile, AmdSmiPowerProfilePresetMasks) _check_res( amdsmi_wrapper.amdsmi_set_gpu_power_profile( processor_handle, ctypes.c_uint32(reserved), profile ) ) def amdsmi_get_energy_count(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) energy_accumulator= ctypes.c_uint64() counter_resolution = ctypes.c_float() timestamp = ctypes.c_uint64() _check_res( amdsmi_wrapper.amdsmi_get_energy_count(processor_handle, ctypes.byref( energy_accumulator), ctypes.byref(counter_resolution), ctypes.byref(timestamp)) ) return { 'energy_accumulator': energy_accumulator.value, 'counter_resolution': counter_resolution.value, 'timestamp': timestamp.value, } def amdsmi_set_gpu_clk_range( processor_handle: processor_handle_t, min_clk_value: int, max_clk_value: int, clk_type: AmdSmiClkType, ) -> None: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(min_clk_value, int): raise AmdSmiParameterException(min_clk_value, int) if not isinstance(max_clk_value, int): raise AmdSmiParameterException(min_clk_value, int) if not isinstance(clk_type, AmdSmiClkType): raise AmdSmiParameterException(clk_type, AmdSmiClkType) _check_res( amdsmi_wrapper.amdsmi_set_gpu_clk_range( processor_handle, ctypes.c_uint64(min_clk_value), ctypes.c_uint64(max_clk_value), clk_type, ) ) def amdsmi_set_gpu_clk_limit( processor_handle: processor_handle_t, clk_type: str, limit_type: str, value: int ) -> None: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(clk_type, str): raise AmdSmiParameterException(clk_type, str) if not isinstance(limit_type, str): raise AmdSmiParameterException(limit_type, str) if not isinstance(value, int): raise AmdSmiParameterException(value, int) if clk_type.lower() == "sclk": clk_type_conversion = amdsmi_wrapper.AMDSMI_CLK_TYPE_SYS elif clk_type.lower() == "mclk": clk_type_conversion = amdsmi_wrapper.AMDSMI_CLK_TYPE_MEM if limit_type.lower() == "min": limit_type_conversion = amdsmi_wrapper.CLK_LIMIT_MIN elif limit_type.lower() == "max": limit_type_conversion = amdsmi_wrapper.CLK_LIMIT_MAX _check_res( amdsmi_wrapper.amdsmi_set_gpu_clk_limit( processor_handle, clk_type_conversion, limit_type_conversion, ctypes.c_uint64(value), ) ) def amdsmi_get_gpu_memory_total(processor_handle: processor_handle_t, mem_type: AmdSmiMemoryType): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(mem_type, AmdSmiMemoryType): raise AmdSmiParameterException( mem_type, AmdSmiMemoryType ) total = ctypes.c_uint64() _check_res( amdsmi_wrapper.amdsmi_get_gpu_memory_total( processor_handle, mem_type, ctypes.byref(total)) ) return total.value def amdsmi_set_gpu_od_clk_info( processor_handle: processor_handle_t, level: AmdSmiFreqInd, value: int, clk_type: AmdSmiClkType, ) -> None: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(level, AmdSmiFreqInd): raise AmdSmiParameterException(level, AmdSmiFreqInd) if not isinstance(value, int): raise AmdSmiParameterException(value, int) if not isinstance(clk_type, AmdSmiClkType): raise AmdSmiParameterException(clk_type, AmdSmiClkType) _check_res( amdsmi_wrapper.amdsmi_set_gpu_od_clk_info( processor_handle, level, ctypes.c_uint64(value), clk_type ) ) def amdsmi_get_gpu_memory_usage(processor_handle: processor_handle_t, mem_type: AmdSmiMemoryType): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(mem_type, AmdSmiMemoryType): raise AmdSmiParameterException( mem_type, AmdSmiMemoryType ) used = ctypes.c_uint64() _check_res( amdsmi_wrapper.amdsmi_get_gpu_memory_usage( processor_handle, mem_type, ctypes.byref(used)) ) return used.value def amdsmi_set_gpu_od_volt_info( processor_handle: processor_handle_t, vpoint: int, clk_value: int, volt_value: int, ) -> None: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(vpoint, int): raise AmdSmiParameterException(vpoint, int) if not isinstance(clk_value, int): raise AmdSmiParameterException(clk_value, int) if not isinstance(volt_value, int): raise AmdSmiParameterException(volt_value, int) _check_res( amdsmi_wrapper.amdsmi_set_gpu_od_volt_info( processor_handle, ctypes.c_uint32(vpoint), ctypes.c_uint64(clk_value), ctypes.c_uint64(volt_value), ) ) def amdsmi_get_gpu_fan_rpms( processor_handle: processor_handle_t, sensor_idx: int ) -> int: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(sensor_idx, int): raise AmdSmiParameterException(sensor_idx, int) fan_speed = ctypes.c_int64() _check_res( amdsmi_wrapper.amdsmi_get_gpu_fan_rpms( processor_handle, sensor_idx, ctypes.byref(fan_speed) ) ) return fan_speed.value def amdsmi_get_gpu_fan_speed( processor_handle: processor_handle_t, sensor_idx: int ) -> int: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(sensor_idx, int): raise AmdSmiParameterException(sensor_idx, int) fan_speed = ctypes.c_int64() _check_res( amdsmi_wrapper.amdsmi_get_gpu_fan_speed( processor_handle, sensor_idx, ctypes.byref(fan_speed) ) ) return fan_speed.value def amdsmi_get_gpu_fan_speed_max( processor_handle: processor_handle_t, sensor_idx: int ) -> int: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(sensor_idx, int): raise AmdSmiParameterException(sensor_idx, int) fan_speed = ctypes.c_uint64() _check_res( amdsmi_wrapper.amdsmi_get_gpu_fan_speed_max( processor_handle, sensor_idx, ctypes.byref(fan_speed) ) ) return fan_speed.value def amdsmi_get_node_handle(processor_handle): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException(processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) node_handle = amdsmi_wrapper.amdsmi_node_handle() _check_res( amdsmi_wrapper.amdsmi_get_node_handle(processor_handle, ctypes.byref(node_handle)) ) return node_handle def amdsmi_get_npm_info(node_handle: processor_handle_t) -> Dict[str, Any]: if not isinstance(node_handle, amdsmi_wrapper.amdsmi_node_handle): raise AmdSmiParameterException(node_handle, amdsmi_wrapper.amdsmi_node_handle) npm_info = amdsmi_wrapper.amdsmi_npm_info_t() _check_res( amdsmi_wrapper.amdsmi_get_npm_info( node_handle, ctypes.byref(npm_info) ) ) dict_ret = { "limit": npm_info.limit, "status": npm_info.status, } return dict_ret def amdsmi_get_temp_metric( processor_handle: processor_handle_t, sensor_type: AmdSmiTemperatureType, metric: AmdSmiTemperatureMetric, ) -> int: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(sensor_type, AmdSmiTemperatureType): raise AmdSmiParameterException(sensor_type, AmdSmiTemperatureType) if not isinstance(metric, AmdSmiTemperatureMetric): raise AmdSmiParameterException(metric, AmdSmiTemperatureMetric) temp_value = ctypes.c_int64() _check_res( amdsmi_wrapper.amdsmi_get_temp_metric( processor_handle, sensor_type, metric, ctypes.byref(temp_value) ) ) return temp_value.value def amdsmi_get_gpu_volt_metric( processor_handle: processor_handle_t, sensor_type: AmdSmiVoltageType, metric: AmdSmiVoltageMetric, ) -> int: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(sensor_type, AmdSmiVoltageType): raise AmdSmiParameterException(sensor_type, AmdSmiVoltageType) if not isinstance(metric, AmdSmiVoltageMetric): raise AmdSmiParameterException(metric, AmdSmiVoltageMetric) voltage = ctypes.c_int64() _check_res( amdsmi_wrapper.amdsmi_get_gpu_volt_metric( processor_handle, sensor_type, metric, ctypes.byref(voltage) ) ) return voltage.value def amdsmi_get_utilization_count( processor_handle: processor_handle_t, counter_types: List[AmdSmiUtilizationCounterType] ) -> List[Dict[str, Any]]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) # Enforce List typing if not isinstance(counter_types, list): counter_types = [counter_types] counter_types = list(set(counter_types)) # Validate Inputs if len(counter_types) == 0: raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_INVAL) counters = [] for counter_type in counter_types: if not isinstance(counter_type, AmdSmiUtilizationCounterType): raise AmdSmiParameterException( counter_type, AmdSmiUtilizationCounterType) counter = amdsmi_wrapper.amdsmi_utilization_counter_t() counter.type = counter_type counters.append(counter) count = ctypes.c_uint32(len(counters)) timestamp = ctypes.c_uint64() util_counter_list = (amdsmi_wrapper.amdsmi_utilization_counter_t * len(counters))(*counters) _check_res( amdsmi_wrapper.amdsmi_get_utilization_count( processor_handle, util_counter_list, count, ctypes.byref(timestamp) ) ) if count.value != len(counters): raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_API_FAILED) result = [] result.append({"timestamp": timestamp.value}) for index in range(count.value): counter_type = amdsmi_wrapper.amdsmi_utilization_counter_type_t__enumvalues[ util_counter_list[index].type ] if counter_type == "AMDSMI_UTILIZATION_COUNTER_FIRST": counter_type = "AMDSMI_COARSE_GRAIN_GPU_ACTIVITY" if counter_type == "AMDSMI_UTILIZATION_COUNTER_LAST": counter_type = "AMDSMI_FINE_DECODER_ACTIVITY" result.append( {"type": counter_type, "value": util_counter_list[index].value}) return result def amdsmi_get_gpu_perf_level( processor_handle: processor_handle_t, ) -> str: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) perf = amdsmi_wrapper.amdsmi_dev_perf_level_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_perf_level( processor_handle, ctypes.byref(perf)) ) result = amdsmi_wrapper.amdsmi_dev_perf_level_t__enumvalues[perf.value] if result == "AMDSMI_DEV_PERF_LEVEL_FIRST": result = "AMDSMI_DEV_PERF_LEVEL_AUTO" if result == "AMDSMI_DEV_PERF_LEVEL_LAST": result = "AMDSMI_DEV_PERF_LEVEL_DETERMINISM" return result def amdsmi_set_gpu_perf_determinism_mode( processor_handle: processor_handle_t, clkvalue: int ) -> None: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(clkvalue, int): raise AmdSmiParameterException(clkvalue, int) _check_res(amdsmi_wrapper.amdsmi_set_gpu_perf_determinism_mode( processor_handle, clkvalue)) def amdsmi_get_gpu_overdrive_level( processor_handle: processor_handle_t, ) -> int: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) od_level = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_gpu_overdrive_level( processor_handle, ctypes.byref(od_level) ) ) return od_level.value def amdsmi_get_gpu_mem_overdrive_level( processor_handle: processor_handle_t, ) -> int: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) mem_od_level = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_gpu_mem_overdrive_level( processor_handle, ctypes.byref(mem_od_level) ) ) return mem_od_level.value def amdsmi_get_clk_freq( processor_handle: processor_handle_t, clk_type: AmdSmiClkType ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(clk_type, AmdSmiClkType): raise AmdSmiParameterException(clk_type, AmdSmiClkType) freq = amdsmi_wrapper.amdsmi_frequencies_t() _check_res( amdsmi_wrapper.amdsmi_get_clk_freq( processor_handle, clk_type, ctypes.byref(freq) ) ) dict_ret = { "num_supported": freq.num_supported, "current": freq.current, "frequency": list(freq.frequency)[: freq.num_supported], } return dict_ret def amdsmi_get_soc_pstate( processor_handle: processor_handle_t, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) policy = amdsmi_wrapper.amdsmi_dpm_policy_t() _check_res( amdsmi_wrapper.amdsmi_get_soc_pstate( processor_handle, ctypes.byref(policy) ) ) polices = [] for i in range(0, policy.num_supported): policy_id = policy.policies[i].policy_id desc = policy.policies[i].policy_description polices.append({ 'policy_id' : policy_id, 'policy_description': desc.decode() }) current_id = policy.policies[policy.current].policy_id return { "num_supported": policy.num_supported, "current_id": current_id, "policies": polices, } def amdsmi_get_xgmi_plpd( processor_handle: processor_handle_t ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) policy = amdsmi_wrapper.amdsmi_dpm_policy_t() _check_res( amdsmi_wrapper.amdsmi_get_xgmi_plpd(processor_handle, ctypes.byref(policy)) ) policies = [] for i in range(policy.num_supported): try: # Access the policy entry directly policy_entry = policy.policies[i] policy_id = policy_entry.policy_id # Handle the policy description more carefully policy_desc_bytes = policy_entry.policy_description if policy_desc_bytes: # Convert ctypes array to bytes and decode policy_desc = ctypes.string_at(policy_desc_bytes).decode('utf-8').rstrip('\x00') else: policy_desc = "" policies.append({ 'policy_id': policy_id, 'policy_description': policy_desc }) except (UnicodeDecodeError, AttributeError, ValueError): # Fallback for problematic entries policies.append({ 'policy_id': 0, # Default fallback 'policy_description': "" }) # Get current policy ID correctly if policy.current < policy.num_supported: current_id = policy.policies[policy.current].policy_id else: current_id = 0 # Fallback return { "num_supported": policy.num_supported, "current_id": current_id, "plpds": policies, # Marked for deprecation "policies": policies, # Correct field name } def amdsmi_get_gpu_process_isolation( processor_handle: processor_handle_t, ) -> int: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) pisolate = ctypes.c_uint32() _check_res( amdsmi_wrapper.amdsmi_get_gpu_process_isolation( processor_handle, ctypes.byref(pisolate) ) ) return pisolate.value def amdsmi_get_gpu_od_volt_info( processor_handle: processor_handle_t, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) freq_data = amdsmi_wrapper.amdsmi_od_volt_freq_data_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_od_volt_info( processor_handle, ctypes.byref(freq_data) ) ) sclk_lower = freq_data.curr_sclk_range.lower_bound sclk_upper = freq_data.curr_sclk_range.upper_bound mclk_lower = freq_data.curr_mclk_range.lower_bound mclk_upper = freq_data.curr_mclk_range.upper_bound if sclk_lower == MaxUIntegerTypes.UINT64_T: sclk_lower = "N/A" if sclk_upper == MaxUIntegerTypes.UINT64_T: sclk_upper = "N/A" if mclk_lower == MaxUIntegerTypes.UINT64_T: mclk_lower = "N/A" if mclk_upper == MaxUIntegerTypes.UINT64_T: mclk_upper = "N/A" return { "curr_sclk_range": { "lower_bound": sclk_lower, "upper_bound": sclk_upper }, "curr_mclk_range": { "lower_bound": mclk_lower, "upper_bound": mclk_upper }, "sclk_freq_limits": { "lower_bound": freq_data.sclk_freq_limits.lower_bound, "upper_bound": freq_data.sclk_freq_limits.upper_bound }, "mclk_freq_limits": { "lower_bound": freq_data.mclk_freq_limits.lower_bound, "upper_bound": freq_data.mclk_freq_limits.upper_bound }, "curve.vc_points": [ { "frequency": freq_data.curve.vc_points[0].frequency, "voltage": freq_data.curve.vc_points[0].voltage }, { "frequency": freq_data.curve.vc_points[1].frequency, "voltage": freq_data.curve.vc_points[1].voltage }, { "frequency": freq_data.curve.vc_points[2].frequency, "voltage": freq_data.curve.vc_points[2].voltage } ], "num_regions": freq_data.num_regions } def amdsmi_get_gpu_metrics_info( processor_handle: processor_handle_t, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) gpu_metrics = amdsmi_wrapper.amdsmi_gpu_metrics_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_metrics_info( processor_handle, ctypes.byref(gpu_metrics) ) ) gpu_metrics_output = { "common_header.structure_size": _validate_if_max_uint(gpu_metrics.common_header.structure_size, MaxUIntegerTypes.UINT16_T), "common_header.format_revision": _validate_if_max_uint(gpu_metrics.common_header.format_revision, MaxUIntegerTypes.UINT8_T), "common_header.content_revision": _validate_if_max_uint(gpu_metrics.common_header.content_revision, MaxUIntegerTypes.UINT8_T), "temperature_edge": _validate_if_max_uint(gpu_metrics.temperature_edge, MaxUIntegerTypes.UINT16_T), "temperature_hotspot": _validate_if_max_uint(gpu_metrics.temperature_hotspot, MaxUIntegerTypes.UINT16_T), "temperature_mem": _validate_if_max_uint(gpu_metrics.temperature_mem, MaxUIntegerTypes.UINT16_T), "temperature_vrgfx": _validate_if_max_uint(gpu_metrics.temperature_vrgfx, MaxUIntegerTypes.UINT16_T), "temperature_vrsoc": _validate_if_max_uint(gpu_metrics.temperature_vrsoc, MaxUIntegerTypes.UINT16_T), "temperature_vrmem": _validate_if_max_uint(gpu_metrics.temperature_vrmem, MaxUIntegerTypes.UINT16_T), "average_gfx_activity": _validate_if_max_uint(gpu_metrics.average_gfx_activity, MaxUIntegerTypes.UINT16_T, isActivity=True), "average_umc_activity": _validate_if_max_uint(gpu_metrics.average_umc_activity, MaxUIntegerTypes.UINT16_T, isActivity=True), "average_mm_activity": _validate_if_max_uint(gpu_metrics.average_mm_activity, MaxUIntegerTypes.UINT16_T, isActivity=True), "average_socket_power": _validate_if_max_uint(gpu_metrics.average_socket_power, MaxUIntegerTypes.UINT16_T), "energy_accumulator": _validate_if_max_uint(gpu_metrics.energy_accumulator, MaxUIntegerTypes.UINT64_T), "system_clock_counter": _validate_if_max_uint(gpu_metrics.system_clock_counter, MaxUIntegerTypes.UINT64_T), "average_gfxclk_frequency": _validate_if_max_uint(gpu_metrics.average_gfxclk_frequency, MaxUIntegerTypes.UINT16_T), "average_socclk_frequency": _validate_if_max_uint(gpu_metrics.average_socclk_frequency, MaxUIntegerTypes.UINT16_T), "average_uclk_frequency": _validate_if_max_uint(gpu_metrics.average_uclk_frequency, MaxUIntegerTypes.UINT16_T), "average_vclk0_frequency": _validate_if_max_uint(gpu_metrics.average_vclk0_frequency, MaxUIntegerTypes.UINT16_T), "average_dclk0_frequency": _validate_if_max_uint(gpu_metrics.average_dclk0_frequency, MaxUIntegerTypes.UINT16_T), "average_vclk1_frequency": _validate_if_max_uint(gpu_metrics.average_vclk1_frequency, MaxUIntegerTypes.UINT16_T), "average_dclk1_frequency": _validate_if_max_uint(gpu_metrics.average_dclk1_frequency, MaxUIntegerTypes.UINT16_T), "current_gfxclk": _validate_if_max_uint(gpu_metrics.current_gfxclk, MaxUIntegerTypes.UINT16_T), "current_socclk": _validate_if_max_uint(gpu_metrics.current_socclk, MaxUIntegerTypes.UINT16_T), "current_uclk": _validate_if_max_uint(gpu_metrics.current_uclk, MaxUIntegerTypes.UINT16_T), "current_vclk0": _validate_if_max_uint(gpu_metrics.current_vclk0, MaxUIntegerTypes.UINT16_T), "current_dclk0": _validate_if_max_uint(gpu_metrics.current_dclk0, MaxUIntegerTypes.UINT16_T), "current_vclk1": _validate_if_max_uint(gpu_metrics.current_vclk1, MaxUIntegerTypes.UINT16_T), "current_dclk1": _validate_if_max_uint(gpu_metrics.current_dclk1, MaxUIntegerTypes.UINT16_T), "throttle_status": _validate_if_max_uint(gpu_metrics.throttle_status, MaxUIntegerTypes.UINT32_T, isBool=True), "current_fan_speed": _validate_if_max_uint(gpu_metrics.current_fan_speed, MaxUIntegerTypes.UINT16_T), "pcie_link_width": _validate_if_max_uint(gpu_metrics.pcie_link_width, MaxUIntegerTypes.UINT16_T), "pcie_link_speed": _validate_if_max_uint(gpu_metrics.pcie_link_speed, MaxUIntegerTypes.UINT16_T), "gfx_activity_acc": _validate_if_max_uint(gpu_metrics.gfx_activity_acc, MaxUIntegerTypes.UINT32_T), "mem_activity_acc": _validate_if_max_uint(gpu_metrics.mem_activity_acc, MaxUIntegerTypes.UINT32_T), "temperature_hbm": _validate_if_max_uint(list(gpu_metrics.temperature_hbm), MaxUIntegerTypes.UINT16_T), "firmware_timestamp": _validate_if_max_uint(gpu_metrics.firmware_timestamp, MaxUIntegerTypes.UINT64_T), "voltage_soc": _validate_if_max_uint(gpu_metrics.voltage_soc, MaxUIntegerTypes.UINT16_T), "voltage_gfx": _validate_if_max_uint(gpu_metrics.voltage_gfx, MaxUIntegerTypes.UINT16_T), "voltage_mem": _validate_if_max_uint(gpu_metrics.voltage_mem, MaxUIntegerTypes.UINT16_T), "indep_throttle_status": _validate_if_max_uint(gpu_metrics.indep_throttle_status, MaxUIntegerTypes.UINT64_T, isBool=True), "current_socket_power": _validate_if_max_uint(gpu_metrics.current_socket_power, MaxUIntegerTypes.UINT16_T), "vcn_activity": _validate_if_max_uint(list(gpu_metrics.vcn_activity), MaxUIntegerTypes.UINT16_T, isActivity=True), "gfxclk_lock_status": _validate_if_max_uint(gpu_metrics.gfxclk_lock_status, MaxUIntegerTypes.UINT32_T), "xgmi_link_width": _validate_if_max_uint(gpu_metrics.xgmi_link_width, MaxUIntegerTypes.UINT16_T), "xgmi_link_speed": _validate_if_max_uint(gpu_metrics.xgmi_link_speed, MaxUIntegerTypes.UINT16_T), "pcie_bandwidth_acc": _validate_if_max_uint(gpu_metrics.pcie_bandwidth_acc, MaxUIntegerTypes.UINT64_T), "pcie_bandwidth_inst": _validate_if_max_uint(gpu_metrics.pcie_bandwidth_inst, MaxUIntegerTypes.UINT64_T), "pcie_l0_to_recov_count_acc": _validate_if_max_uint(gpu_metrics.pcie_l0_to_recov_count_acc, MaxUIntegerTypes.UINT64_T), "pcie_replay_count_acc": _validate_if_max_uint(gpu_metrics.pcie_replay_count_acc, MaxUIntegerTypes.UINT64_T), "pcie_replay_rover_count_acc": _validate_if_max_uint(gpu_metrics.pcie_replay_rover_count_acc, MaxUIntegerTypes.UINT64_T), "xgmi_read_data_acc": _validate_if_max_uint(list(gpu_metrics.xgmi_read_data_acc), MaxUIntegerTypes.UINT64_T), "xgmi_write_data_acc": _validate_if_max_uint(list(gpu_metrics.xgmi_write_data_acc), MaxUIntegerTypes.UINT64_T), "current_gfxclks": _validate_if_max_uint(list(gpu_metrics.current_gfxclks), MaxUIntegerTypes.UINT16_T), "current_socclks": _validate_if_max_uint(list(gpu_metrics.current_socclks), MaxUIntegerTypes.UINT16_T), "current_vclk0s": _validate_if_max_uint(list(gpu_metrics.current_vclk0s), MaxUIntegerTypes.UINT16_T), "current_dclk0s": _validate_if_max_uint(list(gpu_metrics.current_dclk0s), MaxUIntegerTypes.UINT16_T), "jpeg_activity": _validate_if_max_uint(list(gpu_metrics.jpeg_activity), MaxUIntegerTypes.UINT16_T, isActivity=True), "pcie_nak_sent_count_acc": _validate_if_max_uint(gpu_metrics.pcie_nak_sent_count_acc, MaxUIntegerTypes.UINT32_T), "pcie_nak_rcvd_count_acc": _validate_if_max_uint(gpu_metrics.pcie_nak_rcvd_count_acc, MaxUIntegerTypes.UINT32_T), "accumulation_counter": _validate_if_max_uint(gpu_metrics.accumulation_counter, MaxUIntegerTypes.UINT64_T), "prochot_residency_acc": _validate_if_max_uint(gpu_metrics.prochot_residency_acc, MaxUIntegerTypes.UINT64_T), "ppt_residency_acc": _validate_if_max_uint(gpu_metrics.ppt_residency_acc, MaxUIntegerTypes.UINT64_T), "socket_thm_residency_acc": _validate_if_max_uint(gpu_metrics.socket_thm_residency_acc, MaxUIntegerTypes.UINT64_T), "vr_thm_residency_acc": _validate_if_max_uint(gpu_metrics.vr_thm_residency_acc, MaxUIntegerTypes.UINT64_T), "hbm_thm_residency_acc": _validate_if_max_uint(gpu_metrics.hbm_thm_residency_acc, MaxUIntegerTypes.UINT64_T), "num_partition": _validate_if_max_uint(gpu_metrics.num_partition, MaxUIntegerTypes.UINT16_T), "xcp_stats.gfx_busy_inst": list(gpu_metrics.xcp_stats), "xcp_stats.jpeg_busy": list(gpu_metrics.xcp_stats), "xcp_stats.vcn_busy": list(gpu_metrics.xcp_stats), "xcp_stats.gfx_busy_acc": list(gpu_metrics.xcp_stats), "xcp_stats.gfx_below_host_limit_acc": list(gpu_metrics.xcp_stats), "xcp_stats.gfx_below_host_limit_ppt_acc": list(gpu_metrics.xcp_stats), "xcp_stats.gfx_below_host_limit_thm_acc": list(gpu_metrics.xcp_stats), "xcp_stats.gfx_low_utilization_acc": list(gpu_metrics.xcp_stats), "xcp_stats.gfx_below_host_limit_total_acc": list(gpu_metrics.xcp_stats), "pcie_lc_perf_other_end_recovery": _validate_if_max_uint(gpu_metrics.pcie_lc_perf_other_end_recovery, MaxUIntegerTypes.UINT32_T), "vram_max_bandwidth": _validate_if_max_uint(gpu_metrics.vram_max_bandwidth, MaxUIntegerTypes.UINT64_T), "xgmi_link_status": _validate_if_max_uint(list(gpu_metrics.xgmi_link_status), MaxUIntegerTypes.UINT16_T), } # Create 2d array with each XCD's stats if 'xcp_stats.gfx_busy_inst' in gpu_metrics_output: for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_busy_inst']): xcp_detail = [] for val in xcp_metrics.gfx_busy_inst: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT32_T, isActivity=True)) gpu_metrics_output['xcp_stats.gfx_busy_inst'][xcp_index] = xcp_detail if 'xcp_stats.jpeg_busy' in gpu_metrics_output: for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.jpeg_busy']): xcp_detail = [] for val in xcp_metrics.jpeg_busy: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT16_T, isActivity=True)) gpu_metrics_output['xcp_stats.jpeg_busy'][xcp_index] = xcp_detail if 'xcp_stats.vcn_busy' in gpu_metrics_output: for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.vcn_busy']): xcp_detail = [] for val in xcp_metrics.vcn_busy: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT16_T, isActivity=True)) gpu_metrics_output["xcp_stats.vcn_busy"][xcp_index] = xcp_detail if 'xcp_stats.gfx_busy_acc' in gpu_metrics_output: for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_busy_acc']): xcp_detail = [] for val in xcp_metrics.gfx_busy_acc: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) gpu_metrics_output["xcp_stats.gfx_busy_acc"][xcp_index] = xcp_detail if 'xcp_stats.gfx_below_host_limit_acc' in gpu_metrics_output: for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_below_host_limit_acc']): xcp_detail = [] for val in xcp_metrics.gfx_below_host_limit_acc: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) gpu_metrics_output['xcp_stats.gfx_below_host_limit_acc'][xcp_index] = xcp_detail # new for gpu metrics v1.8 if 'xcp_stats.gfx_below_host_limit_ppt_acc' in gpu_metrics_output: for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_below_host_limit_ppt_acc']): xcp_detail = [] for val in xcp_metrics.gfx_below_host_limit_ppt_acc: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) gpu_metrics_output['xcp_stats.gfx_below_host_limit_ppt_acc'][xcp_index] = xcp_detail if 'xcp_stats.gfx_below_host_limit_thm_acc' in gpu_metrics_output: for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_below_host_limit_thm_acc']): xcp_detail = [] for val in xcp_metrics.gfx_below_host_limit_thm_acc: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) gpu_metrics_output['xcp_stats.gfx_below_host_limit_thm_acc'][xcp_index] = xcp_detail if 'xcp_stats.gfx_low_utilization_acc' in gpu_metrics_output: for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_low_utilization_acc']): xcp_detail = [] for val in xcp_metrics.gfx_low_utilization_acc: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) gpu_metrics_output['xcp_stats.gfx_low_utilization_acc'][xcp_index] = xcp_detail if 'xcp_stats.gfx_below_host_limit_total_acc' in gpu_metrics_output: for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_below_host_limit_total_acc']): xcp_detail = [] for val in xcp_metrics.gfx_below_host_limit_total_acc: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) gpu_metrics_output['xcp_stats.gfx_below_host_limit_total_acc'][xcp_index] = xcp_detail return gpu_metrics_output def amdsmi_get_gpu_partition_metrics_info( processor_handle: processor_handle_t, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) gpu_metrics = amdsmi_wrapper.amdsmi_gpu_metrics_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_partition_metrics_info( processor_handle, ctypes.byref(gpu_metrics) ) ) gpu_metrics_output = { "common_header.structure_size": _validate_if_max_uint(gpu_metrics.common_header.structure_size, MaxUIntegerTypes.UINT16_T), "common_header.format_revision": _validate_if_max_uint(gpu_metrics.common_header.format_revision, MaxUIntegerTypes.UINT8_T), "common_header.content_revision": _validate_if_max_uint(gpu_metrics.common_header.content_revision, MaxUIntegerTypes.UINT8_T), "temperature_edge": _validate_if_max_uint(gpu_metrics.temperature_edge, MaxUIntegerTypes.UINT16_T), "temperature_hotspot": _validate_if_max_uint(gpu_metrics.temperature_hotspot, MaxUIntegerTypes.UINT16_T), "temperature_mem": _validate_if_max_uint(gpu_metrics.temperature_mem, MaxUIntegerTypes.UINT16_T), "temperature_vrgfx": _validate_if_max_uint(gpu_metrics.temperature_vrgfx, MaxUIntegerTypes.UINT16_T), "temperature_vrsoc": _validate_if_max_uint(gpu_metrics.temperature_vrsoc, MaxUIntegerTypes.UINT16_T), "temperature_vrmem": _validate_if_max_uint(gpu_metrics.temperature_vrmem, MaxUIntegerTypes.UINT16_T), "average_gfx_activity": _validate_if_max_uint(gpu_metrics.average_gfx_activity, MaxUIntegerTypes.UINT16_T, isActivity=True), "average_umc_activity": _validate_if_max_uint(gpu_metrics.average_umc_activity, MaxUIntegerTypes.UINT16_T, isActivity=True), "average_mm_activity": _validate_if_max_uint(gpu_metrics.average_mm_activity, MaxUIntegerTypes.UINT16_T, isActivity=True), "average_socket_power": _validate_if_max_uint(gpu_metrics.average_socket_power, MaxUIntegerTypes.UINT16_T), "energy_accumulator": _validate_if_max_uint(gpu_metrics.energy_accumulator, MaxUIntegerTypes.UINT64_T), "system_clock_counter": _validate_if_max_uint(gpu_metrics.system_clock_counter, MaxUIntegerTypes.UINT64_T), "average_gfxclk_frequency": _validate_if_max_uint(gpu_metrics.average_gfxclk_frequency, MaxUIntegerTypes.UINT16_T), "average_socclk_frequency": _validate_if_max_uint(gpu_metrics.average_socclk_frequency, MaxUIntegerTypes.UINT16_T), "average_uclk_frequency": _validate_if_max_uint(gpu_metrics.average_uclk_frequency, MaxUIntegerTypes.UINT16_T), "average_vclk0_frequency": _validate_if_max_uint(gpu_metrics.average_vclk0_frequency, MaxUIntegerTypes.UINT16_T), "average_dclk0_frequency": _validate_if_max_uint(gpu_metrics.average_dclk0_frequency, MaxUIntegerTypes.UINT16_T), "average_vclk1_frequency": _validate_if_max_uint(gpu_metrics.average_vclk1_frequency, MaxUIntegerTypes.UINT16_T), "average_dclk1_frequency": _validate_if_max_uint(gpu_metrics.average_dclk1_frequency, MaxUIntegerTypes.UINT16_T), "current_gfxclk": _validate_if_max_uint(gpu_metrics.current_gfxclk, MaxUIntegerTypes.UINT16_T), "current_socclk": _validate_if_max_uint(gpu_metrics.current_socclk, MaxUIntegerTypes.UINT16_T), "current_uclk": _validate_if_max_uint(gpu_metrics.current_uclk, MaxUIntegerTypes.UINT16_T), "current_vclk0": _validate_if_max_uint(gpu_metrics.current_vclk0, MaxUIntegerTypes.UINT16_T), "current_dclk0": _validate_if_max_uint(gpu_metrics.current_dclk0, MaxUIntegerTypes.UINT16_T), "current_vclk1": _validate_if_max_uint(gpu_metrics.current_vclk1, MaxUIntegerTypes.UINT16_T), "current_dclk1": _validate_if_max_uint(gpu_metrics.current_dclk1, MaxUIntegerTypes.UINT16_T), "throttle_status": _validate_if_max_uint(gpu_metrics.throttle_status, MaxUIntegerTypes.UINT32_T, isBool=True), "current_fan_speed": _validate_if_max_uint(gpu_metrics.current_fan_speed, MaxUIntegerTypes.UINT16_T), "pcie_link_width": _validate_if_max_uint(gpu_metrics.pcie_link_width, MaxUIntegerTypes.UINT16_T), "pcie_link_speed": _validate_if_max_uint(gpu_metrics.pcie_link_speed, MaxUIntegerTypes.UINT16_T), "gfx_activity_acc": _validate_if_max_uint(gpu_metrics.gfx_activity_acc, MaxUIntegerTypes.UINT32_T), "mem_activity_acc": _validate_if_max_uint(gpu_metrics.mem_activity_acc, MaxUIntegerTypes.UINT32_T), "temperature_hbm": _validate_if_max_uint(list(gpu_metrics.temperature_hbm), MaxUIntegerTypes.UINT16_T), "firmware_timestamp": _validate_if_max_uint(gpu_metrics.firmware_timestamp, MaxUIntegerTypes.UINT64_T), "voltage_soc": _validate_if_max_uint(gpu_metrics.voltage_soc, MaxUIntegerTypes.UINT16_T), "voltage_gfx": _validate_if_max_uint(gpu_metrics.voltage_gfx, MaxUIntegerTypes.UINT16_T), "voltage_mem": _validate_if_max_uint(gpu_metrics.voltage_mem, MaxUIntegerTypes.UINT16_T), "indep_throttle_status": _validate_if_max_uint(gpu_metrics.indep_throttle_status, MaxUIntegerTypes.UINT64_T, isBool=True), "current_socket_power": _validate_if_max_uint(gpu_metrics.current_socket_power, MaxUIntegerTypes.UINT16_T), "vcn_activity": _validate_if_max_uint(list(gpu_metrics.vcn_activity), MaxUIntegerTypes.UINT16_T, isActivity=True), "gfxclk_lock_status": _validate_if_max_uint(gpu_metrics.gfxclk_lock_status, MaxUIntegerTypes.UINT32_T), "xgmi_link_width": _validate_if_max_uint(gpu_metrics.xgmi_link_width, MaxUIntegerTypes.UINT16_T), "xgmi_link_speed": _validate_if_max_uint(gpu_metrics.xgmi_link_speed, MaxUIntegerTypes.UINT16_T), "pcie_bandwidth_acc": _validate_if_max_uint(gpu_metrics.pcie_bandwidth_acc, MaxUIntegerTypes.UINT64_T), "pcie_bandwidth_inst": _validate_if_max_uint(gpu_metrics.pcie_bandwidth_inst, MaxUIntegerTypes.UINT64_T), "pcie_l0_to_recov_count_acc": _validate_if_max_uint(gpu_metrics.pcie_l0_to_recov_count_acc, MaxUIntegerTypes.UINT64_T), "pcie_replay_count_acc": _validate_if_max_uint(gpu_metrics.pcie_replay_count_acc, MaxUIntegerTypes.UINT64_T), "pcie_replay_rover_count_acc": _validate_if_max_uint(gpu_metrics.pcie_replay_rover_count_acc, MaxUIntegerTypes.UINT64_T), "xgmi_read_data_acc": _validate_if_max_uint(list(gpu_metrics.xgmi_read_data_acc), MaxUIntegerTypes.UINT64_T), "xgmi_write_data_acc": _validate_if_max_uint(list(gpu_metrics.xgmi_write_data_acc), MaxUIntegerTypes.UINT64_T), "current_gfxclks": _validate_if_max_uint(list(gpu_metrics.current_gfxclks), MaxUIntegerTypes.UINT16_T), "current_socclks": _validate_if_max_uint(list(gpu_metrics.current_socclks), MaxUIntegerTypes.UINT16_T), "current_vclk0s": _validate_if_max_uint(list(gpu_metrics.current_vclk0s), MaxUIntegerTypes.UINT16_T), "current_dclk0s": _validate_if_max_uint(list(gpu_metrics.current_dclk0s), MaxUIntegerTypes.UINT16_T), "jpeg_activity": _validate_if_max_uint(list(gpu_metrics.jpeg_activity), MaxUIntegerTypes.UINT16_T, isActivity=True), "pcie_nak_sent_count_acc": _validate_if_max_uint(gpu_metrics.pcie_nak_sent_count_acc, MaxUIntegerTypes.UINT32_T), "pcie_nak_rcvd_count_acc": _validate_if_max_uint(gpu_metrics.pcie_nak_rcvd_count_acc, MaxUIntegerTypes.UINT32_T), "accumulation_counter": _validate_if_max_uint(gpu_metrics.accumulation_counter, MaxUIntegerTypes.UINT64_T), "prochot_residency_acc": _validate_if_max_uint(gpu_metrics.prochot_residency_acc, MaxUIntegerTypes.UINT64_T), "ppt_residency_acc": _validate_if_max_uint(gpu_metrics.ppt_residency_acc, MaxUIntegerTypes.UINT64_T), "socket_thm_residency_acc": _validate_if_max_uint(gpu_metrics.socket_thm_residency_acc, MaxUIntegerTypes.UINT64_T), "vr_thm_residency_acc": _validate_if_max_uint(gpu_metrics.vr_thm_residency_acc, MaxUIntegerTypes.UINT64_T), "hbm_thm_residency_acc": _validate_if_max_uint(gpu_metrics.hbm_thm_residency_acc, MaxUIntegerTypes.UINT64_T), "num_partition": _validate_if_max_uint(gpu_metrics.num_partition, MaxUIntegerTypes.UINT16_T), "xcp_stats.gfx_busy_inst": list(gpu_metrics.xcp_stats), "xcp_stats.jpeg_busy": list(gpu_metrics.xcp_stats), "xcp_stats.vcn_busy": list(gpu_metrics.xcp_stats), "xcp_stats.gfx_busy_acc": list(gpu_metrics.xcp_stats), "xcp_stats.gfx_below_host_limit_acc": list(gpu_metrics.xcp_stats), "xcp_stats.gfx_below_host_limit_ppt_acc": list(gpu_metrics.xcp_stats), "xcp_stats.gfx_below_host_limit_thm_acc": list(gpu_metrics.xcp_stats), "xcp_stats.gfx_low_utilization_acc": list(gpu_metrics.xcp_stats), "xcp_stats.gfx_below_host_limit_total_acc": list(gpu_metrics.xcp_stats), "pcie_lc_perf_other_end_recovery": _validate_if_max_uint(gpu_metrics.pcie_lc_perf_other_end_recovery, MaxUIntegerTypes.UINT32_T), "vram_max_bandwidth": _validate_if_max_uint(gpu_metrics.vram_max_bandwidth, MaxUIntegerTypes.UINT64_T), "xgmi_link_status": _validate_if_max_uint(list(gpu_metrics.xgmi_link_status), MaxUIntegerTypes.UINT16_T), } # Create 2d array with each XCD's stats if 'xcp_stats.gfx_busy_inst' in gpu_metrics_output: for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_busy_inst']): xcp_detail = [] for val in xcp_metrics.gfx_busy_inst: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT32_T, isActivity=True)) gpu_metrics_output['xcp_stats.gfx_busy_inst'][xcp_index] = xcp_detail if 'xcp_stats.jpeg_busy' in gpu_metrics_output: for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.jpeg_busy']): xcp_detail = [] for val in xcp_metrics.jpeg_busy: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT16_T, isActivity=True)) gpu_metrics_output['xcp_stats.jpeg_busy'][xcp_index] = xcp_detail if 'xcp_stats.vcn_busy' in gpu_metrics_output: for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.vcn_busy']): xcp_detail = [] for val in xcp_metrics.vcn_busy: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT16_T, isActivity=True)) gpu_metrics_output["xcp_stats.vcn_busy"][xcp_index] = xcp_detail if 'xcp_stats.gfx_busy_acc' in gpu_metrics_output: for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_busy_acc']): xcp_detail = [] for val in xcp_metrics.gfx_busy_acc: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) gpu_metrics_output["xcp_stats.gfx_busy_acc"][xcp_index] = xcp_detail if 'xcp_stats.gfx_below_host_limit_acc' in gpu_metrics_output: for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_below_host_limit_acc']): xcp_detail = [] for val in xcp_metrics.gfx_below_host_limit_acc: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) gpu_metrics_output['xcp_stats.gfx_below_host_limit_acc'][xcp_index] = xcp_detail # new for gpu metrics v1.8 if 'xcp_stats.gfx_below_host_limit_ppt_acc' in gpu_metrics_output: for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_below_host_limit_ppt_acc']): xcp_detail = [] for val in xcp_metrics.gfx_below_host_limit_ppt_acc: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) gpu_metrics_output['xcp_stats.gfx_below_host_limit_ppt_acc'][xcp_index] = xcp_detail if 'xcp_stats.gfx_below_host_limit_thm_acc' in gpu_metrics_output: for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_below_host_limit_thm_acc']): xcp_detail = [] for val in xcp_metrics.gfx_below_host_limit_thm_acc: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) gpu_metrics_output['xcp_stats.gfx_below_host_limit_thm_acc'][xcp_index] = xcp_detail if 'xcp_stats.gfx_low_utilization_acc' in gpu_metrics_output: for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_low_utilization_acc']): xcp_detail = [] for val in xcp_metrics.gfx_low_utilization_acc: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) gpu_metrics_output['xcp_stats.gfx_low_utilization_acc'][xcp_index] = xcp_detail if 'xcp_stats.gfx_below_host_limit_total_acc' in gpu_metrics_output: for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_below_host_limit_total_acc']): xcp_detail = [] for val in xcp_metrics.gfx_below_host_limit_total_acc: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) gpu_metrics_output['xcp_stats.gfx_below_host_limit_total_acc'][xcp_index] = xcp_detail return gpu_metrics_output def amdsmi_get_gpu_od_volt_curve_regions( processor_handle: processor_handle_t, num_regions: int ) -> List[Dict[str, Any]]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(num_regions, int): raise AmdSmiParameterException(num_regions, int) region_count = ctypes.c_uint32(num_regions) buffer = (amdsmi_wrapper.amdsmi_freq_volt_region_t * num_regions)() _check_res( amdsmi_wrapper.amdsmi_get_gpu_od_volt_curve_regions( processor_handle, ctypes.byref(region_count), buffer ) ) result = [] for index in range(region_count.value): result.extend( [ { "freq_range": { "lower_bound": buffer[index].freq_range.lower_bound, "upper_bound": buffer[index].freq_range.upper_bound, }, "volt_range": { "lower_bound": buffer[index].volt_range.lower_bound, "upper_bound": buffer[index].volt_range.upper_bound, }, } ] ) return result def amdsmi_get_gpu_power_profile_presets( processor_handle: processor_handle_t, sensor_idx: int ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(sensor_idx, int): raise AmdSmiParameterException(sensor_idx, int) status = amdsmi_wrapper.amdsmi_power_profile_status_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_power_profile_presets( processor_handle, sensor_idx, ctypes.byref(status) ) ) return { "available_profiles": status.available_profiles, "current": status.current, "num_profiles": status.num_profiles, } def amdsmi_get_gpu_ecc_count( processor_handle: processor_handle_t, block: AmdSmiGpuBlock ) -> Dict[str, int]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(block, AmdSmiGpuBlock): raise AmdSmiParameterException(block, AmdSmiGpuBlock) ec = amdsmi_wrapper.amdsmi_error_count_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_ecc_count( processor_handle, block, ctypes.byref(ec)) ) return { "correctable_count": ec.correctable_count, "uncorrectable_count": ec.uncorrectable_count, "deferred_count": ec.deferred_count, } def amdsmi_get_gpu_ecc_enabled( processor_handle: processor_handle_t, ) -> int: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) blocks = ctypes.c_uint64(0) _check_res( amdsmi_wrapper.amdsmi_get_gpu_ecc_enabled( processor_handle, ctypes.byref(blocks)) ) return blocks.value def amdsmi_get_gpu_ecc_status( processor_handle: processor_handle_t, block: AmdSmiGpuBlock ) -> AmdSmiRasErrState: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(block, AmdSmiGpuBlock): raise AmdSmiParameterException(block, AmdSmiGpuBlock) state = amdsmi_wrapper.amdsmi_ras_err_state_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_ecc_status( processor_handle, block, ctypes.byref(state) ) ) return AmdSmiRasErrState(state.value) def amdsmi_status_code_to_string(status: amdsmi_wrapper.amdsmi_status_t) -> Union[str, bytes, None]: if not isinstance(status, amdsmi_wrapper.amdsmi_status_t): raise AmdSmiParameterException(status, amdsmi_wrapper.amdsmi_status_t) status_string_p_p = ctypes.pointer(ctypes.pointer(ctypes.c_char())) _check_res(amdsmi_wrapper.amdsmi_status_code_to_string( status, status_string_p_p)) return amdsmi_wrapper.string_cast(status_string_p_p.contents) def amdsmi_get_gpu_compute_process_info() -> List[Dict[str, int]]: num_items = ctypes.c_uint32(0) nullptr = POINTER(amdsmi_wrapper.amdsmi_process_info_t)() _check_res( amdsmi_wrapper.amdsmi_get_gpu_compute_process_info( nullptr, ctypes.byref(num_items)) ) procs = (amdsmi_wrapper.amdsmi_process_info_t * num_items.value)() _check_res( amdsmi_wrapper.amdsmi_get_gpu_compute_process_info( procs, ctypes.byref(num_items)) ) return [ { "process_id": proc.process_id, "vram_usage": proc.vram_usage, "sdma_usage": proc.sdma_usage, "cu_occupancy": proc.cu_occupancy, "evicted_time": proc.evicted_time, } for proc in procs ] def amdsmi_get_gpu_compute_process_info_by_pid(pid: int) -> Dict[str, int]: if not isinstance(pid, int): raise AmdSmiParameterException(pid, int) proc = amdsmi_wrapper.amdsmi_process_info_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_compute_process_info_by_pid( ctypes.c_uint32(pid), ctypes.byref(proc) ) ) return { "process_id": proc.process_id, "vram_usage": proc.vram_usage, "sdma_usage": proc.sdma_usage, "cu_occupancy": proc.cu_occupancy, "evicted_time": proc.evicted_time, } def amdsmi_get_gpu_compute_process_gpus(pid: int) -> List[int]: if not isinstance(pid, int): raise AmdSmiParameterException(pid, int) num_devices = ctypes.c_uint32(0) nullptr = POINTER(ctypes.c_uint32)() _check_res( amdsmi_wrapper.amdsmi_get_gpu_compute_process_gpus( pid, nullptr, ctypes.byref(num_devices) ) ) dv_indices = (ctypes.c_uint32 * num_devices.value)() _check_res( amdsmi_wrapper.amdsmi_get_gpu_compute_process_gpus( pid, dv_indices, ctypes.byref(num_devices) ) ) return [dv_index.value for dv_index in dv_indices] def amdsmi_gpu_xgmi_error_status( processor_handle: processor_handle_t, ) -> AmdSmiXgmiStatus: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) status = amdsmi_wrapper.amdsmi_xgmi_status_t() _check_res( amdsmi_wrapper.amdsmi_gpu_xgmi_error_status( processor_handle, ctypes.byref(status)) ) #return AmdSmiXgmiStatus(status.value).value return AmdSmiXgmiStatus(status.value) def amdsmi_reset_gpu_xgmi_error( processor_handle: processor_handle_t, ) -> None: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) _check_res(amdsmi_wrapper.amdsmi_reset_gpu_xgmi_error(processor_handle)) def amdsmi_get_gpu_memory_reserved_pages( processor_handle: processor_handle_t, ) -> Union[list, str]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) num_pages = ctypes.c_uint32() nullptr = POINTER(amdsmi_wrapper.amdsmi_retired_page_record_t)() _check_res( amdsmi_wrapper.amdsmi_get_gpu_memory_reserved_pages( processor_handle, ctypes.byref(num_pages), nullptr ) ) if num_pages.value == 0: return [] mem_reserved_pages = (amdsmi_wrapper.amdsmi_retired_page_record_t * num_pages.value)() _check_res( amdsmi_wrapper.amdsmi_get_gpu_memory_reserved_pages( processor_handle, ctypes.byref(num_pages), mem_reserved_pages ) ) return _format_bad_page_info(mem_reserved_pages, num_pages) def amdsmi_get_gpu_metrics_header_info( processor_handle: processor_handle_t, ) -> Dict[str, int]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) header_info = amdsmi_wrapper.amd_metrics_table_header_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_metrics_header_info( processor_handle, ctypes.byref(header_info) ) ) return { "structure_size": header_info.structure_size, "format_revision": header_info.format_revision, "content_revision": header_info.content_revision } def amdsmi_get_link_topology_nearest( processor_handle: processor_handle_t, link_type: AmdSmiLinkType, )-> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(link_type, AmdSmiLinkType): raise AmdSmiParameterException(link_type, AmdSmiLinkType) topology_nearest_list = amdsmi_wrapper.amdsmi_topology_nearest_t() _check_res( amdsmi_wrapper.amdsmi_get_link_topology_nearest( processor_handle, link_type, ctypes.byref(topology_nearest_list) ) ) device_list = [] for index in range(topology_nearest_list.count): device_list.append(topology_nearest_list.processor_list[index]) return { 'processor_list': device_list } def amdsmi_get_gpu_virtualization_mode( processor_handle: processor_handle_t ) -> Dict[str, AmdSmiVirtualizationMode]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) # make info struct here mode = amdsmi_wrapper.amdsmi_virtualization_mode_t() # call lib function here _check_res( amdsmi_wrapper.amdsmi_get_gpu_virtualization_mode( processor_handle, ctypes.byref(mode) ) ) return { "mode": AmdSmiVirtualizationMode(mode.value) } def amdsmi_get_gpu_ptl_state( processor_handle: processor_handle_t ) -> bool: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException(processor_handle, amdsmi_wrapper.amdsmi_processor_handle) is_ptl_enabled = ctypes.c_bool() _check_res( amdsmi_wrapper.amdsmi_get_gpu_ptl_state( processor_handle, ctypes.byref(is_ptl_enabled) ) ) return is_ptl_enabled.value def amdsmi_set_gpu_ptl_state( processor_handle: processor_handle_t, state: int ) -> None: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) _check_res( amdsmi_wrapper.amdsmi_set_gpu_ptl_state( processor_handle, state ) ) def amdsmi_get_gpu_ptl_formats( processor_handle: processor_handle_t ) -> Tuple[int, int]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException(processor_handle, amdsmi_wrapper.amdsmi_processor_handle) data_format1 = amdsmi_wrapper.amdsmi_ptl_data_format_t() data_format2 = amdsmi_wrapper.amdsmi_ptl_data_format_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_ptl_formats( processor_handle, ctypes.byref(data_format1), ctypes.byref(data_format2) ) ) return int(data_format1.value), int(data_format2.value) def amdsmi_set_gpu_ptl_formats( processor_handle: processor_handle_t, fmt1: AmdSmiPtlData, fmt2: AmdSmiPtlData, ) -> None: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) for fmt in (fmt1, fmt2): if not isinstance(fmt, AmdSmiPtlData): raise AmdSmiParameterException(fmt, AmdSmiPtlData) if fmt is AmdSmiPtlData.INVALID: raise AmdSmiParameterException(fmt, "A valid PTL data format (not INVALID)") c_fmt1 = amdsmi_wrapper.amdsmi_ptl_data_format_t(int(fmt1)) c_fmt2 = amdsmi_wrapper.amdsmi_ptl_data_format_t(int(fmt2)) _check_res( amdsmi_wrapper.amdsmi_set_gpu_ptl_formats( processor_handle, c_fmt1, c_fmt2) ) ### Non C-Lib APIs ### def amdsmi_get_rocm_version()-> Tuple[bool, str]: """ Get the ROCm version for the rocm-core library. This function attempts to retrieve the ROCm version by loading the `librocm-core.so` shared library and calling its `getROCmVersion` function. The version is returned as a string in the format "major.minor.patch". Returns: Tuple[bool, str]: A tuple containing a boolean and a string. - The boolean indicates whether the operation was successful. - The string contains the ROCm version if successful, or an error message if not. Raises: Exception: If there is an error loading the shared library or calling the function. Example: rocm_lib_status, version_message = amdsmi_get_rocm_version() if rocm_lib_status: print(f"ROCm version: {version_message}") else: print(f"Error: {version_message}") """ # librocm-core.so can be located in found using several different methods. # Look for it with below priority: # 1. ROCM_HOME/ROCM_PATH environment variables # - ROCM_HOME/lib # - ROCM_PATH/lib (usually set to /opt/rocm/) # 2. Decided by the linker # - LD_LIBRARY_PATH env var # - defined path in /etc/ld.so.conf.d/ # 3. Relative to amdsmi_wrapper.py in /opt/rocm/share/amd_smi # - parent directory try: possible_locations = list() # 0. Relative to amdsmi_interface.py in TheRock: # `amdsmi_interface.py` is located in # `_rocm_sdk_core/share/amd_smi/amdsmi`, libraries are in # `_rocm_sdk_core/lib`. librocm_core_path = Path(__file__).resolve().parent.parent.parent.parent / "lib/librocm-core.so.1" possible_locations.append(librocm_core_path) # 1. rocm_path = os.getenv("ROCM_HOME", os.getenv("ROCM_PATH")) if rocm_path: possible_locations.append(os.path.join(rocm_path, "lib/librocm-core.so")) # Check if /opt/rocm/lib/librocm-core.so exists and add it to the list if os.path.exists("/opt/rocm/lib/librocm-core.so"): possible_locations.append("/opt/rocm/lib/librocm-core.so") # 2. possible_locations.append("librocm-core.so") # 3. librocm_core_parent_dir = Path(__file__).resolve().parent.parent.parent / "lib" / "librocm-core.so" possible_locations.append(librocm_core_parent_dir) for librocm_core_file_path in possible_locations: try: librocm_core = ctypes.CDLL(librocm_core_file_path) VerErrors = ctypes.c_uint32 get_rocm_core_version = librocm_core.getROCmVersion get_rocm_core_version.restype = VerErrors get_rocm_core_version.argtypes = [POINTER(ctypes.c_uint32), POINTER(ctypes.c_uint32),POINTER(ctypes.c_uint32)] # call the function major = ctypes.c_uint32() minor = ctypes.c_uint32() patch = ctypes.c_uint32() if get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor),ctypes.byref(patch)) == 0: return True, f"{major.value}.{minor.value}.{patch.value}" else: return False, "Failed to unpack ROCm version" except OSError: continue # If we hit here, we were unable to find the librocm-core.so file return False, "Could not find librocm-core.so" except Exception as e: return False, f"Unable to detect ROCm installation, Unknown Error: {e}" def amdsmi_get_cpu_handles() -> Dict[str, Any]: cpu_handles = amdsmi_get_cpusocket_handles() return { 'cpu_count': len(cpu_handles), 'processor_handles': cpu_handles } def amdsmi_get_esmi_err_msg(status: AmdSmiStatus) -> str: if not isinstance(status, AmdSmiStatus): raise AmdSmiParameterException(status, AmdSmiStatus) # Create a pointer to a pointer to char (char**) status_string_p_p = ctypes.pointer(ctypes.pointer(ctypes.c_char())) _check_res( amdsmi_wrapper.amdsmi_get_esmi_err_msg( status.value, status_string_p_p ) ) # Use string_cast helper function if available in wrapper if not status_string_p_p.contents: pass elif hasattr(amdsmi_wrapper, 'string_cast'): error_msg = amdsmi_wrapper.string_cast(status_string_p_p.contents) if isinstance(error_msg, str): return error_msg else: # Manual string extraction error_msg = ctypes.string_at(status_string_p_p.contents).decode('utf-8') return error_msg return "Unknown error" def amdsmi_get_gpu_event_notification( timeout_ms: int = 1000 ) -> Dict[str, Any]: if not isinstance(timeout_ms, int): raise AmdSmiParameterException(timeout_ms, int) # Convert timeout to C type timeout_ms_c = ctypes.c_int32(timeout_ms) # Initialize output parameters num_elem = ctypes.c_uint32(MAX_NUM_PROCESSES) num_elem_p = ctypes.pointer(num_elem) # Create array for event notification data data_array = (amdsmi_wrapper.amdsmi_evt_notification_data_t * MAX_NUM_PROCESSES)() _check_res( amdsmi_wrapper.amdsmi_get_gpu_event_notification( timeout_ms_c, num_elem_p, data_array ) ) results = [] for i in range(num_elem_p.contents.value): entry = { 'processor_handle': data_array[i].processor_handle, 'event': data_array[i].event, 'message': data_array[i].message.decode('utf-8') if data_array[i].message else "" } results.append(entry) result = { 'num_elem': num_elem_p.contents.value, 'data': results } return result def amdsmi_get_gpu_revision(processor_handle: processor_handle_t) -> str: """ Get the GPU revision for a given processor handle. Parameters: processor_handle (amdsmi_processor_handle): The processor handle for the GPU. Returns: str: The GPU revision as a string. Raises: AmdSmiParameterException: If the processor handle is invalid. AmdSmiLibraryException: If the underlying library call fails. """ if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) revision_16 = ctypes.c_uint16() _check_res(amdsmi_wrapper.amdsmi_get_gpu_revision(processor_handle, ctypes.byref(revision_16))) return _pad_hex_value(hex(revision_16.value), 2) def amdsmi_get_processor_count_from_handles(processor_handles_list) -> Dict[str, int]: if not isinstance(processor_handles_list, list): raise AmdSmiParameterException(processor_handles_list, list) # Convert Python list to C array processor_count = len(processor_handles_list) processor_handles_array = (amdsmi_wrapper.amdsmi_processor_handle * processor_count)() for i, handle in enumerate(processor_handles_list): processor_handles_array[i] = handle processor_count_p = ctypes.pointer(ctypes.c_uint32(processor_count)) nr_cpusockets = ctypes.pointer(ctypes.c_uint32(0)) nr_cpucores = ctypes.pointer(ctypes.c_uint32(0)) nr_gpus = ctypes.pointer(ctypes.c_uint32(0)) _check_res( amdsmi_wrapper.amdsmi_get_processor_count_from_handles( processor_handles_array, processor_count_p, nr_cpusockets, nr_cpucores, nr_gpus ) ) return { 'nr_cpusockets': nr_cpusockets.contents.value, 'nr_cpucores': nr_cpucores.contents.value, 'nr_gpus': nr_gpus.contents.value } def amdsmi_get_processor_handles_by_type(socket_handle: socket_handle_t, processor_type: AmdSmiProcessorType): if not isinstance(socket_handle, amdsmi_wrapper.amdsmi_socket_handle): raise AmdSmiParameterException(socket_handle, amdsmi_wrapper.amdsmi_socket_handle) if not isinstance(processor_type, AmdSmiProcessorType): raise AmdSmiParameterException(processor_type, AmdSmiProcessorType) processor_handles = (amdsmi_wrapper.amdsmi_processor_handle * MAX_NUM_PROCESSES)() processor_count = ctypes.c_uint32(0) ptr_processor_count = ctypes.pointer(processor_count) _check_res( amdsmi_wrapper.amdsmi_get_processor_handles_by_type( socket_handle, processor_type, processor_handles, ptr_processor_count ) ) entry = [] for i in range(ptr_processor_count.contents.value): entry.append(processor_handles[i]) return { 'processor_handles': entry, 'processor_count': ptr_processor_count.contents.value } def amdsmi_gpu_validate_ras_eeprom(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) _check_res(amdsmi_wrapper.amdsmi_gpu_validate_ras_eeprom(processor_handle)) def amdsmi_init_gpu_event_notification(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) _check_res(amdsmi_wrapper.amdsmi_init_gpu_event_notification(processor_handle)) def amdsmi_set_gpu_event_notification_mask(processor_handle: processor_handle_t, mask: int): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) if not isinstance(mask, int): raise AmdSmiParameterException(mask, int) mask_64 = ctypes.c_uint64(mask) _check_res(amdsmi_wrapper.amdsmi_set_gpu_event_notification_mask(processor_handle, mask_64)) def amdsmi_stop_gpu_event_notification( processor_handle: processor_handle_t ): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) _check_res(amdsmi_wrapper.amdsmi_stop_gpu_event_notification(processor_handle)) def amdsmi_get_gpu_busy_percent(processor_handle: processor_handle_t): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException(processor_handle, amdsmi_wrapper.amdsmi_processor_handle) gpu_busy_percent = ctypes.c_uint32(0) _check_res(amdsmi_wrapper.amdsmi_get_gpu_busy_percent(processor_handle, ctypes.byref(gpu_busy_percent))) return gpu_busy_percent.value