Files
rocm-systems/rocm_smi/src/rocm_smi_device.cc
T
Mario Limonciello (AMD) 924a06d1e1 Remove unnecessary includes
Signed-off-by: Mario Limonciello (AMD) <superm1@kernel.org>
2025-09-05 17:44:17 -05:00

1997 خطوط
73 KiB
C++
Executable File

/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <pthread.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <sstream>
#include <string>
#include <vector>
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_device.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_exception.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_logger.h"
#include "shared_mutex.h" // NOLINT
namespace amd {
namespace smi {
// ---------------------------------------------------------------------------
// File-name constants.
//
// Each constant is the name of a kernel-exported file, relative to a base
// directory chosen by the code that opens it:
//   * debugfs names are appended to kPathDebugRootFName + <index> + "/"
//     (see Device::openDebugFileStream),
//   * device sysfs names are appended to <device path> + "/device/"
//     (see Device::get_sys_file_path_by_type / Device::openSysfsFileStream),
//   * PCI sysfs names are appended to "/sys/bus/pci/devices/<bdf>/"
//     (see Device::openSysfsFileStream).
//
// The pointers are declared `const` as well as the characters they point at,
// so none of these globals can be re-seated by accident.
// ---------------------------------------------------------------------------

// Debug root file path
static const char *const kPathDebugRootFName = "/sys/kernel/debug/dri/";

// Device debugfs file names
static const char *const kDevGpuResetFName = "amdgpu_gpu_recover";

// PCI sysfs file names
static const char *const kDevPCieVendorIDFName = "vendor";

// Device sysfs file names
static const char *const kDevPerfLevelFName =
    "power_dpm_force_performance_level";
static const char *const kDevSocPstateFName = "pm_policy/soc_pstate";
static const char *const kDevXgmiPlpdFName = "pm_policy/xgmi_plpd";
static const char *const kDevProcessIsolationFName = "enforce_isolation";
static const char *const kDevShaderCleanFName = "run_cleaner_shader";
static const char *const kDevDevProdNameFName = "product_name";
static const char *const kDevDevProdNumFName = "product_number";
static const char *const kDevDevIDFName = "device";
static const char *const kDevXGMIPhysicalIDFName = "xgmi_physical_id";
static const char *const kDevXGMIPortNumFName = "xgmi_port_num";
static const char *const kDevDevRevIDFName = "revision";
static const char *const kDevVendorIDFName = "vendor";
static const char *const kDevBoardInfoFName = "board_info";
static const char *const kDevSubSysDevIDFName = "subsystem_device";
static const char *const kDevSubSysVendorIDFName = "subsystem_vendor";
static const char *const kDevOverDriveLevelFName = "pp_sclk_od";
static const char *const kDevMemOverDriveLevelFName = "pp_mclk_od";
static const char *const kDevGPUSClkFName = "pp_dpm_sclk";
static const char *const kDevGPUMClkFName = "pp_dpm_mclk";
static const char *const kDevDCEFClkFName = "pp_dpm_dcefclk";
static const char *const kDevFClkFName = "pp_dpm_fclk";
static const char *const kDevSOCClkFName = "pp_dpm_socclk";
static const char *const kDevPCIEClkFName = "pp_dpm_pcie";
static const char *const kDevPowerProfileModeFName = "pp_power_profile_mode";
static const char *const kDevPowerODVoltageFName = "pp_od_clk_voltage";
static const char *const kDevUsageFName = "gpu_busy_percent";
static const char *const kDevVBiosVerFName = "vbios_version";
static const char *const kDevPCIEThruPutFName = "pcie_bw";

// RAS (files under the "ras/" sub-directory)
static const char *const kDevErrCntSDMAFName = "ras/sdma_err_count";
static const char *const kDevErrCntUMCFName = "ras/umc_err_count";
static const char *const kDevErrCntGFXFName = "ras/gfx_err_count";
static const char *const kDevErrCntMMHUBFName = "ras/mmhub_err_count";
static const char *const kDevErrCntPCIEBIFFName = "ras/pcie_bif_err_count";
static const char *const kDevErrCntHDPFName = "ras/hdp_err_count";
static const char *const kDevErrCntXGMIWAFLFName = "ras/xgmi_wafl_err_count";
static const char *const kDevErrCntFeaturesFName = "ras/features";
static const char *const kDevErrRASSchemaFName = "ras/schema";
static const char *const kDevErrTableVersionFName = "ras/version";
static const char *const kDevMemPageBadFName = "ras/gpu_vram_bad_pages";

// Memory information
static const char *const kDevMemTotGTTFName = "mem_info_gtt_total";
static const char *const kDevMemTotVisVRAMFName = "mem_info_vis_vram_total";
static const char *const kDevMemTotVRAMFName = "mem_info_vram_total";
static const char *const kDevMemUsedGTTFName = "mem_info_gtt_used";
static const char *const kDevMemUsedVisVRAMFName = "mem_info_vis_vram_used";
static const char *const kDevMemUsedVRAMFName = "mem_info_vram_used";
static const char *const kDevVramVendorFName = "mem_info_vram_vendor";

static const char *const kDevPCIEReplayCountFName = "pcie_replay_count";
static const char *const kDevUniqueIdFName = "unique_id";
static const char *const kDevDFCountersAvailableFName = "df_cntr_avail";
static const char *const kDevMemBusyPercentFName = "mem_busy_percent";
static const char *const kDevXGMIErrorFName = "xgmi_error";
static const char *const kDevSerialNumberFName = "serial_number";
static const char *const kDevNumaNodeFName = "numa_node";

// Metrics tables
static const char *const kDevGpuMetricsFName = "gpu_metrics";
static const char *const kDevPmMetricsFName = "pm_metrics";  // PM log
static const char *const kDevRegMetricsFName = "reg_state";  // register table
static const char *const kDevBaseBoardTempMetricsFName = "board/baseboard_temp";
static const char *const kDevGpuBoardTempMetricsFName = "board/gpuboard_temp";

// Compute / memory partitioning
static const char *const kDevAvailableComputePartitionFName =
    "available_compute_partition";
static const char *const kDevComputePartitionFName =
    "current_compute_partition";
static const char *const kDevMemoryPartitionFName = "current_memory_partition";
static const char *const kDevAvailableMemoryPartitionFName =
    "available_memory_partition";
static const char *const kDevSupportedXcpConfigsFName =
    "compute_partition_config/supported_xcp_configs";
static const char *const kDevSupportedNpsConfigsFName =
    "compute_partition_config/supported_nps_configs";
static const char *const kDevXcpConfigFName =
    "compute_partition_config/xcp_config";

// XCP config resource files - not every file will exist in all ASICs
// (ex. Decoders vs Encoders)
static const char *const kDevDecoderInstFName =
    "compute_partition_config/dec/num_inst";
static const char *const kDevDecoderSharedFName =
    "compute_partition_config/dec/num_shared";
static const char *const kDevEncoderInstFName =
    "compute_partition_config/enc/num_inst";
static const char *const kDevEncoderSharedFName =
    "compute_partition_config/enc/num_shared";
static const char *const kDevDmaInstFName =
    "compute_partition_config/dma/num_inst";
static const char *const kDevDmaSharedFName =
    "compute_partition_config/dma/num_shared";
static const char *const kDevJpegInstFName =
    "compute_partition_config/jpeg/num_inst";
static const char *const kDevJpegSharedFName =
    "compute_partition_config/jpeg/num_shared";
static const char *const kDevXccInstFName =
    "compute_partition_config/xcc/num_inst";
static const char *const kDevXccSharedFName =
    "compute_partition_config/xcc/num_shared";
// Firmware version files. Every name lives under the "fw_version/"
// sub-directory. The pointers are const so the globals cannot be re-seated.
static const char *const kDevFwVersionAsdFName = "fw_version/asd_fw_version";
static const char *const kDevFwVersionCeFName = "fw_version/ce_fw_version";
static const char *const kDevFwVersionDmcuFName = "fw_version/dmcu_fw_version";
static const char *const kDevFwVersionMcFName = "fw_version/mc_fw_version";
static const char *const kDevFwVersionMeFName = "fw_version/me_fw_version";
static const char *const kDevFwVersionMecFName = "fw_version/mec_fw_version";
static const char *const kDevFwVersionMec2FName = "fw_version/mec2_fw_version";
static const char *const kDevFwVersionMesFName = "fw_version/mes_fw_version";
static const char *const kDevFwVersionMesKiqFName =
    "fw_version/mes_kiq_fw_version";
static const char *const kDevFwVersionPfpFName = "fw_version/pfp_fw_version";
static const char *const kDevFwVersionRlcFName = "fw_version/rlc_fw_version";
static const char *const kDevFwVersionRlcSrlcFName =
    "fw_version/rlc_srlc_fw_version";
static const char *const kDevFwVersionRlcSrlgFName =
    "fw_version/rlc_srlg_fw_version";
static const char *const kDevFwVersionRlcSrlsFName =
    "fw_version/rlc_srls_fw_version";
static const char *const kDevFwVersionSdmaFName = "fw_version/sdma_fw_version";
static const char *const kDevFwVersionSdma2FName =
    "fw_version/sdma2_fw_version";
static const char *const kDevFwVersionSmcFName = "fw_version/smc_fw_version";
static const char *const kDevFwVersionSosFName = "fw_version/sos_fw_version";
static const char *const kDevFwVersionTaRasFName =
    "fw_version/ta_ras_fw_version";
static const char *const kDevFwVersionTaXgmiFName =
    "fw_version/ta_xgmi_fw_version";
static const char *const kDevFwVersionUvdFName = "fw_version/uvd_fw_version";
static const char *const kDevFwVersionVceFName = "fw_version/vce_fw_version";
static const char *const kDevFwVersionVcnFName = "fw_version/vcn_fw_version";
static const char *const kDevFwVersionPldmBundleFName =
    "fw_version/pldm_fw_version";
// KFD node property names. These are the strings that kDevKFDPropNameMap
// associates with the DevKFDNodePropTypes enumerators. The pointers are const
// so the globals cannot be re-seated.
static const char *const kDevKFDNodePropCachesCntSName = "caches_count";
static const char *const kDevKFDNodePropIoLinksCntSName = "io_links_count";
static const char *const kDevKFDNodePropCPUCoreIdBaseSName =
    "cpu_core_id_base";
static const char *const kDevKFDNodePropSimdIdBaseSName = "simd_id_base";
static const char *const kDevKFDNodePropMaxWavePerSimdSName =
    "max_waves_per_simd";
static const char *const kDevKFDNodePropLdsSzSName = "lds_size_in_kb";
static const char *const kDevKFDNodePropGdsSzSName = "gds_size_in_kb";
static const char *const kDevKFDNodePropNumGWSSName = "num_gws";
static const char *const kDevKFDNodePropWaveFrontSizeSName = "wave_front_size";
static const char *const kDevKFDNodePropArrCntSName = "array_count";
static const char *const kDevKFDNodePropSimdArrPerEngSName =
    "simd_arrays_per_engine";
static const char *const kDevKFDNodePropCuPerSimdArrSName =
    "cu_per_simd_array";
static const char *const kDevKFDNodePropSimdPerCUSName = "simd_per_cu";
static const char *const kDevKFDNodePropMaxSlotsScratchCuSName =
    "max_slots_scratch_cu";
static const char *const kDevKFDNodePropVendorIdSName = "vendor_id";
static const char *const kDevKFDNodePropDeviceIdSName = "device_id";
static const char *const kDevKFDNodePropLocationIdSName = "location_id";
static const char *const kDevKFDNodePropDrmRenderMinorSName =
    "drm_render_minor";
static const char *const kDevKFDNodePropHiveIdSName = "hive_id";
static const char *const kDevKFDNodePropNumSdmaEnginesSName =
    "num_sdma_engines";
static const char *const kDevKFDNodePropNumSdmaXgmiEngsSName =
    "num_sdma_xgmi_engines";
static const char *const kDevKFDNodePropMaxEngClkFCompSName =
    "max_engine_clk_fcompute";
static const char *const kDevKFDNodePropLocMemSzSName = "local_mem_size";
static const char *const kDevKFDNodePropFwVerSName = "fw_version";
static const char *const kDevKFDNodePropCapabilitySName = "capability";
static const char *const kDevKFDNodePropDbgPropSName = "debug_prop";
static const char *const kDevKFDNodePropSdmaFwVerSName = "sdma_fw_version";
static const char *const kDevKFDNodePropMaxEngClkCCompSName =
    "max_engine_clk_ccompute";
static const char *const kDevKFDNodePropDomainSName = "domain";
// Lookup table: KFD node property enumerator -> property name string.
// One entry per DevKFDNodePropTypes value listed below; the mapped values are
// the k...SName constants defined earlier in this file.
static const std::map<DevKFDNodePropTypes, const char *> kDevKFDPropNameMap = {
    {kDevKFDNodePropCachesCnt, kDevKFDNodePropCachesCntSName},
    {kDevKFDNodePropIoLinksCnt, kDevKFDNodePropIoLinksCntSName},
    {kDevKFDNodePropCPUCoreIdBase, kDevKFDNodePropCPUCoreIdBaseSName},
    {kDevKFDNodePropSimdIdBase, kDevKFDNodePropSimdIdBaseSName},
    {kDevKFDNodePropMaxWavePerSimd, kDevKFDNodePropMaxWavePerSimdSName},
    {kDevKFDNodePropLdsSz, kDevKFDNodePropLdsSzSName},
    {kDevKFDNodePropGdsSz, kDevKFDNodePropGdsSzSName},
    {kDevKFDNodePropNumGWS, kDevKFDNodePropNumGWSSName},
    {kDevKFDNodePropWaveFrontSize, kDevKFDNodePropWaveFrontSizeSName},
    {kDevKFDNodePropArrCnt, kDevKFDNodePropArrCntSName},
    {kDevKFDNodePropSimdArrPerEng, kDevKFDNodePropSimdArrPerEngSName},
    {kDevKFDNodePropCuPerSimdArr, kDevKFDNodePropCuPerSimdArrSName},
    {kDevKFDNodePropSimdPerCU, kDevKFDNodePropSimdPerCUSName},
    {kDevKFDNodePropMaxSlotsScratchCu, kDevKFDNodePropMaxSlotsScratchCuSName},
    {kDevKFDNodePropVendorId, kDevKFDNodePropVendorIdSName},
    {kDevKFDNodePropDeviceId, kDevKFDNodePropDeviceIdSName},
    {kDevKFDNodePropLocationId, kDevKFDNodePropLocationIdSName},
    {kDevKFDNodePropDrmRenderMinor, kDevKFDNodePropDrmRenderMinorSName},
    {kDevKFDNodePropHiveId, kDevKFDNodePropHiveIdSName},
    {kDevKFDNodePropNumSdmaEngines, kDevKFDNodePropNumSdmaEnginesSName},
    {kDevKFDNodePropNumSdmaXgmiEngs, kDevKFDNodePropNumSdmaXgmiEngsSName},
    {kDevKFDNodePropMaxEngClkFComp, kDevKFDNodePropMaxEngClkFCompSName},
    {kDevKFDNodePropLocMemSz, kDevKFDNodePropLocMemSzSName},
    {kDevKFDNodePropFwVer, kDevKFDNodePropFwVerSName},
    {kDevKFDNodePropCapability, kDevKFDNodePropCapabilitySName},
    {kDevKFDNodePropDbgProp, kDevKFDNodePropDbgPropSName},
    {kDevKFDNodePropSdmaFwVer, kDevKFDNodePropSdmaFwVerSName},
    {kDevKFDNodePropMaxEngClkCComp, kDevKFDNodePropMaxEngClkCCompSName},
    {kDevKFDNodePropDomain, kDevKFDNodePropDomainSName},
};
// Strings that are found within sysfs files.
// These are the textual performance-level values that kDevPerfLvlMap pairs
// with the rsmi_dev_perf_level enumerators. The pointers are const so the
// globals cannot be re-seated.
static const char *const kDevPerfLevelAutoStr = "auto";
static const char *const kDevPerfLevelLowStr = "low";
static const char *const kDevPerfLevelHighStr = "high";
static const char *const kDevPerfLevelManualStr = "manual";
static const char *const kDevPerfLevelStandardStr = "profile_standard";
static const char *const kDevPerfLevelMinMClkStr = "profile_min_mclk";
static const char *const kDevPerfLevelMinSClkStr = "profile_min_sclk";
static const char *const kDevPerfLevelPeakStr = "profile_peak";
static const char *const kDevPerfLevelDeterminismStr = "perf_determinism";
static const char *const kDevPerfLevelUnknownStr = "unknown";
// Lookup table: DevInfoTypes enumerator -> file name of the attribute.
// The file-opening helpers in this file (openDebugFileStream,
// get_sys_file_path_by_type, openSysfsFileStream) call .at(type) on this map,
// so a type with no entry here makes those helpers throw std::out_of_range.
static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
    {kDevPerfLevel, kDevPerfLevelFName},
    {kDevOverDriveLevel, kDevOverDriveLevelFName},
    {kDevMemOverDriveLevel, kDevMemOverDriveLevelFName},
    {kDevBoardInfo, kDevBoardInfoFName},
    {kDevDevProdName, kDevDevProdNameFName},
    {kDevDevProdNum, kDevDevProdNumFName},
    {kDevDevID, kDevDevIDFName},
    {kDevXGMIPhysicalID, kDevXGMIPhysicalIDFName},
    {kDevXGMIPortNum, kDevXGMIPortNumFName},
    {kDevDevRevID, kDevDevRevIDFName},
    {kDevVendorID, kDevVendorIDFName},
    {kDevPCieVendorID, kDevPCieVendorIDFName},
    {kDevSubSysDevID, kDevSubSysDevIDFName},
    {kDevSubSysVendorID, kDevSubSysVendorIDFName},
    {kDevGPUMClk, kDevGPUMClkFName},
    {kDevGPUSClk, kDevGPUSClkFName},
    {kDevDCEFClk, kDevDCEFClkFName},
    {kDevFClk, kDevFClkFName},
    {kDevSOCClk, kDevSOCClkFName},
    {kDevPCIEClk, kDevPCIEClkFName},
    {kDevPowerProfileMode, kDevPowerProfileModeFName},
    {kDevUsage, kDevUsageFName},
    {kDevPowerODVoltage, kDevPowerODVoltageFName},
    {kDevVBiosVer, kDevVBiosVerFName},
    {kDevPCIEThruPut, kDevPCIEThruPutFName},
    {kDevErrCntSDMA, kDevErrCntSDMAFName},
    {kDevErrCntUMC, kDevErrCntUMCFName},
    {kDevErrCntGFX, kDevErrCntGFXFName},
    {kDevErrCntMMHUB, kDevErrCntMMHUBFName},
    {kDevErrCntPCIEBIF, kDevErrCntPCIEBIFFName},
    {kDevErrCntHDP, kDevErrCntHDPFName},
    {kDevErrCntXGMIWAFL, kDevErrCntXGMIWAFLFName},
    {kDevErrCntFeatures, kDevErrCntFeaturesFName},
    {kDevErrTableVersion, kDevErrTableVersionFName},
    {kDevErrRASSchema, kDevErrRASSchemaFName},
    {kDevMemTotGTT, kDevMemTotGTTFName},
    {kDevMemTotVisVRAM, kDevMemTotVisVRAMFName},
    {kDevMemBusyPercent, kDevMemBusyPercentFName},
    {kDevMemTotVRAM, kDevMemTotVRAMFName},
    {kDevMemUsedGTT, kDevMemUsedGTTFName},
    {kDevMemUsedVisVRAM, kDevMemUsedVisVRAMFName},
    {kDevMemUsedVRAM, kDevMemUsedVRAMFName},
    {kDevVramVendor, kDevVramVendorFName},
    {kDevPCIEReplayCount, kDevPCIEReplayCountFName},
    {kDevUniqueId, kDevUniqueIdFName},
    {kDevDFCountersAvailable, kDevDFCountersAvailableFName},
    {kDevXGMIError, kDevXGMIErrorFName},
    {kDevFwVersionAsd, kDevFwVersionAsdFName},
    {kDevFwVersionCe, kDevFwVersionCeFName},
    {kDevFwVersionDmcu, kDevFwVersionDmcuFName},
    {kDevFwVersionMc, kDevFwVersionMcFName},
    {kDevFwVersionMe, kDevFwVersionMeFName},
    {kDevFwVersionMec, kDevFwVersionMecFName},
    {kDevFwVersionMec2, kDevFwVersionMec2FName},
    {kDevFwVersionMes, kDevFwVersionMesFName},
    {kDevFwVersionMesKiq, kDevFwVersionMesKiqFName},
    {kDevFwVersionPfp, kDevFwVersionPfpFName},
    {kDevFwVersionRlc, kDevFwVersionRlcFName},
    {kDevFwVersionRlcSrlc, kDevFwVersionRlcSrlcFName},
    {kDevFwVersionRlcSrlg, kDevFwVersionRlcSrlgFName},
    {kDevFwVersionRlcSrls, kDevFwVersionRlcSrlsFName},
    {kDevFwVersionSdma, kDevFwVersionSdmaFName},
    {kDevFwVersionSdma2, kDevFwVersionSdma2FName},
    {kDevFwVersionSmc, kDevFwVersionSmcFName},
    {kDevFwVersionSos, kDevFwVersionSosFName},
    {kDevFwVersionTaRas, kDevFwVersionTaRasFName},
    {kDevFwVersionTaXgmi, kDevFwVersionTaXgmiFName},
    {kDevFwVersionUvd, kDevFwVersionUvdFName},
    {kDevFwVersionVce, kDevFwVersionVceFName},
    {kDevFwVersionVcn, kDevFwVersionVcnFName},
    {kDevFwVersionPldmBundle, kDevFwVersionPldmBundleFName},
    {kDevSerialNumber, kDevSerialNumberFName},
    {kDevMemPageBad, kDevMemPageBadFName},
    {kDevNumaNode, kDevNumaNodeFName},
    {kDevGpuMetrics, kDevGpuMetricsFName},
    {kDevPmMetrics, kDevPmMetricsFName},
    {kDevSocPstate, kDevSocPstateFName},
    {kDevXgmiPlpd, kDevXgmiPlpdFName},
    {kDevProcessIsolation, kDevProcessIsolationFName},
    {kDevShaderClean, kDevShaderCleanFName},
    {kDevRegMetrics, kDevRegMetricsFName},
    {kDevBaseBoardTempMetrics, kDevBaseBoardTempMetricsFName},
    {kDevGpuBoardTempMetrics, kDevGpuBoardTempMetricsFName},
    {kDevGpuReset, kDevGpuResetFName},
    {kDevAvailableComputePartition, kDevAvailableComputePartitionFName},
    {kDevComputePartition, kDevComputePartitionFName},
    {kDevMemoryPartition, kDevMemoryPartitionFName},
    {kDevAvailableMemoryPartition, kDevAvailableMemoryPartitionFName},
    {kDevSupportedXcpConfigs, kDevSupportedXcpConfigsFName},
    {kDevSupportedNpsConfigs, kDevSupportedNpsConfigsFName},
    {kDevXcpConfig, kDevXcpConfigFName},
    // XCP config resource files
    {kDevDecoderInst, kDevDecoderInstFName},
    {kDevDecoderShared, kDevDecoderSharedFName},
    {kDevEncoderInst, kDevEncoderInstFName},
    {kDevEncoderShared, kDevEncoderSharedFName},
    {kDevDmaInst, kDevDmaInstFName},
    {kDevDmaShared, kDevDmaSharedFName},
    {kDevJpegInst, kDevJpegInstFName},
    {kDevJpegShared, kDevJpegSharedFName},
    {kDevXccInst, kDevXccInstFName},
    {kDevXccShared, kDevXccSharedFName},
};
// Lookup table: rsmi_dev_perf_level enumerator -> the string used for that
// level in the sysfs file (see the kDevPerfLevel*Str constants).
static const std::map<rsmi_dev_perf_level, const char *> kDevPerfLvlMap = {
    {RSMI_DEV_PERF_LEVEL_AUTO, kDevPerfLevelAutoStr},
    {RSMI_DEV_PERF_LEVEL_LOW, kDevPerfLevelLowStr},
    {RSMI_DEV_PERF_LEVEL_HIGH, kDevPerfLevelHighStr},
    {RSMI_DEV_PERF_LEVEL_MANUAL, kDevPerfLevelManualStr},
    {RSMI_DEV_PERF_LEVEL_STABLE_STD, kDevPerfLevelStandardStr},
    {RSMI_DEV_PERF_LEVEL_STABLE_MIN_MCLK, kDevPerfLevelMinMClkStr},
    {RSMI_DEV_PERF_LEVEL_STABLE_MIN_SCLK, kDevPerfLevelMinSClkStr},
    {RSMI_DEV_PERF_LEVEL_STABLE_PEAK, kDevPerfLevelPeakStr},
    {RSMI_DEV_PERF_LEVEL_DETERMINISM, kDevPerfLevelDeterminismStr},
    {RSMI_DEV_PERF_LEVEL_UNKNOWN, kDevPerfLevelUnknownStr},
};
// Lookup table: DevInfoTypes enumerator -> the value of the matching
// enumerator from one of the public RSMI enums, stored as uint8_t.
// Entries are grouped by the RSMI enum their value is taken from.
static const std::map<DevInfoTypes, uint8_t> kDevInfoVarTypeToRSMIVariant = {
    // rsmi_memory_type_t
    {kDevMemTotGTT, RSMI_MEM_TYPE_GTT},
    {kDevMemTotVisVRAM, RSMI_MEM_TYPE_VIS_VRAM},
    {kDevMemTotVRAM, RSMI_MEM_TYPE_VRAM},
    {kDevMemUsedGTT, RSMI_MEM_TYPE_GTT},
    {kDevMemUsedVisVRAM, RSMI_MEM_TYPE_VIS_VRAM},
    {kDevMemUsedVRAM, RSMI_MEM_TYPE_VRAM},

    // rsmi_clk_type_t
    {kDevGPUSClk, RSMI_CLK_TYPE_SYS},
    {kDevGPUMClk, RSMI_CLK_TYPE_MEM},
    {kDevFClk, RSMI_CLK_TYPE_DF},
    {kDevDCEFClk, RSMI_CLK_TYPE_DCEF},
    {kDevSOCClk, RSMI_CLK_TYPE_SOC},
    {kDevPCIEClk, RSMI_CLK_TYPE_PCIE},

    // rsmi_fw_block_t
    {kDevFwVersionAsd, RSMI_FW_BLOCK_ASD},
    {kDevFwVersionCe, RSMI_FW_BLOCK_CE},
    {kDevFwVersionDmcu, RSMI_FW_BLOCK_DMCU},
    {kDevFwVersionMc, RSMI_FW_BLOCK_MC},
    {kDevFwVersionMe, RSMI_FW_BLOCK_ME},
    {kDevFwVersionMec, RSMI_FW_BLOCK_MEC},
    {kDevFwVersionMec2, RSMI_FW_BLOCK_MEC2},
    {kDevFwVersionMes, RSMI_FW_BLOCK_MES},
    {kDevFwVersionMesKiq, RSMI_FW_BLOCK_MES_KIQ},
    {kDevFwVersionPfp, RSMI_FW_BLOCK_PFP},
    {kDevFwVersionRlc, RSMI_FW_BLOCK_RLC},
    {kDevFwVersionRlcSrlc, RSMI_FW_BLOCK_RLC_SRLC},
    {kDevFwVersionRlcSrlg, RSMI_FW_BLOCK_RLC_SRLG},
    {kDevFwVersionRlcSrls, RSMI_FW_BLOCK_RLC_SRLS},
    {kDevFwVersionSdma, RSMI_FW_BLOCK_SDMA},
    {kDevFwVersionSdma2, RSMI_FW_BLOCK_SDMA2},
    {kDevFwVersionSmc, RSMI_FW_BLOCK_SMC},
    {kDevFwVersionSos, RSMI_FW_BLOCK_SOS},
    {kDevFwVersionTaRas, RSMI_FW_BLOCK_TA_RAS},
    {kDevFwVersionTaXgmi, RSMI_FW_BLOCK_TA_XGMI},
    {kDevFwVersionUvd, RSMI_FW_BLOCK_UVD},
    {kDevFwVersionVce, RSMI_FW_BLOCK_VCE},
    {kDevFwVersionVcn, RSMI_FW_BLOCK_VCN},
    {kDevFwVersionPldmBundle, RSMI_FW_BLOCK_PLDM_BUNDLE},

    // rsmi_gpu_block_t
    {kDevErrCntUMC, RSMI_GPU_BLOCK_UMC},
    {kDevErrCntSDMA, RSMI_GPU_BLOCK_SDMA},
    {kDevErrCntGFX, RSMI_GPU_BLOCK_GFX},
    {kDevErrCntMMHUB, RSMI_GPU_BLOCK_MMHUB},
    {kDevErrCntPCIEBIF, RSMI_GPU_BLOCK_PCIE_BIF},
    {kDevErrCntHDP, RSMI_GPU_BLOCK_HDP},
    {kDevErrCntXGMIWAFL, RSMI_GPU_BLOCK_XGMI_WAFL},

    // rsmi_event_group_t
    {kDevDFCountersAvailable, RSMI_EVNT_GRP_XGMI}
};
// Definition of the static member Device::devInfoTypesStrings.
// Maps each DevInfoTypes enumerator to a string spelling out the enumerator's
// own identifier.
const std::map<DevInfoTypes, const char*> Device::devInfoTypesStrings = {
    {kDevPerfLevel, "kDevPerfLevel"},
    {kDevOverDriveLevel, "kDevOverDriveLevel"},
    {kDevMemOverDriveLevel, "kDevMemOverDriveLevel"},
    {kDevDevID, "kDevDevID"},
    {kDevXGMIPhysicalID, "kDevXGMIPhysicalID"},
    {kDevXGMIPortNum, "kDevXGMIPortNum"},
    {kDevDevRevID, "kDevDevRevID"},
    {kDevDevProdName, "kDevDevProdName"},
    {kDevBoardInfo, "kDevBoardInfo"},
    {kDevDevProdNum, "kDevDevProdNum"},
    {kDevVendorID, "kDevVendorID"},
    {kDevSubSysDevID, "kDevSubSysDevID"},
    {kDevSubSysVendorID, "kDevSubSysVendorID"},
    {kDevGPUMClk, "kDevGPUMClk"},
    {kDevGPUSClk, "kDevGPUSClk"},
    {kDevDCEFClk, "kDevDCEFClk"},
    {kDevFClk, "kDevFClk"},
    {kDevSOCClk, "kDevSOCClk"},
    {kDevPCIEClk, "kDevPCIEClk"},
    {kDevPowerProfileMode, "kDevPowerProfileMode"},
    {kDevUsage, "kDevUsage"},
    {kDevPowerODVoltage, "kDevPowerODVoltage"},
    {kDevVBiosVer, "kDevVBiosVer"},
    {kDevPCIEThruPut, "kDevPCIEThruPut"},
    {kDevErrCntSDMA, "kDevErrCntSDMA"},
    {kDevErrCntUMC, "kDevErrCntUMC"},
    {kDevErrCntGFX, "kDevErrCntGFX"},
    {kDevErrCntMMHUB, "kDevErrCntMMHUB"},
    {kDevErrCntPCIEBIF, "kDevErrCntPCIEBIF"},
    {kDevErrCntHDP, "kDevErrCntHDP"},
    {kDevErrCntXGMIWAFL, "kDevErrCntXGMIWAFL"},
    {kDevErrCntFeatures, "kDevErrCntFeatures"},
    {kDevErrRASSchema, "kDevErrRASSchema"},
    {kDevErrTableVersion, "kDevErrTableVersion"},
    {kDevMemTotGTT, "kDevMemTotGTT"},
    {kDevMemTotVisVRAM, "kDevMemTotVisVRAM"},
    {kDevMemTotVRAM, "kDevMemTotVRAM"},
    {kDevMemUsedGTT, "kDevMemUsedGTT"},
    {kDevMemUsedVisVRAM, "kDevMemUsedVisVRAM"},
    {kDevMemUsedVRAM, "kDevMemUsedVRAM"},
    {kDevVramVendor, "kDevVramVendor"},
    {kDevPCIEReplayCount, "kDevPCIEReplayCount"},
    {kDevUniqueId, "kDevUniqueId"},
    {kDevDFCountersAvailable, "kDevDFCountersAvailable"},
    {kDevMemBusyPercent, "kDevMemBusyPercent"},
    {kDevXGMIError, "kDevXGMIError"},
    {kDevFwVersionAsd, "kDevFwVersionAsd"},
    {kDevFwVersionCe, "kDevFwVersionCe"},
    {kDevFwVersionDmcu, "kDevFwVersionDmcu"},
    {kDevFwVersionMc, "kDevFwVersionMc"},
    {kDevFwVersionMe, "kDevFwVersionMe"},
    {kDevFwVersionMec, "kDevFwVersionMec"},
    {kDevFwVersionMec2, "kDevFwVersionMec2"},
    {kDevFwVersionMes, "kDevFwVersionMes"},
    {kDevFwVersionMesKiq, "kDevFwVersionMesKiq"},
    {kDevFwVersionPfp, "kDevFwVersionPfp"},
    {kDevFwVersionRlc, "kDevFwVersionRlc"},
    {kDevFwVersionRlcSrlc, "kDevFwVersionRlcSrlc"},
    {kDevFwVersionRlcSrlg, "kDevFwVersionRlcSrlg"},
    {kDevFwVersionRlcSrls, "kDevFwVersionRlcSrls"},
    {kDevFwVersionSdma, "kDevFwVersionSdma"},
    {kDevFwVersionSdma2, "kDevFwVersionSdma2"},
    {kDevFwVersionSmc, "kDevFwVersionSmc"},
    {kDevFwVersionSos, "kDevFwVersionSos"},
    {kDevFwVersionTaRas, "kDevFwVersionTaRas"},
    {kDevFwVersionTaXgmi, "kDevFwVersionTaXgmi"},
    {kDevFwVersionUvd, "kDevFwVersionUvd"},
    {kDevFwVersionVce, "kDevFwVersionVce"},
    {kDevFwVersionVcn, "kDevFwVersionVcn"},
    {kDevFwVersionPldmBundle, "kDevFwVersionPldmBundle"},
    {kDevSerialNumber, "kDevSerialNumber"},
    {kDevMemPageBad, "kDevMemPageBad"},
    {kDevNumaNode, "kDevNumaNode"},
    {kDevGpuMetrics, "kDevGpuMetrics"},
    {kDevPmMetrics, "kDevPmMetrics"},
    {kDevRegMetrics, "kDevRegMetrics"},
    {kDevBaseBoardTempMetrics, "kDevBaseBoardTempMetrics"},
    {kDevGpuBoardTempMetrics, "kDevGpuBoardTempMetrics"},
    {kDevGpuReset, "kDevGpuReset"},
    {kDevAvailableComputePartition, "kDevAvailableComputePartition"},
    {kDevComputePartition, "kDevComputePartition"},
    {kDevMemoryPartition, "kDevMemoryPartition"},
    {kDevAvailableMemoryPartition, "kDevAvailableMemoryPartition"},
    {kDevPCieVendorID, "kDevPCieVendorID"},
    {kDevSocPstate, "kDevSocPstate"},
    {kDevXgmiPlpd, "kDevXgmiPlpd"},
    {kDevProcessIsolation, "kDevProcessIsolation"},
    {kDevShaderClean, "kDevShaderClean"},
    {kDevSupportedXcpConfigs, "kDevSupportedXcpConfigs"},
    {kDevSupportedNpsConfigs, "kDevSupportedNpsConfigs"},
    {kDevXcpConfig, "kDevXcpConfig"},
    {kDevDecoderInst, "kDevDecoderInst"},
    {kDevDecoderShared, "kDevDecoderShared"},
    {kDevEncoderInst, "kDevEncoderInst"},
    {kDevEncoderShared, "kDevEncoderShared"},
    {kDevDmaInst, "kDevDmaInst"},
    {kDevDmaShared, "kDevDmaShared"},
    {kDevJpegInst, "kDevJpegInst"},
    {kDevJpegShared, "kDevJpegShared"},
    {kDevXccInst, "kDevXccInst"},
    {kDevXccShared, "kDevXccShared"},
};
static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
// Functions with only mandatory dependencies
{"rsmi_dev_vram_vendor_get", {{kDevVramVendorFName}, {}}},
{"rsmi_dev_id_get", {{kDevDevIDFName}, {}}},
{"rsmi_dev_xgmi_physical_id_get", {{kDevXGMIPhysicalIDFName}, {}}},
{"rsmi_dev_xgmi_port_num_get", {{kDevXGMIPortNumFName}, {}}},
{"rsmi_dev_revision_get", {{kDevDevRevIDFName}, {}}},
{"rsmi_dev_vendor_id_get", {{kDevVendorIDFName}, {}}},
{"rsmi_dev_name_get", {{kDevVendorIDFName,
kDevDevIDFName}, {}}},
{"rsmi_dev_sku_get", {{kDevDevProdNumFName}, {}}},
{"rsmi_dev_pcie_slot_type_get", {{kDevBoardInfoFName}, {}}},
{"rsmi_dev_brand_get", {{kDevVendorIDFName,
kDevVBiosVerFName}, {}}},
{"rsmi_dev_vendor_name_get", {{kDevVendorIDFName}, {}}},
{"rsmi_dev_serial_number_get", {{kDevSerialNumberFName}, {}}},
{"rsmi_dev_subsystem_id_get", {{kDevSubSysDevIDFName}, {}}},
{"rsmi_dev_subsystem_name_get", {{kDevSubSysVendorIDFName,
kDevVendorIDFName,
kDevDevIDFName}, {}}},
{"rsmi_dev_drm_render_minor_get", {{}, {}}},
{"rsmi_dev_subsystem_vendor_id_get", {{kDevSubSysVendorIDFName}, {}}},
{"rsmi_dev_unique_id_get", {{kDevUniqueIdFName}, {}}},
{"rsmi_dev_pci_bandwidth_get", {{kDevPCIEClkFName}, {}}},
{"rsmi_dev_pci_id_get", {{}, {}}},
{"rsmi_dev_pci_throughput_get", {{kDevPCIEThruPutFName}, {}}},
{"rsmi_dev_pci_replay_counter_get", {{kDevPCIEReplayCountFName}, {}}},
{"rsmi_dev_pci_bandwidth_set", {{kDevPerfLevelFName,
kDevPCIEClkFName}, {}}},
{"rsmi_dev_power_profile_set", {{kDevPerfLevelFName,
kDevPowerProfileModeFName}, {}}},
{"rsmi_dev_memory_busy_percent_get", {{kDevMemBusyPercentFName}, {}}},
{"rsmi_dev_busy_percent_get", {{kDevUsageFName}, {}}},
{"rsmi_dev_memory_reserved_pages_get", {{kDevMemPageBadFName}, {}}},
{"rsmi_dev_overdrive_level_get", {{kDevOverDriveLevelFName}, {}}},
{"rsmi_dev_mem_overdrive_level_get", {{kDevMemOverDriveLevelFName}, {}}},
{"rsmi_dev_power_profile_presets_get", {{kDevPowerProfileModeFName}, {}}},
{"rsmi_dev_perf_level_set", {{kDevPerfLevelFName}, {}}},
{"rsmi_dev_perf_level_set_v1", {{kDevPerfLevelFName}, {}}},
{"rsmi_dev_perf_level_get", {{kDevPerfLevelFName}, {}}},
{"rsmi_dev_soc_pstate_set", {{kDevSocPstateFName}, {}}},
{"rsmi_dev_soc_pstate_get", {{kDevSocPstateFName}, {}}},
{"rsmi_dev_xgmi_plpd_set", {{kDevXgmiPlpdFName}, {}}},
{"rsmi_dev_xgmi_plpd_get", {{kDevXgmiPlpdFName}, {}}},
{"rsmi_dev_process_isolation_set", {{kDevProcessIsolationFName}, {}}},
{"rsmi_dev_process_isolation_get", {{kDevProcessIsolationFName}, {}}},
{"rsmi_dev_gpu_shader_clean", {{kDevShaderCleanFName}, {}}},
{"rsmi_perf_determinism_mode_set", {{kDevPerfLevelFName,
kDevPowerODVoltageFName}, {}}},
{"rsmi_dev_overdrive_level_set", {{kDevOverDriveLevelFName}, {}}},
{"rsmi_dev_vbios_version_get", {{kDevVBiosVerFName}, {}}},
{"rsmi_dev_od_volt_info_get", {{kDevPowerODVoltageFName}, {}}},
{"rsmi_dev_od_volt_info_set", {{kDevPowerODVoltageFName,
kDevPerfLevelFName}, {}}},
{"rsmi_dev_od_volt_curve_regions_get", {{kDevPowerODVoltageFName}, {}}},
{"rsmi_dev_ecc_enabled_get", {{kDevErrCntFeaturesFName}, {}}},
{"rsmi_dev_ecc_status_get", {{kDevErrCntFeaturesFName}, {}}},
{"rsmi_ras_feature_info_get", {{kDevErrRASSchemaFName,
kDevErrTableVersionFName}, {}}},
{"rsmi_dev_counter_group_supported", {{}, {}}},
{"rsmi_dev_counter_create", {{}, {}}},
{"rsmi_dev_xgmi_error_status", {{kDevXGMIErrorFName}, {}}},
{"rsmi_dev_xgmi_error_reset", {{kDevXGMIErrorFName}, {}}},
{"rsmi_dev_memory_reserved_pages_get", {{kDevMemPageBadFName}, {}}},
{"rsmi_topo_numa_affinity_get", {{kDevNumaNodeFName}, {}}},
{"rsmi_dev_gpu_metrics_info_get", {{kDevGpuMetricsFName}, {}}},
{"rsmi_dev_pm_metrics_info_get", {{kDevPmMetricsFName}, {}}},
{"rsmi_dev_reg_table_info_get", {{kDevRegMetricsFName}, {}}},
{"rsmi_dev_gpu_reset", {{kDevGpuResetFName}, {}}},
{"rsmi_dev_compute_partition_get", {{kDevComputePartitionFName}, {}}},
{"rsmi_dev_compute_partition_set", {{kDevComputePartitionFName}, {}}},
{"rsmi_dev_memory_partition_get", {{kDevMemoryPartitionFName}, {}}},
{"rsmi_dev_memory_partition_set", {{kDevMemoryPartitionFName}, {}}},
// These functions with variants, but no sensors/units. (May or may not
// have mandatory dependencies.)
{"rsmi_dev_memory_total_get", { .mandatory_depends = {},
.variants = {
kDevMemTotGTT, kDevMemTotVisVRAM,
kDevMemTotVRAM,
}
}
},
{"rsmi_dev_memory_usage_get", { .mandatory_depends = {},
.variants = {
kDevMemUsedGTT,
kDevMemUsedVisVRAM,
kDevMemUsedVRAM,
}
}
},
{"rsmi_dev_gpu_clk_freq_get", { .mandatory_depends = {},
.variants = {
kDevGPUSClk,
kDevGPUMClk,
kDevFClk,
kDevDCEFClk,
kDevSOCClk,
}
}
},
{"rsmi_dev_gpu_clk_freq_set", { .mandatory_depends =
{kDevPerfLevelFName},
.variants = {
kDevGPUSClk,
kDevGPUMClk,
kDevFClk,
kDevDCEFClk,
kDevSOCClk,
}
}
},
{"rsmi_dev_firmware_version_get", { .mandatory_depends = {},
.variants = {
kDevFwVersionAsd,
kDevFwVersionCe,
kDevFwVersionDmcu,
kDevFwVersionMc,
kDevFwVersionMe,
kDevFwVersionMec,
kDevFwVersionMec2,
kDevFwVersionMes,
kDevFwVersionMesKiq,
kDevFwVersionPfp,
kDevFwVersionRlc,
kDevFwVersionRlcSrlc,
kDevFwVersionRlcSrlg,
kDevFwVersionRlcSrls,
kDevFwVersionSdma,
kDevFwVersionSdma2,
kDevFwVersionSmc,
kDevFwVersionSos,
kDevFwVersionTaRas,
kDevFwVersionTaXgmi,
kDevFwVersionUvd,
kDevFwVersionVce,
kDevFwVersionVcn,
kDevFwVersionPldmBundle,
}
}
},
{"rsmi_dev_ecc_count_get", { .mandatory_depends = {},
.variants = {
kDevErrCntUMC,
kDevErrCntSDMA,
kDevErrCntGFX,
kDevErrCntMMHUB,
kDevErrCntPCIEBIF,
kDevErrCntHDP,
kDevErrCntXGMIWAFL,
}
}
},
{"rsmi_counter_available_counters_get", { .mandatory_depends = {},
.variants = {
kDevDFCountersAvailable,
}
}
},
};
// Returns the value of X from the enclosing function when that value is
// non-zero; otherwise falls through.
//
// X is evaluated exactly once (so it may safely be a function call or any
// other expression with side effects) and is parenthesised so that
// lower-precedence operators in the argument cannot change the expansion.
#define RET_IF_NONZERO(X) { \
  auto rsmi_ret_if_nonzero_val_ = (X); \
  if (rsmi_ret_if_nonzero_val_) return rsmi_ret_if_nonzero_val_; \
}
// Constructs a Device for the path `p`.
//
// The final component of the path (everything after the last '/') is used to
// derive the name "/rocm_smi_<component>", which is handed to
// shared_mutex_init() together with mode 0777.
//
// In builds without DEBUG defined the environment pointer `e` is discarded
// and env_ is left null.
//
// Throws amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR) when
// shared_mutex_init() returns a handle whose ptr is null.
Device::Device(std::string p, RocmSMI_env_vars const *e) :
    monitor_(nullptr), path_(p), env_(e), evt_notif_anon_fd_(-1),
    m_gpu_metrics_header{0, 0, 0} {
#ifndef DEBUG
  env_ = nullptr;
#endif

  // Get the device name: the text following the last '/'. When there is no
  // '/', rfind() yields npos and npos + 1 wraps to 0, i.e. the whole path.
  const std::string dev_name = path_.substr(path_.rfind('/') + 1);
  const std::string mutex_name = "/rocm_smi_" + dev_name;

  mutex_ = shared_mutex_init(mutex_name.c_str(), 0777);
  if (mutex_.ptr != nullptr) {
    return;
  }

  throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR,
                                 "Failed to create shared mem. mutex.");
}
// Hands the mutex obtained in the constructor to shared_mutex_close().
Device::~Device() {
  shared_mutex_close(mutex_);
}
// Opens the debugfs file that backs `type` on the stream `*fs`.
//
// The path is kPathDebugRootFName + index() + "/" + the file name registered
// for `type` in kDevAttribNameMap.
//
// Parameters:
//   type - attribute to open; kDevAttribNameMap.at() throws std::out_of_range
//          if it has no entry.
//   fs   - stream object to open (must expose open() and is_open()).
//   str  - forwarded to DBG_FILE_ERROR together with the path.
//
// Returns 0 on success, otherwise a non-zero errno-style code:
//   * whatever isRegularFile() reports when it fails,
//   * ENOENT when the path is not a regular file,
//   * errno when open() fails, or EIO if errno was left at 0 (the stream
//     library is not required to set errno, and returning 0 here would make a
//     failed open look like success).
template <typename T>
int Device::openDebugFileStream(DevInfoTypes type, T *fs, const char *str) {
  std::string debugfs_path = kPathDebugRootFName;
  debugfs_path += std::to_string(index());
  debugfs_path += "/";
  debugfs_path += kDevAttribNameMap.at(type);

  DBG_FILE_ERROR(debugfs_path, str);

  // Initialised so the value is defined even if isRegularFile() does not
  // write to it on every path.
  bool reg_file = false;
  const int ret = isRegularFile(debugfs_path, &reg_file);
  if (ret != 0) {
    return ret;
  }
  if (!reg_file) {
    return ENOENT;
  }

  fs->open(debugfs_path);
  if (!fs->is_open()) {
    const int open_err = errno;
    return open_err != 0 ? open_err : EIO;
  }
  return 0;
}
// Builds "<path_>/device/<file name for type>" and returns it if access()
// with F_OK succeeds on that path; otherwise returns an empty string.
// kDevAttribNameMap.at() throws std::out_of_range when `type` has no entry.
std::string Device::get_sys_file_path_by_type(DevInfoTypes type) const {
  std::string candidate = path_;
  candidate += "/device/";
  candidate += kDevAttribNameMap.at(type);

  if (access(candidate.c_str(), F_OK) == 0) {
    return candidate;
  }
  return std::string();
}
// The fallback sysfs to handle backward compatibilities.
// For the types listed here, openSysfsFileStream retries with
// "<path_>/device/<fallback>" when the primary file is missing or is not a
// regular file.
static const std::map<DevInfoTypes, std::string> kDevFallbackFile = {
    {kDevErrCntGFX, "ras/aca_gfx"},
    {kDevErrCntSDMA, "ras/aca_sdma"},
    {kDevErrCntUMC, "ras/aca_umc"},
    {kDevErrCntMMHUB, "ras/aca_mmhub"},
    {kDevErrCntPCIEBIF, "ras/aca_pcie_bif"},
    {kDevErrCntHDP, "ras/aca_hdp"},
    {kDevErrCntXGMIWAFL, "ras/aca_xgmi_wafl"},
};
// Opens the sysfs file backing |type| into the stream |fs|.
//
// Path selection:
//   * default: "<path_>/device/<attribute name>" (in DEBUG builds the root
//     may be replaced by env_->path_DRM_root_override for overridden types);
//   * types in [kDevPCieTypeStart, kDevPCieTypeEND]:
//     "/sys/bus/pci/devices/<BDF>/<attribute name>";
//   * if the chosen path is missing or not a regular file and the type has
//     an entry in kDevFallbackFile, "<path_>/device/<fallback name>".
//
// |str| is only forwarded to DBG_FILE_ERROR.
//
// Returns 0 with |fs| open on success. On failure returns 1 if the BDF
// string cannot be built, the isRegularFile() error if the path cannot be
// examined, ENOENT if the path is not a regular file, or the errno observed
// right after the failed open.
template <typename T>
int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {
  auto sysfs_path = path_;
  std::ostringstream ss;

#ifdef DEBUG
  if (env_->path_DRM_root_override
      && (env_->enum_overrides.find(type) != env_->enum_overrides.end())) {
    sysfs_path = env_->path_DRM_root_override;
  }
#endif

  sysfs_path += "/device/";
  sysfs_path += kDevAttribNameMap.at(type);

  // For the file under PCI sysfs
  if (type >= kDevPCieTypeStart && type <= kDevPCieTypeEND) {
    sysfs_path = "/sys/bus/pci/devices/";
    std::string bdf_str;
    if (getBDFWithDomain(bdfid_, bdf_str) != RSMI_STATUS_SUCCESS) {
      ss << "Fail to craft the bdf string";
      LOG_ERROR(ss);
      return 1;
    }
    sysfs_path += bdf_str;
    sysfs_path += "/";
    sysfs_path += kDevAttribNameMap.at(type);
  }

  DBG_FILE_ERROR(sysfs_path, str);

  bool reg_file = false;
  int ret = isRegularFile(sysfs_path, &reg_file);

  if (ret != 0 || !reg_file) {
    // Handle specific types if the file does not exist
    const auto fallback = kDevFallbackFile.find(type);
    if (fallback != kDevFallbackFile.end()) {
      sysfs_path = path_ + "/device/" + fallback->second;
      DBG_FILE_ERROR(sysfs_path, str);

      // Recheck the adjusted path
      reg_file = false;
      ret = isRegularFile(sysfs_path, &reg_file);
      if (ret != 0 || !reg_file) {
        // A path that exists but is not a regular file leaves ret == 0;
        // report ENOENT so the caller never sees success with an unopened
        // stream.
        const int fallback_err = (ret != 0) ? ret : ENOENT;
        ss << __PRETTY_FUNCTION__
           << " | Adjusted file path also does not exist - SYSFS file ("
           << sysfs_path
           << ") for DevInfoInfoType (" << get_type_string(type)
           << "), returning " << std::to_string(fallback_err);
        LOG_ERROR(ss);
        return fallback_err;
      }
    }
  }

  if (ret != 0) {
    ss << __PRETTY_FUNCTION__ << " | Issue: File did not exist - SYSFS file ("
       << sysfs_path
       << ") for DevInfoInfoType (" << get_type_string(type)
       << "), returning " << std::to_string(ret);
    LOG_ERROR(ss);
    return ret;
  }
  if (!reg_file) {
    ss << __PRETTY_FUNCTION__
       << " | Issue: File is not a regular file - SYSFS file ("
       << sysfs_path << ") for "
       << "DevInfoInfoType (" << get_type_string(type) << "),"
       << " returning ENOENT (" << std::strerror(ENOENT) << ")";
    LOG_ERROR(ss);
    return ENOENT;
  }

  fs->open(sysfs_path);
  if (!fs->is_open()) {
    // Snapshot errno before formatting/logging, which may overwrite it.
    const int open_err = errno;
    ss << __PRETTY_FUNCTION__
       << " | Issue: Could not open - SYSFS file (" << sysfs_path << ") for "
       << "DevInfoTypes (" << get_type_string(type) << "), "
       << ", returning " << std::to_string(open_err) << " ("
       << std::strerror(open_err) << ")";
    LOG_ERROR(ss);
    return open_err;
  }

  ss << __PRETTY_FUNCTION__ << " | Successfully opened SYSFS file ("
     << sysfs_path
     << ") for DevInfoTypes (" << get_type_string(type)
     << ")";
  LOG_INFO(ss);
  return 0;
}
// Reads the first line of the debugfs file for |type| into |*retStr|.
//
// |*retStr| is left untouched when the file is empty (the first peek hits
// EOF). Returns 0 on success or the error from openDebugFileStream().
int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) {
  assert(retStr != nullptr);

  std::ostringstream ss;
  std::ifstream fs;
  const int open_status = openDebugFileStream(type, &fs);
  if (open_status != 0) {
    ss << "Could not read debugInfoStr for DevInfoType ("
       << get_type_string(type) << "), returning "
       << std::to_string(open_status);
    LOG_ERROR(ss);
    return open_status;
  }

  if (fs.peek() != std::ifstream::traits_type::eof()) {
    std::string first_line;
    std::getline(fs, first_line);
    *retStr = first_line;
  }
  fs.close();

  ss << "Successfully read debugInfoStr for DevInfoType ("
     << get_type_string(type) << "), retString= " << *retStr;
  LOG_INFO(ss);
  return 0;
}
// Reads the first whitespace-delimited token of the sysfs file for |type|
// into |*retStr| (operator>> extraction).
//
// Returns 0 once the file has been opened, regardless of whether the
// extraction itself succeeded; the stream state flags are only logged.
// Returns the error from openSysfsFileStream() when the open fails.
int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) {
  assert(retStr != nullptr);

  std::ostringstream ss;
  std::ifstream fs;
  const int open_status = openSysfsFileStream(type, &fs);
  if (open_status != 0) {
    ss << "Could not read device info string for DevInfoType ("
       << get_type_string(type) << "), returning "
       << std::to_string(open_status);
    LOG_ERROR(ss);
    return open_status;
  }

  fs >> *retStr;
  fs.close();

  // Stream state is sampled after close(), so it reflects both the
  // extraction and the close.
  const bool still_open = fs.is_open();
  const bool bad_bit = fs.bad();
  const bool fail_bit = fs.fail();
  const bool eof_bit = fs.eof();
  const bool all_good = fs.good();

  ss << __PRETTY_FUNCTION__
     << "Successfully read device info string for DevInfoType ("
     << get_type_string(type) << "): " << *retStr
     << " | "
     << (still_open ? " File stream is opened" : " File stream is closed")
     << " | " << (bad_bit ? "[ERROR] Bad read operation" :
                  "[GOOD] No bad bit read, successful read operation")
     << " | " << (fail_bit ? "[ERROR] Failed read - format error" :
                  "[GOOD] No fail - Successful read operation")
     << " | " << (eof_bit ? "[ERROR] Failed read - EOF error" :
                  "[GOOD] No eof - Successful read operation")
     << " | " << (all_good ? "[GOOD] read good - Successful read operation" :
                  "[ERROR] Failed read - good error");
  LOG_INFO(ss);
  return 0;
}
// Writes |valStr| to the sysfs file backing |type| through an unbuffered
// stream, so a rejected write is reported by the insertion itself.
//
// returnWriteErr = false: backwards compatible behaviour (old callers); any
//                         write failure is reported as
//                         RSMI_STATUS_NOT_SUPPORTED.
// returnWriteErr = true:  the errno observed right after the failed write is
//                         returned, allowing callers to detect errors such as
//                         EBUSY.
//
// Returns RSMI_STATUS_SUCCESS on success, or the openSysfsFileStream() error
// when the file cannot be opened. Exceptions are not handled here; they are
// caught in rocm_smi.cc.
int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr,
                            bool returnWriteErr) {
  std::ofstream fs;
  int ret;
  std::ostringstream ss;

  // Disable buffering before the file is opened.
  fs.flush();
  fs.rdbuf()->pubsetbuf(0, 0);

  ret = openSysfsFileStream(type, &fs, valStr.c_str());
  if (ret != 0) {
    fs.close();
    ss << __PRETTY_FUNCTION__ << " | Issue: Could not open fileStream; "
       << "Could not write device info string (" << valStr
       << ") for DevInfoType (" << get_type_string(type)
       << "), returning " << std::to_string(ret);
    LOG_ERROR(ss);
    return ret;
  }

  // We'll catch any exceptions in rocm_smi.cc code.
  if (fs << valStr) {
    fs.flush();
    fs.close();
    ss << "Successfully wrote device info string (" << valStr
       << ") for DevInfoType (" << get_type_string(type)
       << "), returning RSMI_STATUS_SUCCESS";
    LOG_INFO(ss);
    return RSMI_STATUS_SUCCESS;
  }

  // Capture errno first, before flush/close/logging can overwrite it.
  if (returnWriteErr) {
    ret = errno;
  } else {
    ret = RSMI_STATUS_NOT_SUPPORTED;
  }
  fs.flush();
  fs.close();

  ss << __PRETTY_FUNCTION__ << " | Issue: Could not write to file; "
     << "Could not write device info string (" << valStr
     << ") for DevInfoType (" << get_type_string(type)
     << "), returning " << getRSMIStatusString(ErrnoToRsmiStatus(ret));
  ss << " | "
     << (fs.is_open() ? "[ERROR] File stream open" :
                        "[GOOD] File stream closed")
     << " | " << (fs.bad() ? "[ERROR] Bad write operation" :
                  "[GOOD] No bad bit write, successful write operation")
     << " | " << (fs.fail() ? "[ERROR] Failed write - format error" :
                  "[GOOD] No fail - Successful write operation")
     << " | " << (fs.eof() ? "[ERROR] Failed write - EOF error" :
                  "[GOOD] No eof - Successful write operation")
     << " | " << (fs.good() ?
                  "[GOOD] Write good - Successful write operation" :
                  "[ERROR] Failed write - good error");
  LOG_ERROR(ss);
  return ret;
}
// Maps a performance-level string to its rsmi_dev_perf_level value by
// scanning kDevPerfLvlMap from RSMI_DEV_PERF_LEVEL_FIRST through
// RSMI_DEV_PERF_LEVEL_LAST. Returns RSMI_DEV_PERF_LEVEL_UNKNOWN when no
// entry matches. kDevPerfLvlMap.at() throws if a level in that range has no
// map entry.
rsmi_dev_perf_level Device::perfLvlStrToEnum(std::string s) {
  const uint32_t last_level = static_cast<uint32_t>(RSMI_DEV_PERF_LEVEL_LAST);
  for (uint32_t raw = static_cast<uint32_t>(RSMI_DEV_PERF_LEVEL_FIRST);
       raw <= last_level; ++raw) {
    const rsmi_dev_perf_level candidate =
        static_cast<rsmi_dev_perf_level>(raw);
    if (s == kDevPerfLvlMap.at(candidate)) {
      return candidate;
    }
  }
  return RSMI_DEV_PERF_LEVEL_UNKNOWN;
}
// Writes an integer-valued setting for |type|.
//
// kDevOverDriveLevel / kDevPowerODVoltage / kDevPowerProfileMode are written
// as the decimal text of |val|; kDevPerfLevel is written as the string
// kDevPerfLvlMap holds for that level ("auto", "low", "high", "manual", ...).
// Any other type yields EINVAL.
//
// The caller is responsible for making sure "val" is within a valid range
// (e.g. an overdrive level is an integer between 0 and 20).
int Device::writeDevInfo(DevInfoTypes type, uint64_t val) {
  const bool numeric_setting = (type == kDevOverDriveLevel) ||
                               (type == kDevPowerODVoltage) ||
                               (type == kDevPowerProfileMode);
  if (numeric_setting) {
    return writeDevInfoStr(type, std::to_string(val));
  }

  if (type == kDevPerfLevel) {
    const rsmi_dev_perf_level level = static_cast<rsmi_dev_perf_level>(val);
    return writeDevInfoStr(type, kDevPerfLvlMap.at(level));
  }

  return EINVAL;
}
// Writes the string |val| to the sysfs file for |type|.
//
// Clock, p-state, PLPD, process-isolation, shader-clean and OD-voltage types
// use the legacy error reporting of writeDevInfoStr(); the partition and XCP
// configuration types request the real write errno (returnWriteErr = true).
// Any other type yields EINVAL.
//
// No attribute-name lookup is done before the switch, so an unsupported type
// always reaches the EINVAL path rather than throwing from a map lookup.
int Device::writeDevInfo(DevInfoTypes type, std::string val) {
  switch (type) {
    case kDevGPUMClk:
    case kDevSocPstate:
    case kDevXgmiPlpd:
    case kDevProcessIsolation:
    case kDevShaderClean:
    case kDevDCEFClk:
    case kDevFClk:
    case kDevGPUSClk:
    case kDevPCIEClk:
    case kDevPowerODVoltage:
    case kDevSOCClk:
      return writeDevInfoStr(type, val);

    case kDevComputePartition:
    case kDevMemoryPartition:
    case kDevXcpConfig:
      return writeDevInfoStr(type, val, true);

    default:
      break;
  }
  return EINVAL;
}
// Reads the first line of the sysfs file for |type| into |*line|.
// Returns 0 on success or the error from openSysfsFileStream().
int Device::readDevInfoLine(DevInfoTypes type, std::string *line) {
  assert(line != nullptr);

  std::ostringstream ss;
  std::ifstream fs;
  const int open_status = openSysfsFileStream(type, &fs);
  if (open_status != 0) {
    ss << "Could not read DevInfoLine for DevInfoType ("
       << get_type_string(type) << ")";
    LOG_ERROR(ss);
    return open_status;
  }

  std::getline(fs, *line);

  ss << "Successfully read DevInfoLine for DevInfoType ("
     << get_type_string(type) << "), returning *line = "
     << *line;
  LOG_INFO(ss);

  fs.close();
  return 0;
}
// Returns the printable name registered for |type| in devInfoTypesStrings,
// or "Unknown" when the type has no entry.
const char* Device::get_type_string(DevInfoTypes type) {
  const auto found = devInfoTypesStrings.find(type);
  return (found == devInfoTypesStrings.end()) ? "Unknown" : found->second;
}
namespace {

// Reads a millisecond count from the environment variable |name|.
//
// Returns |def| when the variable is unset or std::stoi cannot parse it
// (including out-of-range values). Negative values are clamped to 0.
static int read_env_ms(const char* name, int def) {
  if (const char* s = std::getenv(name)) {
    try {
      return std::max(0, std::stoi(s));
    } catch (...) {
      // Unparsable or out-of-range value: fall back to the caller's default.
    }
  }
  return def;
}

// One cached copy of a binary sysfs read plus the time it was taken.
// |mtx| guards |data| and |last_read|.
struct GpuMetricsCache {
  std::vector<uint8_t> data;
  std::chrono::steady_clock::time_point last_read;
  std::mutex mtx;
};

// Keep 1 cache map, with an entry for each gpu.
// Guarded by g_gpu_metrics_cache_map_mu.
std::unordered_map<std::string, GpuMetricsCache> g_gpu_metrics_cache_map;
std::mutex g_gpu_metrics_cache_map_mu;

// How long a cached read stays valid. Taken from AMDSMI_GPU_METRICS_CACHE_MS
// during static initialization; defaults to 1 ms. readDevInfoBinary() treats
// a value of 0 as "caching disabled".
static const std::chrono::milliseconds kGpuMetricsCacheDuration(
    read_env_ms("AMDSMI_GPU_METRICS_CACHE_MS", 1)
);

}  // namespace
// Reads exactly |b_size| bytes from "<path_>/device/<attribute name>" into
// |p_binary_data|.
//
// For kDevGpuMetrics the result is cached per (file path, b_size) and reused
// while the cached copy is younger than kGpuMetricsCacheDuration; a duration
// of 0 disables the cache. Other types are always read from sysfs and never
// touch the cache map.
//
// Returns 0 on success, the errno captured right after a failed fopen(), or
// ENOENT when fewer than |b_size| bytes could be read.
int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
                              void *p_binary_data) {
  auto sysfs_path = path_;
  std::ostringstream ss;

  ss << __PRETTY_FUNCTION__
     << " | AMDSMI_GPU_METRICS_CACHE_MS = "
     << kGpuMetricsCacheDuration.count()
     << " ms";
  LOG_DEBUG(ss);

  const bool is_gpu_metrics = (type == DevInfoTypes::kDevGpuMetrics);

  // Size will either be 4, or 3872+. When 4, it's only reading from the
  // header. If this header read is inconsequential, we could only cache full
  // read. However, it seems reading the gpu_metrics sysfs in any capacity
  // is the issue, so should remain.
  GpuMetricsCache* cache_ptr = nullptr;
  if (is_gpu_metrics) {
    const std::string key = path_ + "/device/" + kDevAttribNameMap.at(type)
                            + "#" + std::to_string(b_size);
    // The pointer is used after the map lock is released. unordered_map
    // keeps element addresses stable across rehashing and nothing in this
    // file erases entries.
    std::lock_guard<std::mutex> map_lk(g_gpu_metrics_cache_map_mu);
    cache_ptr = &g_gpu_metrics_cache_map[key];
  }

  // Only cache for kDevGpuMetrics
  if (is_gpu_metrics) {
    std::lock_guard<std::mutex> lock(cache_ptr->mtx);
    auto now = std::chrono::steady_clock::now();
    auto last_read_delta =
        std::chrono::duration_cast<std::chrono::milliseconds>(
            now - cache_ptr->last_read);
    if (!cache_ptr->data.empty() &&
        kGpuMetricsCacheDuration > std::chrono::milliseconds::zero() &&
        last_read_delta < kGpuMetricsCacheDuration &&
        cache_ptr->data.size() == b_size) {
      std::memcpy(p_binary_data, cache_ptr->data.data(), b_size);
      if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) {
        ss << "Returned cached DevInfoBinary for DevInfoType ("
           << get_type_string(type) << ")";
        LOG_INFO(ss);
      }
      return 0;
    }
  }

  sysfs_path += "/device/";
  sysfs_path += kDevAttribNameMap.at(type);

  FILE *ptr = fopen(sysfs_path.c_str(), "rb");
  if (!ptr) {
    // Snapshot errno before formatting/logging, which may overwrite it.
    const int open_err = errno;
    ss << "Could not read DevInfoBinary for DevInfoType ("
       << get_type_string(type) << ")"
       << " - SYSFS (" << sysfs_path << ")"
       << ", returning " << std::to_string(open_err) << " ("
       << std::strerror(open_err) << ")";
    LOG_ERROR(ss);
    return open_err;
  }

  // One item of b_size bytes: num is 1 only when the whole buffer was filled.
  size_t num = fread(p_binary_data, b_size, 1, ptr);
  fclose(ptr);

  if ((num*b_size) != b_size) {
    ss << "Could not read DevInfoBinary for DevInfoType ("
       << get_type_string(type) << ") - SYSFS ("
       << sysfs_path << "), binary size error; "
       << "[buff: "
       << p_binary_data
       << " size: "
       << b_size
       << " read: "
       << num
       << "]"
       << ", returning ENOENT (" << std::strerror(ENOENT) << ")";
    LOG_ERROR(ss);
    return ENOENT;
  }

  if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) {
    ss << "Successfully read DevInfoBinary for DevInfoType ("
       << get_type_string(type) << ") - SYSFS ("
       << sysfs_path << "), returning binaryData = " << p_binary_data
       << "; byte_size = " << std::dec << static_cast<int>(b_size);
    std::string metricDescription = "AMD SMI GPU METRICS (16-byte width), "
                                    + sysfs_path;
    logHexDump(metricDescription.c_str(), p_binary_data, b_size, 16);
    LOG_INFO(ss);
  }

  // Cache metric data
  if (is_gpu_metrics &&
      kGpuMetricsCacheDuration > std::chrono::milliseconds::zero()) {
    auto now = std::chrono::steady_clock::now();
    std::lock_guard<std::mutex> lock(cache_ptr->mtx);
    cache_ptr->data.assign(
        reinterpret_cast<uint8_t*>(p_binary_data),
        reinterpret_cast<uint8_t*>(p_binary_data) + b_size);
    cache_ptr->last_read = now;
    if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) {
      ss << "Successfully Cached GPU Metrics binaryData = " << p_binary_data
         << "; byte_size = " << std::dec << static_cast<int>(b_size);
      LOG_INFO(ss);
    }
  }
  return 0;
}
// Appends every line of the sysfs file for |type| to |*retVec|, then strips
// trailing lines that contain only whitespace.
//
// Lines are appended; |*retVec| is not cleared first.
// Returns 0 on success, the openSysfsFileStream() error when the file cannot
// be opened, or ENXIO when no lines (or only blank lines) are present.
int Device::readDevInfoMultiLineStr(DevInfoTypes type,
                                    std::vector<std::string> *retVec) {
  assert(retVec != nullptr);

  std::ifstream fs;
  const int open_status = openSysfsFileStream(type, &fs);
  if (open_status != 0) {
    return open_status;
  }

  for (std::string current; std::getline(fs, current); ) {
    retVec->push_back(current);
  }
  fs.close();

  std::ostringstream ss;
  if (retVec->empty()) {
    ss << "Read devInfoMultiLineStr for DevInfoType ("
       << get_type_string(type) << ")"
       << ", but contained no string lines";
    LOG_ERROR(ss);
    return ENXIO;
  }

  // Remove any *trailing* empty (whitespace) lines
  const auto is_blank = [](const std::string& text) {
    return text.find_first_not_of(" \t\n\v\f\r") == std::string::npos;
  };
  while (!retVec->empty() && is_blank(retVec->back())) {
    retVec->pop_back();
  }

  // Join the lines (each prefixed with a newline) for logging.
  std::string joined;
  for (const auto& entry : *retVec) {
    joined += "\n";
    joined += entry;
  }

  if (joined.empty()) {
    ss << "Read devInfoMultiLineStr for DevInfoType ("
       << get_type_string(type) << ")"
       << ", but lines were empty";
    LOG_INFO(ss);
    return ENXIO;
  }

  ss << "Successfully read devInfoMultiLineStr for DevInfoType ("
     << get_type_string(type) << ") "
     << ", returning lines read = " << joined;
  LOG_INFO(ss);
  return 0;
}
// Reads the sysfs value for |type| and converts it to an integer in |*val|.
//
// Three parsing groups exist:
//   * ID-like values: parsed as hex with std::stoi; negative results give
//     EINVAL;
//   * counters/sizes: parsed as decimal with std::stoul;
//   * unique id / firmware versions: parsed as hex with std::stoul.
// kDevGpuReset only reads the debugfs file; |*val| is not written for it.
//
// Returns 0 on success, the read error, or EINVAL for an empty value or an
// unsupported type. The std::sto* conversions may throw on malformed or
// out-of-range text; nothing here catches that.
int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
  assert(val != nullptr);

  std::string text;
  // Reads the attribute into |text|; an empty result is reported as EINVAL.
  const auto load_text = [&]() -> int {
    const int read_status = readDevInfoStr(type, &text);
    if (read_status != 0) {
      return read_status;
    }
    return text.empty() ? EINVAL : 0;
  };

  switch (type) {
    case kDevDevID:
    case kDevDevRevID:
    case kDevSubSysDevID:
    case kDevSubSysVendorID:
    case kDevVendorID:
    case kDevPCieVendorID:
    case kDevErrCntFeatures:
    case kDevXGMIPhysicalID:
    case kDevXGMIPortNum:
    case kDevErrRASSchema:
    case kDevErrTableVersion: {
      const int status = load_text();
      if (status != 0) {
        return status;
      }
      const int parsed = std::stoi(text, nullptr, 16);
      if (parsed < 0) {
        return EINVAL;
      }
      *val = static_cast<uint64_t>(parsed);
      return 0;
    }

    case kDevUsage:
    case kDevOverDriveLevel:
    case kDevMemOverDriveLevel:
    case kDevMemTotGTT:
    case kDevMemTotVisVRAM:
    case kDevMemTotVRAM:
    case kDevMemUsedGTT:
    case kDevMemUsedVisVRAM:
    case kDevMemUsedVRAM:
    case kDevPCIEReplayCount:
    case kDevDFCountersAvailable:
    case kDevMemBusyPercent:
    case kDevXGMIError: {
      const int status = load_text();
      if (status != 0) {
        return status;
      }
      *val = std::stoul(text, nullptr);
      return 0;
    }

    case kDevUniqueId:
    case kDevFwVersionAsd:
    case kDevFwVersionCe:
    case kDevFwVersionDmcu:
    case kDevFwVersionMc:
    case kDevFwVersionMe:
    case kDevFwVersionMec:
    case kDevFwVersionMec2:
    case kDevFwVersionMes:
    case kDevFwVersionMesKiq:
    case kDevFwVersionPfp:
    case kDevFwVersionRlc:
    case kDevFwVersionRlcSrlc:
    case kDevFwVersionRlcSrlg:
    case kDevFwVersionRlcSrls:
    case kDevFwVersionSdma:
    case kDevFwVersionSdma2:
    case kDevFwVersionSmc:
    case kDevFwVersionSos:
    case kDevFwVersionTaRas:
    case kDevFwVersionTaXgmi:
    case kDevFwVersionUvd:
    case kDevFwVersionVce:
    case kDevFwVersionVcn:
    case kDevFwVersionPldmBundle: {
      const int status = load_text();
      if (status != 0) {
        return status;
      }
      *val = std::stoul(text, nullptr, 16);
      return 0;
    }

    case kDevGpuReset:
      // NOTE(review): the text read here is discarded and *val is left
      // unmodified - confirm callers do not rely on *val for this type.
      return readDebugInfoStr(type, &text);

    default:
      break;
  }
  return EINVAL;
}
// Reads one "name: value" property from a multi-property sysfs file.
//
// Only kDevBoardInfo is supported. Each line is split at its first ':'; the
// first line whose trimmed name equals |property| supplies |value| (trimmed).
// Returns 0 when found, the read error if the file cannot be read, and
// EINVAL for an unsupported type or a missing property. |value| is untouched
// unless a match is found.
int Device::readDevInfo(DevInfoTypes type, const std::string& property,
                        std::string& value) {
  if (type != kDevBoardInfo) {
    return EINVAL;
  }

  std::vector<std::string> lines;
  const int read_status = readDevInfoMultiLineStr(type, &lines);
  if (read_status != 0) {
    return read_status;
  }

  for (const std::string& entry : lines) {
    const auto delimiter = entry.find(":");
    if (delimiter == std::string::npos) {
      continue;
    }
    if (trim(entry.substr(0, delimiter)) != property) {
      continue;
    }
    value = trim(entry.substr(delimiter + 1));
    return 0;
  }
  return EINVAL;
}
// Reads a multi-line sysfs file for |type| into |*val|, one element per
// line. Only the clock, power-profile, error-count, bad-page and related
// types listed below are accepted; anything else yields EINVAL. Otherwise
// the result of readDevInfoMultiLineStr() is returned.
int Device::readDevInfo(DevInfoTypes type, std::vector<std::string> *val) {
  assert(val != nullptr);

  bool multi_line_type = false;
  switch (type) {
    case kDevGPUMClk:
    case kDevSocPstate:
    case kDevXgmiPlpd:
    case kDevProcessIsolation:
    case kDevGPUSClk:
    case kDevDCEFClk:
    case kDevFClk:
    case kDevPCIEClk:
    case kDevSOCClk:
    case kDevPowerProfileMode:
    case kDevPowerODVoltage:
    case kDevErrCntSDMA:
    case kDevErrCntUMC:
    case kDevErrCntGFX:
    case kDevErrCntMMHUB:
    case kDevErrCntPCIEBIF:
    case kDevErrCntHDP:
    case kDevErrCntXGMIWAFL:
    case kDevMemPageBad:
      multi_line_type = true;
      break;
    default:
      break;
  }

  if (!multi_line_type) {
    return EINVAL;
  }
  return readDevInfoMultiLineStr(type, val);
}
// Reads |b_size| raw bytes for |type| into |p_binary_data|.
// Only kDevGpuMetrics is accepted; any other type yields EINVAL. Otherwise
// the result of readDevInfoBinary() is returned.
int Device::readDevInfo(DevInfoTypes type, std::size_t b_size,
                        void *p_binary_data) {
  assert(p_binary_data != nullptr);

  if (type != kDevGpuMetrics) {
    return EINVAL;
  }
  return readDevInfoBinary(type, b_size, p_binary_data);
}
// Reads the single-token string value for |type| into |*val|.
// Only the types listed below are accepted; anything else yields EINVAL.
// Otherwise the result of readDevInfoStr() is returned.
int Device::readDevInfo(DevInfoTypes type, std::string *val) {
  assert(val != nullptr);

  bool string_type = false;
  switch (type) {
    case kDevPerfLevel:
    case kDevUsage:
    case kDevOverDriveLevel:
    case kDevMemOverDriveLevel:
    case kDevDevProdName:
    case kDevDevProdNum:
    case kDevDevID:
    case kDevDevRevID:
    case kDevSubSysDevID:
    case kDevSubSysVendorID:
    case kDevVendorID:
    case kDevPCieVendorID:
    case kDevVramVendor:
    case kDevVBiosVer:
    case kDevPCIEThruPut:
    case kDevSerialNumber:
    case kDevAvailableComputePartition:
    case kDevComputePartition:
    case kDevMemoryPartition:
    case kDevNumaNode:
    case kDevXGMIPhysicalID:
    case kDevXGMIPortNum:
    case kDevAvailableMemoryPartition:
    case kDevProcessIsolation:
    case kDevSupportedXcpConfigs:
    case kDevSupportedNpsConfigs:
    case kDevXcpConfig:
    case kDevDecoderInst:
    case kDevDecoderShared:
    case kDevEncoderInst:
    case kDevEncoderShared:
    case kDevDmaInst:
    case kDevDmaShared:
    case kDevJpegInst:
    case kDevJpegShared:
    case kDevXccInst:
    case kDevXccShared:
      string_type = true;
      break;
    default:
      break;
  }

  if (!string_type) {
    return EINVAL;
  }
  return readDevInfoStr(type, val);
}
// Prints supported_funcs_ to stdout: one line per function name, followed by
// its supported variants and, in parentheses, each variant's monitors.
// Functions mapped to a null variant table print "Not Applicable".
void Device::DumpSupportedFunctions(void) {
  std::cout << "*** Supported Functions ***" << std::endl;

  for (const auto& func_entry : supported_funcs_) {
    std::cout << func_entry.first << std::endl;
    std::cout << "\tSupported Variants(Monitors): ";

    const auto& variants = func_entry.second;
    if (!variants) {
      std::cout << "Not Applicable" << std::endl;
      continue;
    }

    // We should have at least 1 supported variant or the function should
    // not be listed as supported.
    assert(variants->begin() != variants->end());

    for (const auto& variant_entry : *variants) {
      std::cout << static_cast<uint32_t>(variant_entry.first);

      const auto& monitors = variant_entry.second;
      if (monitors) {
        std::cout << "(";
        // We should have at least 1 supported monitor or the function should
        // not be listed as supported.
        assert(monitors->begin() != monitors->end());
        for (const auto& monitor_id : *monitors) {
          std::cout << static_cast<uint32_t>(monitor_id) << ", ";
        }
        std::cout << ")";
      }
      std::cout << ", ";
    }
    std::cout << std::endl;
  }
}
void Device::fillSupportedFuncs(void) {
if (!supported_funcs_.empty()) {
return;
}
std::map<const char *, dev_depends_t>::const_iterator it =
kDevFuncDependsMap.begin();
std::string dev_rt = path_ + "/device";
bool mand_depends_met;
std::shared_ptr<VariantMap> supported_variants;
while (it != kDevFuncDependsMap.end()) {
// First, see if all the mandatory dependencies are there
std::vector<const char *>::const_iterator dep =
it->second.mandatory_depends.begin();
mand_depends_met = true;
for (; dep != it->second.mandatory_depends.end(); dep++) {
std::string dep_path = dev_rt + "/" + *dep;
std::string debugfs_path;
debugfs_path = kPathDebugRootFName;
debugfs_path += std::to_string(index());
debugfs_path += "/";
debugfs_path += *dep;
if (!FileExists(dep_path.c_str()) && !FileExists(debugfs_path.c_str())) {
mand_depends_met = false;
break;
}
}
if (!mand_depends_met) {
it++;
continue;
}
// Then, see if the variants are supported.
std::vector<DevInfoTypes>::const_iterator var =
it->second.variants.begin();
if (it->second.variants.empty()) {
supported_funcs_[it->first] = nullptr;
it++;
continue;
}
supported_variants = std::make_shared<VariantMap>();
for (; var != it->second.variants.end(); var++) {
std::string variant_path = dev_rt + "/" + kDevAttribNameMap.at(*var);
if (!FileExists(variant_path.c_str())) {
continue;
}
// At this point we assume no monitors, so map to nullptr
(*supported_variants)[kDevInfoVarTypeToRSMIVariant.at(*var)] = nullptr;
}
if (!(*supported_variants).empty()) {
supported_funcs_[it->first] = supported_variants;
}
it++;
}
if (monitor() != nullptr) {
monitor()->fillSupportedFuncs(&supported_funcs_);
}
// DumpSupportedFunctions();
}
static bool subvariant_match(const std::shared_ptr<SubVariant> *sv,
uint64_t sub_v) {
assert(sv != nullptr);
SubVariantIt it = (*sv)->begin();
for (; it != (*sv)->end(); it++) {
if ((*it & MONITOR_IND_BIT_MASK) == sub_v) {
return true;
}
}
return false;
}
// Reports whether API function |name| is supported on this device,
// optionally narrowed to a |variant| and a |sub_variant|
// (RSMI_DEFAULT_VARIANT means "not specified").
//
//   * Unknown function name                       -> false.
//   * Neither variant nor sub-variant specified   -> true.
//   * Variant specified, sub-variant not          -> true iff the variant is
//                                                    listed for the function.
//   * Sub-variant specified                       -> true iff the variant is
//                                                    listed and one of its
//                                                    sub-variants matches.
//
// A missing variant table, variant entry or sub-variant list yields false
// rather than being dereferenced. For an explicitly requested variant such a
// gap indicates an inconsistent table and still asserts in debug builds.
bool Device::DeviceAPISupported(std::string name, uint64_t variant,
                                uint64_t sub_variant) {
  fillSupportedFuncs();

  SupportedFuncMapIt func_it = supported_funcs_.find(name);
  if (func_it == supported_funcs_.end()) {
    return false;
  }

  const bool default_variant = (variant == RSMI_DEFAULT_VARIANT);
  const bool default_sub_variant = (sub_variant == RSMI_DEFAULT_VARIANT);

  // The function itself is supported; nothing more was asked for.
  if (default_variant && default_sub_variant) {
    return true;
  }

  if (func_it->second == nullptr) {
    // if variant is != RSMI_DEFAULT_VARIANT, we should not have a nullptr
    assert(default_variant);
    return false;
  }

  VariantMapIt var_it = func_it->second->find(variant);
  if (var_it == func_it->second->end()) {
    return false;
  }

  if (default_sub_variant) {
    return true;
  }

  // sub_variant != RSMI_DEFAULT_VARIANT
  if (var_it->second == nullptr) {
    // if variant is != RSMI_DEFAULT_VARIANT, we should not have a nullptr
    assert(default_variant);
    return false;
  }
  return subvariant_match(&(var_it->second), sub_variant);
}
// Reloads the amdgpu kernel module (modprobe -r, then modprobe).
//
// Sequence:
//   1. If the gdm service reports "active", stop it first.
//   2. Unload and reload amdgpu.
//   3. If gdm was stopped in step 1, start it again (result ignored).
//   4. Poll isRestartInProgress() until it reports success or the retry
//      budget is exhausted.
//
// Requires root (REQUIRE_ROOT_ACCESS). Returns RSMI_STATUS_SUCCESS when the
// reload command succeeded and the module is reported live, otherwise
// RSMI_STATUS_AMDGPU_RESTART_ERR.
rsmi_status_t Device::restartAMDGpuDriver(void) {
REQUIRE_ROOT_ACCESS
std::ostringstream ss;
bool restartSuccessful = true;
bool success = false;
std::string out;
bool wasGdmServiceActive = false;
bool isRestartInProgress = true;
bool isAMDGPUModuleLive = false;
bool restartGDM = false;
std::string captureRestartErr;
// 1 sec = 1000 ms = 1000000 us
const int kTimeToWaitForDriverMSec = 1000;
// Attempting to speed up processing time
bool is_logger_enabled = ROCmLogging::Logger::getInstance()->isLoggerEnabled();
// sudo systemctl is-active gdm
// we do not care about the success of checking if gdm is active
std::tie(success, out) = executeCommand("systemctl is-active gdm 2>/dev/null", true);
(out == "active") ? (restartGDM = true) : (restartGDM = false);
if (is_logger_enabled) {
ss << __PRETTY_FUNCTION__ << " | systemctl is-active gdm: out = "
<< out << "; success = " << (success ? "True" : "False")
<< "; restartGDM = " << (restartGDM ? "True" : "False");
LOG_INFO(ss);
}
// if gdm is active -> sudo systemctl stop gdm
// TODO(AMD_SMI_team): are there other display managers we need to take into account?
// see https://help.gnome.org/admin/gdm/stable/overview.html.en_GB
// NOTE(review): in the command string the '&' precedes the redirection, so
// the redirect presumably does not apply to systemctl itself - confirm.
if (success && (out == "active") && (restartGDM)) {
wasGdmServiceActive = true;
std::tie(success, out) = executeCommand("systemctl stop gdm& 2>/dev/null", true);
if (is_logger_enabled) {
ss << __PRETTY_FUNCTION__ << " | systemctl stop gdm&: out = "
<< out << "; success = " << (success ? "True" : "False");
LOG_INFO(ss);
}
} else {
success = true; // ignore failures to restart gdm
}
if (is_logger_enabled) {
ss << __PRETTY_FUNCTION__ << " | B4 modprobing anything!!! out = "
<< out << "; success = " << (success ? "True" : "False")
<< "; restartSuccessful = " << (restartSuccessful ? "True" : "False")
<< "; captureRestartErr = " << captureRestartErr;
LOG_INFO(ss);
}
// sudo modprobe -r amdgpu
// sudo modprobe amdgpu
// Both commands' output is discarded; only the combined result is kept.
std::tie(success, out) = executeCommand(
"modprobe -r -v amdgpu >/dev/null 2>&1 && modprobe -v amdgpu >/dev/null 2>&1", true);
restartSuccessful &= success;
captureRestartErr = out;
if (is_logger_enabled) {
ss << __PRETTY_FUNCTION__ << " | modprobe -r -v amdgpu && modprobe -v amdgpu: out = "
<< out << "; success = " << (success ? "True" : "False")
<< "; restartSuccessful = " << (restartSuccessful ? "True" : "False")
<< "; captureRestartErr = " << captureRestartErr;
LOG_INFO(ss);
}
// if gdm was active -> sudo systemctl start gdm
// We don't care if successful or not, just try to restart as a courtesy
if (wasGdmServiceActive && restartGDM) {
std::tie(success, out) = executeCommand("systemctl start gdm& 2>/dev/null", true);
if (is_logger_enabled) {
ss << __PRETTY_FUNCTION__ << " | systemctl start gdm&: out = "
<< out << "; success = " << (success ? "True" : "False");
LOG_INFO(ss);
}
}
// Return early if there was an issue restarting amdgpu
if (!restartSuccessful) {
if (is_logger_enabled) {
ss << __PRETTY_FUNCTION__ << " | [ERROR] Issue found during amdgpu restart: "
<< captureRestartErr << "; retartSuccessful: " << (restartSuccessful ? "True" : "False");
LOG_ERROR(ss);
}
return RSMI_STATUS_AMDGPU_RESTART_ERR;
}
// wait for amdgpu module to come back up
rsmi_status_t status = Device::isRestartInProgress(&isRestartInProgress,
&isAMDGPUModuleLive);
// The counter is decremented before each wait and the loop exits when it
// reaches 0, so after the initial check there are at most 9 waits of
// kTimeToWaitForDriverMSec, each followed by a re-check.
int maxLoops = 10; // wait a max of 10 sec
while (status != RSMI_STATUS_SUCCESS) {
maxLoops -= 1;
if (maxLoops == 0) {
break;
}
amd::smi::system_wait(kTimeToWaitForDriverMSec);
status = Device::isRestartInProgress(&isRestartInProgress,
&isAMDGPUModuleLive);
}
return ((restartSuccessful && (!isRestartInProgress && isAMDGPUModuleLive)) ?
RSMI_STATUS_SUCCESS :
RSMI_STATUS_AMDGPU_RESTART_ERR);
}
// Checks whether the amdgpu kernel module has finished (re)loading by
// reading /sys/module/amdgpu/initstate.
//
// Outputs:
//   *isAMDGPUModuleLive  - true when initstate contains "live".
//   *isRestartInProgress - true unless the module was found live.
//
// Requires root (REQUIRE_ROOT_ACCESS). Returns RSMI_STATUS_INVALID_ARGS when
// either output pointer is null, RSMI_STATUS_SUCCESS when the module is live,
// and RSMI_STATUS_AMDGPU_RESTART_ERR otherwise.
rsmi_status_t Device::isRestartInProgress(bool *isRestartInProgress,
                                          bool *isAMDGPUModuleLive) {
  REQUIRE_ROOT_ACCESS
  if (isRestartInProgress == nullptr || isAMDGPUModuleLive == nullptr) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  std::ostringstream ss;
  bool success = false;
  std::string out;
  bool deviceRestartInProgress = true;    // Assume in progress, we intend to
                                          // disprove
  bool isSystemAMDGPUModuleLive = false;  // Assume AMD GPU module is not
                                          // live, we intend to disprove

  // Attempting to speed up processing time
  bool is_logger_enabled =
      ROCmLogging::Logger::getInstance()->isLoggerEnabled();

  // wait for amdgpu module to come back up
  std::tie(success, out) =
      executeCommand("cat /sys/module/amdgpu/initstate", true);
  if (is_logger_enabled) {
    ss << __PRETTY_FUNCTION__
       << " | success = " << (success ? "True" : "False")
       << " | out = " << out;
    LOG_DEBUG(ss);
  }

  if ((success == true) && (!out.empty())) {
    isSystemAMDGPUModuleLive = containsString(out, "live");
  }

  // Decide from the freshly read module state, not from the (always
  // non-null) output pointer.
  if (isSystemAMDGPUModuleLive) {
    deviceRestartInProgress = false;
  }

  *isRestartInProgress = deviceRestartInProgress;
  *isAMDGPUModuleLive = isSystemAMDGPUModuleLive;

  if (is_logger_enabled) {
    ss << __PRETTY_FUNCTION__
       << " | *isRestartInProgress = "
       << (*isRestartInProgress ? "True":"False")
       << " | *isAMDGPUModuleLive = " << (*isAMDGPUModuleLive ? "True":"False")
       << " | out = " << out;
    LOG_DEBUG(ss);
  }

  return ((*isAMDGPUModuleLive && !*isRestartInProgress) ?
          RSMI_STATUS_SUCCESS :
          RSMI_STATUS_AMDGPU_RESTART_ERR);
}
// Primary template: declared only. The explicit specializations below exist
// for rsmi_compute_partition_type_t and rsmi_memory_partition_type_t.
template <typename T> rsmi_status_t storeParameter(uint32_t dv_ind);

// Records the device's current compute partition as its "boot" state in a
// temporary file, unless such a file already exists.
//
// dv_ind - device index
//
// When the partition cannot be queried, "UNKNOWN" is stored instead.
// RSMI_STATUS_NOT_SUPPORTED from the query is not treated as an error; any
// other query failure is returned, unless storing the file also failed, in
// which case the storage error takes precedence.
template <>
rsmi_status_t storeParameter<rsmi_compute_partition_type_t>(uint32_t dv_ind) {
  bool already_stored;
  std::tie(already_stored, std::ignore) = readTmpFile(dv_ind, "boot",
                                                      "compute_partition");
  // if temporary file exists -> we do not need to store anything new
  if (already_stored) {
    return RSMI_STATUS_SUCCESS;
  }

  // if not, read & store the state value
  const uint32_t kLen = 128;
  char data[kLen];
  const rsmi_status_t query_status =
      rsmi_dev_compute_partition_get(dv_ind, data, kLen);

  rsmi_status_t result = RSMI_STATUS_SUCCESS;
  rsmi_status_t store_status;
  switch (query_status) {
    case RSMI_STATUS_SUCCESS:
      store_status = storeTmpFile(dv_ind, "compute_partition", "boot", data);
      break;
    case RSMI_STATUS_NOT_SUPPORTED:
      // not supported is ok
      store_status =
          storeTmpFile(dv_ind, "compute_partition", "boot", "UNKNOWN");
      break;
    default:
      store_status =
          storeTmpFile(dv_ind, "compute_partition", "boot", "UNKNOWN");
      result = query_status;
      break;
  }

  // file storage err takes precedence over other errors
  return (store_status != RSMI_STATUS_SUCCESS) ? store_status : result;
}
// Records the device's current memory partition as its "boot" state in a
// temporary file, unless such a file already exists.
//
// dv_ind - device index
//
// When the partition cannot be queried, "UNKNOWN" is stored instead.
// RSMI_STATUS_NOT_SUPPORTED from the query is not treated as an error; any
// other query failure is returned, unless storing the file also failed, in
// which case the storage error takes precedence.
template <>
rsmi_status_t storeParameter<rsmi_memory_partition_type_t>(uint32_t dv_ind) {
  rsmi_status_t returnStatus = RSMI_STATUS_SUCCESS;
  bool doesFileExist;
  std::tie(doesFileExist, std::ignore) = readTmpFile(dv_ind, "boot",
                                                     "memory_partition");
  // if temporary file exists -> we do not need to store anything new
  // if not, read & store the state value
  if (doesFileExist) {
    return returnStatus;
  }

  // The length must be a compile-time constant so that |data| is a plain
  // array rather than a variable-length array.
  const uint32_t kDatalength = 128;
  char data[kDatalength];
  rsmi_status_t ret = rsmi_dev_memory_partition_get(dv_ind, data, kDatalength);
  rsmi_status_t storeRet;

  if (ret == RSMI_STATUS_SUCCESS) {
    storeRet = storeTmpFile(dv_ind, "memory_partition", "boot", data);
  } else if (ret == RSMI_STATUS_NOT_SUPPORTED) {
    // not supported is ok
    storeRet = storeTmpFile(dv_ind, "memory_partition", "boot", "UNKNOWN");
  } else {
    storeRet = storeTmpFile(dv_ind, "memory_partition", "boot", "UNKNOWN");
    returnStatus = ret;
  }

  if (storeRet != RSMI_STATUS_SUCCESS) {
    // file storage err takes precedence over other errors
    returnStatus = storeRet;
  }
  return returnStatus;
}
// Stores both the compute and the memory partition boot state for device
// |dv_ind|. Both stores are always attempted; the earliest error is
// returned (the compute result wins unless it succeeded).
rsmi_status_t Device::storeDevicePartitions(uint32_t dv_ind) {
  const rsmi_status_t compute_status =
      storeParameter<rsmi_compute_partition_type_t>(dv_ind);
  const rsmi_status_t memory_status =
      storeParameter<rsmi_memory_partition_type_t>(dv_ind);

  // only record earliest error
  return (compute_status != RSMI_STATUS_SUCCESS) ? compute_status
                                                 : memory_status;
}
// Reads a device's stored boot-time compute partition state.
// Specialization for rsmi_compute_partition_type_t; a sibling
// specialization handles rsmi_memory_partition_type_t.
// dv_ind - device index
// Returns the second element of readTmpFile()'s result for the "boot" /
// "compute_partition" file; the first element is ignored.
template <>
std::string Device::readBootPartitionState<rsmi_compute_partition_type_t>(
    uint32_t dv_ind) {
  return std::get<1>(readTmpFile(dv_ind, "boot", "compute_partition"));
}
// Reads a device's stored boot-time memory partition state.
// Specialization for rsmi_memory_partition_type_t; a sibling
// specialization handles rsmi_compute_partition_type_t.
// dv_ind - device index
// Returns the second element of readTmpFile()'s result for the "boot" /
// "memory_partition" file; the first element is ignored.
template <>
std::string Device::readBootPartitionState<rsmi_memory_partition_type_t>(
    uint32_t dv_ind) {
  return std::get<1>(readTmpFile(dv_ind, "boot", "memory_partition"));
}
// Fills |*device_identifiers| for the device at SMI index |device_id|.
//
// The card index, DRM render minor, BDF id and KFD GPU id come from the
// Device object; the partition id comes from rsmi_dev_partition_id_get() and
// falls back to 0 when that call fails.
//
// Returns RSMI_STATUS_INVALID_ARGS for a null output pointer,
// RSMI_STATUS_NOT_SUPPORTED when |device_id| is not a valid device index,
// and RSMI_STATUS_SUCCESS otherwise.
rsmi_status_t Device::get_smi_device_identifiers(uint32_t device_id,
    rsmi_device_identifiers_t *device_identifiers) {
  if (device_identifiers == nullptr) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  std::ostringstream ss;
  amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
  auto devices = smi.devices();
  ss << __PRETTY_FUNCTION__ << " | device_id = " << device_id
     << "; devices.size() = " << devices.size();
  // std::cout << ss.str() << "\n";
  LOG_DEBUG(ss);

  // The SMI device id is the position in the device list, so index directly
  // instead of scanning.
  if (device_id >= devices.size()) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }

  const auto& device = devices[device_id];
  device_identifiers->card_index = device->index();
  device_identifiers->drm_render_minor = device->drm_render_minor();
  device_identifiers->bdfid = device->bdfid();
  device_identifiers->kfd_gpu_id = device->kfd_gpu_id();

  uint32_t partition_id = 0;
  const rsmi_status_t partition_status =
      rsmi_dev_partition_id_get(device_id, &partition_id);
  if (partition_status != RSMI_STATUS_SUCCESS) {
    partition_id = 0;
  }
  device_identifiers->partition_id = partition_id;
  device_identifiers->smi_device_id = device_id;

  ss << __PRETTY_FUNCTION__ << " | Found device: "
     << "card_index = " << device_identifiers->card_index
     << "; drm_render_minor = " << device_identifiers->drm_render_minor
     << "; bdfid = " << std::hex << "0x" << device_identifiers->bdfid
     << "; kfd_gpu_id = " << std::dec << device_identifiers->kfd_gpu_id
     << "; partition_id = " << device_identifiers->partition_id
     << "; smi_device_id = " << device_identifiers->smi_device_id;
  // std::cout << ss.str() << "\n";
  LOG_DEBUG(ss);

  return RSMI_STATUS_SUCCESS;
}
#undef RET_IF_NONZERO
} // namespace smi
} // namespace amd