From a75b7f741cea551d0c35e21e0cf9599e91117ddd Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Tue, 11 Jul 2023 06:58:00 -0500 Subject: [PATCH] Fix rsmitstReadWrite.TestPowerReadWrite test failure Code changes related to the following: * All reinforcement work moved to their own files * Self contained changes only to support them * New files added to CMakeLists.txt Change-Id: I761e91f54392824df9145eaed8b9805986861285 Signed-off-by: Oliveira, Daniel [ROCm/rocm_smi_lib commit: cc5ab079dfcdf0106f0a9482ae1e88c08d900957] --- projects/rocm-smi-lib/CMakeLists.txt | 2 + .../include/rocm_smi/rocm_smi_device.h | 9 +- .../include/rocm_smi/rocm_smi_properties.h | 160 +++++ .../include/rocm_smi/rocm_smi_utils.h | 1 + projects/rocm-smi-lib/src/rocm_smi.cc | 2 + projects/rocm-smi-lib/src/rocm_smi_device.cc | 2 + .../rocm-smi-lib/src/rocm_smi_properties.cc | 560 ++++++++++++++++++ projects/rocm-smi-lib/src/rocm_smi_utils.cc | 23 + 8 files changed, 758 insertions(+), 1 deletion(-) create mode 100644 projects/rocm-smi-lib/include/rocm_smi/rocm_smi_properties.h create mode 100644 projects/rocm-smi-lib/src/rocm_smi_properties.cc diff --git a/projects/rocm-smi-lib/CMakeLists.txt b/projects/rocm-smi-lib/CMakeLists.txt index 87117c9fa6..dd4c5d53f2 100755 --- a/projects/rocm-smi-lib/CMakeLists.txt +++ b/projects/rocm-smi-lib/CMakeLists.txt @@ -133,6 +133,7 @@ set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_io_link.cc") set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_gpu_metrics.cc") set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi.cc") set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_logger.cc") +set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_properties.cc") set(CMN_SRC_LIST ${CMN_SRC_LIST} "${SHR_MUTEX_DIR}/shared_mutex.cc") set(CMN_INC_LIST "${COMMON_INC_DIR}/rocm_smi_device.h") @@ -147,6 +148,7 @@ set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_kfd.h") set(CMN_INC_LIST ${CMN_INC_LIST} 
"${COMMON_INC_DIR}/rocm_smi_io_link.h") set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi.h") set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_logger.h") +set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_properties.h") set(CMN_INC_LIST ${CMN_INC_LIST} "${SHR_MUTEX_DIR}/shared_mutex.h") ## set components diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h index 3dcf7e1345..a1b2809457 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h @@ -52,12 +52,14 @@ #include #include #include +#include #include "rocm_smi/rocm_smi_monitor.h" #include "rocm_smi/rocm_smi_power_mon.h" #include "rocm_smi/rocm_smi_common.h" #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_counters.h" +#include "rocm_smi/rocm_smi_properties.h" #include "shared_mutex.h" //NOLINT namespace amd { @@ -173,6 +175,7 @@ typedef struct { std::vector variants; } dev_depends_t; + class Device { public: explicit Device(std::string path, RocmSMI_env_vars const *e); @@ -213,7 +216,7 @@ class Device { void set_evt_notif_anon_fd(uint32_t fd) { evt_notif_anon_fd_ = static_cast(fd);} int evt_notif_anon_fd(void) const {return evt_notif_anon_fd_;} - metrics_table_header_t & gpu_metrics_ver(void) {return gpu_metrics_ver_;} + metrics_table_header_t &gpu_metrics_ver(void) {return gpu_metrics_ver_;} void fillSupportedFuncs(void); void DumpSupportedFunctions(void); bool DeviceAPISupported(std::string name, uint64_t variant, @@ -221,6 +224,8 @@ class Device { rsmi_status_t restartAMDGpuDriver(void); rsmi_status_t storeDevicePartitions(uint32_t dv_ind); template std::string readBootPartitionState(uint32_t dv_ind); + rsmi_status_t check_amdgpu_property_reinforcement_query(uint32_t dev_idx, AMDGpuVerbTypes_t verb_type); + private: std::shared_ptr monitor_; @@ -241,6 +246,7 @@ class Device { int readDevInfoBinary(DevInfoTypes 
type, std::size_t b_size, void *p_binary_data); int writeDevInfoStr(DevInfoTypes type, std::string valStr); + rsmi_status_t run_amdgpu_property_reinforcement_query(const AMDGpuPropertyQuery_t& amdgpu_property_query); uint64_t bdfid_; uint64_t kfd_gpu_id_; std::unordered_set, + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ + +#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_PROPERTIES_H_ +#define INCLUDE_ROCM_SMI_ROCM_SMI_PROPERTIES_H_ + +#include "rocm_smi/rocm_smi_common.h" +#include "rocm_smi/rocm_smi.h" + +#include +#include + + +namespace amd { +namespace smi { + +// +// Property reinforcement check list +// +using AMDGpuPropertyId_t = uint32_t; +using AMDGpuDevIdx_t = uint32_t; +using AMDGpuVerbId_t = uint32_t; +using AMDGpuAsicId_t = uint16_t; +using AMDGpuAsicRevId_t = uint16_t; +using AMDGpuOpModeType_t = uint8_t; + +enum class AMDGpuVerbTypes_t : AMDGpuVerbId_t +{ + kNone = 0, + kSetGpuPciBandwidth, + kSetPowerCap, + kSetGpuPowerProfile, + kSetGpuClkRange, + kSetGpuOdClkInfo, + kSetGpuOdVoltInfo, + kSetGpuPerfLevelV1, + kSetGpuPerfLevel, + kGetGpuPowerProfilePresets, + kResetGpu, + kSetGpuPerfDeterminismMode, + kSetGpuFanSpeed, + kResetGpuFan, + kSetClkFreq, + kSetGpuOverdriveLevelV1, + kSetGpuOverdriveLevel, + kGetGpuFanRpms, + kGetGpuFanSpeed, + kGetGpuFanSpeedMax, + kGetGpuVoltMetric, + kGetGpuOverDriveLevel, + kGetGpuOdVoltInfo, + kGetGpuOdVoltCurveRegions, 
+}; +using AMDGpuVerbList_t = std::map; + + +enum class AMDGpuPropertyTypesOffset_t : AMDGpuPropertyId_t +{ + kNone = 0, + kDevInfoTypes = (0x1000 << 0), + kMonitorTypes = (0x1000 << 1), + kPerfTypes = (0x1000 << 2), + kClkTypes = (0x1000 << 3), + kVoltMetricTypes = (0x1000 << 4), +}; + +using AMDGpuPropertyOffsetType = std::underlying_type::type; +using AMDGpuPropertyTypesOffsetList_t = std::map; +AMDGpuPropertyTypesOffset_t operator| (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs); +AMDGpuPropertyTypesOffset_t operator& (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs); + + +enum class AMDGpuPropertyOpModeTypes_t : AMDGpuOpModeType_t +{ + kBareMetal = (0x1 << 0), + kSrIov = (0x1 << 1), + kBoth = (0x1 << 2), +}; + +using AMDGpuPropertyOpModeType = std::underlying_type::type; +using AMDGpuOpModeList_t = std::map; +AMDGpuPropertyOpModeTypes_t operator| (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs); +AMDGpuPropertyOpModeTypes_t operator& (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs); + + +struct AMDGpuProperties_t +{ + AMDGpuAsicRevId_t m_pci_rev_id; + AMDGpuPropertyId_t m_property; + AMDGpuVerbTypes_t m_verb_id; + AMDGpuPropertyOpModeTypes_t m_opmode; + bool m_should_be_available; +}; +using AMDGpuPropertyList_t = std::multimap; + +struct AMDGpuPropertyQuery_t +{ + AMDGpuAsicId_t m_asic_id; + AMDGpuAsicRevId_t m_pci_rev_id; + AMDGpuDevIdx_t m_dev_idx; + AMDGpuPropertyId_t m_property; + AMDGpuVerbTypes_t m_verb_id; +}; + + +// +AMDGpuPropertyId_t make_unique_property_id(AMDGpuPropertyTypesOffset_t type_offset, AMDGpuPropertyId_t property_id); +AMDGpuPropertyId_t unmake_unique_property_id(AMDGpuPropertyId_t property_id); + +rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind, + AMDGpuVerbTypes_t dev_info_type, + rsmi_status_t actual_error_code); + +void dump_amdgpu_property_reinforcement_list(); + + +} // namespace smi +} // namespace amd + +#endif // 
INCLUDE_ROCM_SMI_ROCM_SMI_PROPERTIES_H_ diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h index a655b5b136..c574c97508 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h @@ -100,6 +100,7 @@ std::tuple getSystemDetails(void); void logSystemDetails(void); +rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str); void logHexDump(const char *desc, const void *addr, const size_t len, size_t perLine); bool isSystemBigEndian(); diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index 61128c03eb..9d5ea6a367 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -2957,6 +2957,7 @@ rsmi_dev_power_profile_presets_get(uint32_t dv_ind, uint32_t reserved, DEVICE_MUTEX rsmi_status_t ret = get_power_profiles(dv_ind, status, nullptr); + return ret; CATCH } @@ -2973,6 +2974,7 @@ rsmi_dev_power_profile_set(uint32_t dv_ind, uint32_t dummy, (void)dummy; DEVICE_MUTEX rsmi_status_t ret = set_power_profile(dv_ind, profile); + return ret; CATCH } diff --git a/projects/rocm-smi-lib/src/rocm_smi_device.cc b/projects/rocm-smi-lib/src/rocm_smi_device.cc index 87077195ce..ddaf41a44a 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_device.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_device.cc @@ -59,6 +59,7 @@ #include #include #include +#include #include "rocm_smi/rocm_smi_main.h" #include "rocm_smi/rocm_smi_device.h" @@ -1385,6 +1386,7 @@ std::string Device::readBootPartitionState( return boot_state; } + #undef RET_IF_NONZERO } // namespace smi } // namespace amd diff --git a/projects/rocm-smi-lib/src/rocm_smi_properties.cc b/projects/rocm-smi-lib/src/rocm_smi_properties.cc new file mode 100644 index 0000000000..0e606e6874 --- /dev/null +++ b/projects/rocm-smi-lib/src/rocm_smi_properties.cc @@ -0,0 +1,560 @@ +/* + * 
============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. 
+ * + */ + +#include "rocm_smi/rocm_smi_properties.h" +#include "rocm_smi/rocm_smi_common.h" +#include "rocm_smi/rocm_smi.h" +#include "rocm_smi/rocm_smi_main.h" +#include "rocm_smi/rocm_smi_device.h" +#include "rocm_smi/rocm_smi_logger.h" + +#include +#include +#include + + +// +// Property reinforcement check list +// +// NOTE: This is a *temporary solution* until we get a better approach, likely +// a driver API that can give us the capabilities of a GPU in question. +// +namespace amd { +namespace smi { + +const AMDGpuOpModeList_t amdgpu_opmode_check_list { + {AMDGpuPropertyOpModeTypes_t::kBareMetal, "Bare Metal"}, + {AMDGpuPropertyOpModeTypes_t::kSrIov, "SR-IOV"}, + {AMDGpuPropertyOpModeTypes_t::kBoth, "Both"}, +}; + +const AMDGpuPropertyTypesOffsetList_t amdgpu_typeoffset_check_list { + {AMDGpuPropertyTypesOffset_t::kNone, "None"}, + {AMDGpuPropertyTypesOffset_t::kDevInfoTypes, "Device Info Type"}, + {AMDGpuPropertyTypesOffset_t::kMonitorTypes, "Monitor Type"}, + {AMDGpuPropertyTypesOffset_t::kPerfTypes, "Performance Type"}, + {AMDGpuPropertyTypesOffset_t::kClkTypes, "Clock Type"}, + {AMDGpuPropertyTypesOffset_t::kVoltMetricTypes, "Volt Metric Type"}, +}; + + +AMDGpuPropertyId_t make_unique_property_id(AMDGpuPropertyTypesOffset_t type_offset, AMDGpuPropertyId_t property_id) { + return (static_cast(type_offset) | (property_id)); +} + +AMDGpuPropertyId_t unmake_unique_property_id(AMDGpuPropertyId_t property_id) { + const auto property_type_offset_mask = + static_cast(AMDGpuPropertyTypesOffset_t::kDevInfoTypes) | + static_cast(AMDGpuPropertyTypesOffset_t::kMonitorTypes) | + static_cast(AMDGpuPropertyTypesOffset_t::kPerfTypes) | + static_cast(AMDGpuPropertyTypesOffset_t::kClkTypes) | + static_cast(AMDGpuPropertyTypesOffset_t::kVoltMetricTypes); + + auto property_type_offset = (static_cast(property_type_offset_mask) & (property_id)); + auto property_type_id = (static_cast(property_id) & ~(property_type_offset_mask)); + + return property_type_id; +} + 
+AMDGpuPropertyTypesOffset_t operator| (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs) +{ + if (lhs == rhs) { + return lhs; + } + + return AMDGpuPropertyTypesOffset_t(static_cast(lhs) | static_cast(rhs)); +} + +AMDGpuPropertyTypesOffset_t operator& (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs) +{ + if (lhs == rhs) { + return lhs; + } + + return AMDGpuPropertyTypesOffset_t(static_cast(lhs) & static_cast(rhs)); +} + +AMDGpuPropertyOpModeTypes_t operator| (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs) +{ + if (lhs == rhs) { + return lhs; + } + + return AMDGpuPropertyOpModeTypes_t(static_cast(lhs) | static_cast(rhs)); +} + +AMDGpuPropertyOpModeTypes_t operator& (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs) +{ + if (lhs == rhs) { + return lhs; + } + + return AMDGpuPropertyOpModeTypes_t(static_cast(lhs) & static_cast(rhs)); +} + + +// +// Note: Due to the fact that we have different enum elements with the same +// number, keying a hash by the number is not an option; ie: +// - DevInfoTypes::kDevVendorID = 7 +// - MonitorTypes::kMonPowerCapDefault = 7 +// So, we are keying it by a unique key, based on their info types +// amdgpu_verb_check_list: verb id -> API name text streamed into the trace messages (looked up with .at()). +const AMDGpuVerbList_t amdgpu_verb_check_list { + { AMDGpuVerbTypes_t::kNone, "None" }, + { AMDGpuVerbTypes_t::kSetGpuPciBandwidth, "amdsmi_set_gpu_pci_bandwidth" }, + { AMDGpuVerbTypes_t::kSetPowerCap, "amdsmi_set_power_cap" }, + { AMDGpuVerbTypes_t::kSetGpuPowerProfile, "amdsmi_set_gpu_power_profile" }, + { AMDGpuVerbTypes_t::kSetGpuClkRange, "amdsmi_set_gpu_clk_range" }, + { AMDGpuVerbTypes_t::kSetGpuOdClkInfo, "amdsmi_set_gpu_od_clk_info" }, + { AMDGpuVerbTypes_t::kSetGpuOdVoltInfo, "amdsmi_set_gpu_od_volt_info" }, + { AMDGpuVerbTypes_t::kSetGpuPerfLevelV1, "amdsmi_set_gpu_perf_level_v1" }, + { AMDGpuVerbTypes_t::kSetGpuPerfLevel, "amdsmi_set_gpu_perf_level" }, + { AMDGpuVerbTypes_t::kGetGpuPowerProfilePresets, "amdsmi_get_gpu_power_profile_presets" }, + { 
AMDGpuVerbTypes_t::kResetGpu, "amdsmi_reset_gpu" }, + { AMDGpuVerbTypes_t::kSetGpuPerfDeterminismMode, "amdsmi_set_gpu_perf_determinism_mode" }, + { AMDGpuVerbTypes_t::kSetGpuFanSpeed, "amdsmi_set_gpu_fan_speed" }, + { AMDGpuVerbTypes_t::kResetGpuFan, "amdsmi_reset_gpu_fan" }, + { AMDGpuVerbTypes_t::kSetClkFreq, "amdsmi_set_clk_freq" }, + { AMDGpuVerbTypes_t::kSetGpuOverdriveLevelV1, "amdsmi_set_gpu_overdrive_level_v1" }, + { AMDGpuVerbTypes_t::kSetGpuOverdriveLevel, "amdsmi_set_gpu_overdrive_level" }, + { AMDGpuVerbTypes_t::kGetGpuFanRpms, "amdsmi_get_gpu_fan_rpms" }, + { AMDGpuVerbTypes_t::kGetGpuFanSpeed, "amdsmi_get_gpu_fan_speed" }, + { AMDGpuVerbTypes_t::kGetGpuFanSpeedMax, "amdsmi_get_gpu_fan_speed_max" }, + { AMDGpuVerbTypes_t::kGetGpuVoltMetric, "amdsmi_get_gpu_volt_metric" }, + { AMDGpuVerbTypes_t::kGetGpuOverDriveLevel, "amdsmi_get_gpu_overdrive_level" }, + { AMDGpuVerbTypes_t::kGetGpuOdVoltInfo, "amdsmi_get_gpu_od_volt_info" }, + { AMDGpuVerbTypes_t::kGetGpuOdVoltCurveRegions, "amdsmi_get_gpu_od_volt_curve_regions" } +}; + +const uint16_t kDevRevIDAll(0xFFFF); +const AMDGpuPropertyList_t amdgpu_property_reinforcement_list { + // + // {"Asic ID", {"Asic Rev. 
ID", "Unique Property ID", "Verb ID", "Property Op.Mode", "Availability Flag"}} + // DevInfoTypes::kDevPCIEClk = rsmi_dev_pci_bandwidth_get; rsmi_dev_pci_bandwidth_set + // MonitorTypes::kMonPowerCapDefault = rsmi_dev_power_cap_default_get; + // DevInfoTypes::kDevPowerProfileMode = + // rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL = rsmi_dev_clk_range_set; + // + + // AMD Instinct MI210 + {0x740F, {0x02, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevPowerProfileMode), + AMDGpuVerbTypes_t::kSetGpuPowerProfile, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + + // AMD MIxxx + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevPCIEClk), + AMDGpuVerbTypes_t::kSetGpuPciBandwidth, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonPowerCapDefault), + AMDGpuVerbTypes_t::kSetPowerCap, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevPowerProfileMode), + AMDGpuVerbTypes_t::kSetGpuPowerProfile, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, + rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL), + AMDGpuVerbTypes_t::kSetGpuClkRange, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, + rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL), + AMDGpuVerbTypes_t::kSetGpuOdClkInfo, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, + rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL), + AMDGpuVerbTypes_t::kSetGpuOdVoltInfo, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, 
+ {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, + rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_AUTO), + AMDGpuVerbTypes_t::kSetGpuPerfLevelV1, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, + rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL), + AMDGpuVerbTypes_t::kSetGpuPerfLevel, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevPowerProfileMode), + AMDGpuVerbTypes_t::kGetGpuPowerProfilePresets, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevGpuReset), + AMDGpuVerbTypes_t::kResetGpu, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, + rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_DETERMINISM), + AMDGpuVerbTypes_t::kSetGpuPerfDeterminismMode, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonFanSpeed), + AMDGpuVerbTypes_t::kSetGpuFanSpeed, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonFanCntrlEnable), + AMDGpuVerbTypes_t::kResetGpuFan, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kClkTypes, + rsmi_clk_type::RSMI_CLK_TYPE_FIRST), + AMDGpuVerbTypes_t::kSetClkFreq, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevOverDriveLevel), + AMDGpuVerbTypes_t::kSetGpuOverdriveLevel, + 
AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevOverDriveLevel), + AMDGpuVerbTypes_t::kSetGpuOverdriveLevelV1, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonFanRPMs), + AMDGpuVerbTypes_t::kGetGpuFanRpms, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonFanSpeed), + AMDGpuVerbTypes_t::kGetGpuFanSpeed, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonMaxFanSpeed), + AMDGpuVerbTypes_t::kGetGpuFanSpeedMax, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kVoltMetricTypes, + rsmi_voltage_metric_t::RSMI_VOLT_CURRENT), + AMDGpuVerbTypes_t::kGetGpuVoltMetric, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevOverDriveLevel), + AMDGpuVerbTypes_t::kGetGpuOverDriveLevel, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevPowerODVoltage), + AMDGpuVerbTypes_t::kGetGpuOdVoltInfo, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevPowerODVoltage), + AMDGpuVerbTypes_t::kGetGpuOdVoltCurveRegions, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + } +}; + + +rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind, AMDGpuVerbTypes_t verb_type, 
rsmi_status_t actual_error_code) +{ + std::ostringstream osstream; + osstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + osstream << __PRETTY_FUNCTION__ << " actual error code: " << actual_error_code << "\n"; + LOG_TRACE(osstream); + + if (actual_error_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + return actual_error_code; + } + + // + // For property reinforcement query, the possible return values are: + // RSMI_STATUS_SUCCESS: + // - Property found in the reinforcement table, and it *should exist* + // RSMI_STATUS_NOT_SUPPORTED: + // - Property found in the reinforcement table, and it *should not* exist + // RSMI_STATUS_NO_DATA: + // - Could not find the correct dev_id and dev_revision info to build the filter + // RSMI_STATUS_UNKNOWN_ERROR: + // - The results are initialized with that. If that is returned, + // likely the reinforcement table does not contain any entries/rules for the + // dev_id in question. + // + auto amdgpu_property_query_result_hdlr = [](rsmi_status_t query_result) { + switch (query_result) { + case (rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR): + case (rsmi_status_t::RSMI_STATUS_NO_DATA): + return rsmi_status_t::RSMI_STATUS_NOT_FOUND; + break; + + case (rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED): + case (rsmi_status_t::RSMI_STATUS_SUCCESS): + return query_result; + break; + + default: + return rsmi_status_t::RSMI_STATUS_NOT_FOUND; + break; + } + }; + + /// + GET_DEV_FROM_INDX + osstream << __PRETTY_FUNCTION__ << "| ======= about to run property query =======" + << " [query filters: ]" + << " device: " << dv_ind + << " property/verb: " << static_cast(verb_type) << amdgpu_verb_check_list.at(verb_type); + auto reinforcement_query_result = dev->check_amdgpu_property_reinforcement_query(dv_ind, verb_type); + osstream << __PRETTY_FUNCTION__ << "| ======= result from property query =======" + << " query result: " << reinforcement_query_result; + + reinforcement_query_result = amdgpu_property_query_result_hdlr(reinforcement_query_result); 
+ osstream << __PRETTY_FUNCTION__ << "| ======= result from property query =======" + << " query result: " << reinforcement_query_result; + LOG_TRACE(osstream); + return reinforcement_query_result; +} +// Streams every entry of amdgpu_property_reinforcement_list (or a note that it is empty) and hands the text to LOG_TRACE. +void dump_amdgpu_property_reinforcement_list() +{ + std::ostringstream osstream; + osstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + if (!amdgpu_property_reinforcement_list.empty()) { + for (const auto& property : amdgpu_property_reinforcement_list) { + osstream << __PRETTY_FUNCTION__ + << " Asic ID: " << property.first + << " Asic Rev.ID: " << property.second.m_pci_rev_id + << " Property ID: " << property.second.m_property + << " Verb ID : " << static_cast(property.second.m_verb_id) + << " Verb Desc: " << amdgpu_verb_check_list.at(property.second.m_verb_id) + << " OpMode: " << static_cast(property.second.m_opmode) + << " OpMode Desc: " << amdgpu_opmode_check_list.at(property.second.m_opmode) + << " Flag Avail.: " << property.second.m_should_be_available; + } + osstream << __PRETTY_FUNCTION__ << "| ======= end ======="; + } else { + osstream << __PRETTY_FUNCTION__ << "amdgpu_property_reinforcement_list is empty"; + } + // Log on both paths: the populated-table branch used to return before its text was ever emitted. + LOG_TRACE(osstream); +} + + +rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx, AMDGpuVerbTypes_t verb_type) +{ + std::ostringstream osstream; + auto rsmi_status(rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR); + + AMDGpuPropertyQuery_t amdgpu_property_query = [&]() { + AMDGpuPropertyQuery_t amdgpu_property_query_init{}; + amdgpu_property_query_init.m_asic_id = 0; + amdgpu_property_query_init.m_pci_rev_id = 0; + amdgpu_property_query_init.m_dev_idx = dev_idx; + amdgpu_property_query_init.m_property = 0; + amdgpu_property_query_init.m_verb_id = verb_type; + return amdgpu_property_query_init; + }(); + + auto build_asic_id_filters = [&](const AMDGpuPropertyQuery_t& amdgpu_query_validate, bool& is_filter_good) { + auto tmp_amdgpu_query = amdgpu_query_validate; + auto id_filter_result(rsmi_status_t::RSMI_STATUS_SUCCESS); + if 
(amdgpu_query_validate.m_asic_id == 0) { + id_filter_result = rsmi_dev_id_get(dev_idx, &tmp_amdgpu_query.m_asic_id); + if (id_filter_result == rsmi_status_t::RSMI_STATUS_SUCCESS) { + id_filter_result = rsmi_dev_revision_get(dev_idx, &tmp_amdgpu_query.m_pci_rev_id); + } + } + is_filter_good = (id_filter_result == rsmi_status_t::RSMI_STATUS_SUCCESS) ? true : false; + return tmp_amdgpu_query; + }; + + // If the original amdgpu_query is missing parts of the filter, such as; + // asic_id, revision_id, we try to retrieve them based on the dev_idx. + // the property we are searching for, *must be present* . + osstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(osstream); + + bool is_proper_query(false); + amdgpu_property_query = build_asic_id_filters(amdgpu_property_query, is_proper_query); + if (!is_proper_query) { + rsmi_status = rsmi_status_t::RSMI_STATUS_NO_DATA; + osstream << __PRETTY_FUNCTION__ << "| ======= end =======" + << ", Missing Query Filters were not successfully retrieved: " + << " [query filters: ]" + << " device: " << dev_idx + << " asic id: " << amdgpu_property_query.m_asic_id + << " revision id: " << amdgpu_property_query.m_pci_rev_id + << " property: " << amdgpu_property_query.m_property + << " verb: " << static_cast(amdgpu_property_query.m_verb_id) + << " proper_query: " << is_proper_query + << " error: " << rsmi_status; + LOG_TRACE(osstream); + return rsmi_status; + } + + return run_amdgpu_property_reinforcement_query(amdgpu_property_query); +} + +rsmi_status_t Device::run_amdgpu_property_reinforcement_query(const AMDGpuPropertyQuery_t& amdgpu_property_query) +{ + std::ostringstream osstream; + auto rsmi_status(rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR); + + auto contains = [](const uint16_t asic_id) { + return (amdgpu_property_reinforcement_list.find(asic_id) != amdgpu_property_reinforcement_list.end()); + }; + + auto ends_with = [](const std::string& value, const std::string& ending) { + if (value.size() < ending.size()) { + 
return false; + } + return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); + }; + + // Traverse through all values for a given key + osstream << __PRETTY_FUNCTION__ << "| ======= start =======" << "\n"; + LOG_TRACE(osstream); + if (contains(amdgpu_property_query.m_asic_id)) { + osstream << __PRETTY_FUNCTION__ << " asic id found in table: " << amdgpu_property_query.m_asic_id << "\n"; + auto itr_begin = amdgpu_property_reinforcement_list.lower_bound(amdgpu_property_query.m_asic_id); + auto itr_end = amdgpu_property_reinforcement_list.upper_bound(amdgpu_property_query.m_asic_id); + while (itr_begin != itr_end) { + // Still same key, and... + if (itr_begin->first == amdgpu_property_query.m_asic_id) { + osstream << __PRETTY_FUNCTION__ << " asic id found: " << itr_begin->first << "\n"; + // Pci_rev_id matches the filter or ALL Revisions + if ((itr_begin->second.m_pci_rev_id == amdgpu_property_query.m_pci_rev_id) || + (itr_begin->second.m_pci_rev_id == kDevRevIDAll)) { + osstream << __PRETTY_FUNCTION__ << " asic rev.id found: " << itr_begin->second.m_pci_rev_id << "\n"; + // Do we have the property we are looking for? 
+ if (((amdgpu_property_query.m_property != 0) && + (itr_begin->second.m_property == amdgpu_property_query.m_property)) || + ((amdgpu_property_query.m_verb_id != AMDGpuVerbTypes_t::kNone) && + (itr_begin->second.m_verb_id == amdgpu_property_query.m_verb_id))) { + osstream << __PRETTY_FUNCTION__ + << " property found: " << itr_begin->second.m_property + << " verb found: " << static_cast(itr_begin->second.m_verb_id) + << " " << amdgpu_verb_check_list.at(amdgpu_property_query.m_verb_id) + << " should_be_available: " << itr_begin->second.m_should_be_available << "\n"; + // and if we do, should we consider it available, or forcefully + // considered it unavailable + osstream << __PRETTY_FUNCTION__ << "| ======= validating =======" + << ", Property found in the table for this device and flagged as *Not Available* : " + << " [query filters: ]" + << " device: " << amdgpu_property_query.m_dev_idx + << " asic id: " << amdgpu_property_query.m_asic_id + << " revision id: " << amdgpu_property_query.m_pci_rev_id + << " reinf.tbl.rev. 
id: " << itr_begin->second.m_pci_rev_id; + // + // The property is set in the reinforcement table to 'it should not be available' + if (!itr_begin->second.m_should_be_available) { + // If the property is found and set to not available + // (rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED), + // it should be all good (rsmi_status_t::RSMI_STATUS_SUCCESS); + rsmi_status = rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED; + osstream << __PRETTY_FUNCTION__ + << " should_be_available: " << itr_begin->second.m_should_be_available + << " result: " << rsmi_status << "\n"; + LOG_TRACE(osstream); + return rsmi_status; + } + // + // The property is set in the reinforcement table to 'it should be available' + rsmi_status = rsmi_status_t::RSMI_STATUS_SUCCESS; + osstream << __PRETTY_FUNCTION__ + << " should_be_available: " << itr_begin->second.m_should_be_available + << " result: " << rsmi_status << "\n"; + LOG_TRACE(osstream); + return rsmi_status; + } + } + } + itr_begin++; + } + } + + osstream << __PRETTY_FUNCTION__ << "| ======= end =======" + << "Done searching for the Property in reinforcement table for this device: " + << " device: " << amdgpu_property_query.m_dev_idx + << " asic id: " << amdgpu_property_query.m_asic_id + << " revision id: " << amdgpu_property_query.m_pci_rev_id + << " property id: " << amdgpu_property_query.m_property + << " error: " << rsmi_status; + LOG_TRACE(osstream); + return rsmi_status; +} + + +} // namespace smi +} // namespace amd diff --git a/projects/rocm-smi-lib/src/rocm_smi_utils.cc b/projects/rocm-smi-lib/src/rocm_smi_utils.cc index 3c997ccf9d..7f1268a995 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_utils.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_utils.cc @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -748,5 +749,27 @@ bool isSystemBigEndian() { return isBigEndian; } +rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str) +{ // Writes "bb:dd.f" (hex bus:device.function decoded from bdf_id) into bfd_str; returns RSMI_STATUS_NO_DATA with an empty string when the bus field is 0. + auto result = rsmi_status_t::RSMI_STATUS_SUCCESS; + auto bus_id = 
static_cast((bdf_id & 0x0000FF00) >> 8); + auto dev_id = static_cast((bdf_id & 0x000000F8) >> 3); + auto func_id = static_cast(bdf_id & 0x00000007); // function = the three bits below the device field (was 0x3, which dropped functions 4-7) + // Always reset the output first. NOTE(review): bus 0 is rejected below - confirm no supported device can sit on bus 0. + bfd_str = std::string(); + if (!(bus_id > 0)) { + result = rsmi_status_t::RSMI_STATUS_NO_DATA; + return result; + } + + std::stringstream bdf_sstream; + bdf_sstream << std::hex << std::setfill('0') << std::setw(sizeof(uint8_t) * 2) << +bus_id << ":"; + bdf_sstream << std::hex << std::setfill('0') << std::setw(sizeof(uint8_t) * 2) << +dev_id << "."; + bdf_sstream << std::hex << std::setfill('0') << +func_id; + bfd_str = bdf_sstream.str(); + return result; +} + + } // namespace smi } // namespace amd