[SWDEV-523794] Update to fix MIN_CLK and MAX_CLK incorrect values

(#280)

- Fixed potential issue with min/max values when only one frequency is available
- Improved error handling in GPU frequency range detection
- Refactored clock frequency range detection for better readability
- Added special handling for current frequency indicator (*) in DPM output
- Added comments explaining special case handling for current frequency
- Cleaned up incorrect definitions in hsmp metric table definition

---------

Signed-off-by: Juan Castillo <juan.castillo@amd.com>
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Co-authored-by: Maisam Arif <Maisam.Arif@amd.com>
Bu işleme şunda yer alıyor:
Castillo, Juan
2025-04-17 17:46:04 -05:00
işlemeyi yapan: GitHub
ebeveyn 581ad75729
işleme 4d92dea079
5 değiştirilmiş dosya ile 99 ekleme ve 108 silme
+52 -97
Dosyayı Görüntüle
@@ -1912,27 +1912,15 @@ class AMDSMICommands():
# Populate GFX clock values
try:
current_gfx_clocks = gpu_metric["current_gfxclks"]
if current_gfx_clocks == "N/A":
# If the current gfx clocks are not available, we cannot proceed further
for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_GFX_CLKS):
gfx_index = f"gfx_{clock_index}"
clocks[gfx_index]["clk"] = "N/A"
clocks[gfx_index]["min_clk"] = "N/A"
clocks[gfx_index]["max_clk"] = "N/A"
clocks[gfx_index]["clk_locked"] = "N/A"
clocks[gfx_index]["deep_sleep"] = "N/A" # assume deep sleep if no clocks are available
else:
if current_gfx_clocks != "N/A":
for clock_index, current_gfx_clock in enumerate(current_gfx_clocks):
# If the current clock is N/A then nothing else applies
if current_gfx_clock == "N/A":
continue
gfx_index = f"gfx_{clock_index}"
clocks[gfx_index]["clk"] = self.helpers.unit_format(self.logger,
current_gfx_clock,
clock_unit)
# Populate clock locked status
if gpu_metric["gfxclk_lock_status"] != "N/A":
gfx_clock_lock_flag = 1 << clock_index # This is the position of the clock lock flag
@@ -1940,7 +1928,7 @@ class AMDSMICommands():
clocks[gfx_index]["clk_locked"] = "ENABLED"
else:
clocks[gfx_index]["clk_locked"] = "DISABLED"
except KeyError as e:
except Exception as e:
logging.debug("Failed to get current_gfxclks for gpu %s | %s", gpu_id, e)
# Populate MEM clock value
@@ -1956,51 +1944,33 @@ class AMDSMICommands():
# Populate VCLK clock values
try:
current_vclk_clocks = gpu_metric["current_vclk0s"]
if current_vclk_clocks == "N/A":
# If the current vclk clocks are not available, we cannot proceed further
for clock_index in range(kMAX_NUM_VCLKS):
vclk_index = f"vclk_{clock_index}"
clocks[vclk_index]["clk"] = "N/A"
clocks[vclk_index]["min_clk"] = "N/A"
clocks[vclk_index]["max_clk"] = "N/A"
clocks[vclk_index]["clk_locked"] = "N/A"
clocks[vclk_index]["deep_sleep"] = "N/A"
else:
# If the current vclk clocks are not available, we cannot proceed further
if current_vclk_clocks != "N/A":
for clock_index, current_vclk_clock in enumerate(current_vclk_clocks):
# If the current clock is N/A then nothing else applies
if current_vclk_clock == "N/A":
continue
vclk_index = f"vclk_{clock_index}"
clocks[vclk_index]["clk"] = self.helpers.unit_format(self.logger,
current_vclk_clock,
clock_unit)
except KeyError as e:
except Exception as e:
logging.debug("Failed to get current_vclk0s for gpu %s | %s", gpu_id, e)
# Populate DCLK clock values
try:
current_dclk_clocks = gpu_metric["current_dclk0s"]
if current_dclk_clocks == "N/A":
# If the current dclk clocks are not available, we cannot proceed further
for clock_index in range(kMAX_NUM_DCLKS):
dclk_index = f"dclk_{clock_index}"
clocks[dclk_index]["clk"] = "N/A"
clocks[dclk_index]["min_clk"] = "N/A"
clocks[dclk_index]["max_clk"] = "N/A"
clocks[dclk_index]["clk_locked"] = "N/A"
clocks[dclk_index]["deep_sleep"] = "N/A"
else:
# If the current dclk clocks are not available, we cannot proceed further
if current_dclk_clocks != "N/A":
for clock_index, current_dclk_clock in enumerate(current_dclk_clocks):
# If the current clock is N/A then nothing else applies
if current_dclk_clock == "N/A":
continue
dclk_index = f"dclk_{clock_index}"
clocks[dclk_index]["clk"] = self.helpers.unit_format(self.logger,
current_dclk_clock,
clock_unit)
except KeyError as e:
except Exception as e:
logging.debug("Failed to get current_dclk0s for gpu %s | %s", gpu_id, e)
# Populate FCLK clock value; fclk not present in gpu_metrics so use amdsmi_get_clk_freq
@@ -2017,15 +1987,8 @@ class AMDSMICommands():
# Populate SOCCLK clock value
try:
current_socclk_clock = gpu_metric["current_socclk"]
if current_socclk_clock == "N/A":
# If the current socclk clocks are not available, we cannot proceed further
clocks["socclk_0"]["clk"] = "N/A"
clocks["socclk_0"]["min_clk"] = "N/A"
clocks["socclk_0"]["max_clk"] = "N/A"
clocks["socclk_0"]["clk_locked"] = "N/A"
clocks["socclk_0"]["deep_sleep"] = "N/A"
else:
# If the current clock is N/A then nothing else applies
# If the current socclk clocks are not available, we cannot proceed further
if current_socclk_clock != "N/A":
clocks["socclk_0"]["clk"] = self.helpers.unit_format(self.logger,
current_socclk_clock,
clock_unit)
@@ -2039,27 +2002,25 @@ class AMDSMICommands():
try:
gfx_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
amdsmi_interface.AmdSmiClkType.GFX)
for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_GFX_CLKS):
gfx_index = f"gfx_{clock_index}"
if clocks[gfx_index]["clk"] == "N/A":
# if the current clock is N/A then we shouldn't populate the max and min values
continue
clocks[gfx_index]["min_clk"] = self.helpers.unit_format(self.logger,
gfx_clock_info_dict["min_clk"],
clock_unit)
clocks[gfx_index]["max_clk"] = self.helpers.unit_format(self.logger,
gfx_clock_info_dict["max_clk"],
clock_unit)
except amdsmi_exception.AmdSmiLibraryException as e:
except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e:
logging.debug("Failed to get gfx clock info for gpu %s | %s", gpu_id, e.get_error_info())
# MEM min and max clocks
try:
mem_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
amdsmi_interface.AmdSmiClkType.MEM)
# if the current clock is N/A then we shouldn't populate the max and min values
if clocks["mem_0"]["clk"] != "N/A":
clocks["mem_0"]["min_clk"] = self.helpers.unit_format(self.logger,
@@ -2068,49 +2029,42 @@ class AMDSMICommands():
clocks["mem_0"]["max_clk"] = self.helpers.unit_format(self.logger,
mem_clock_info_dict["max_clk"],
clock_unit)
except amdsmi_exception.AmdSmiLibraryException as e:
except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e:
logging.debug("Failed to get mem clock info for gpu %s | %s", gpu_id, e.get_error_info())
# VCLK & DCLK min and max clocks
for clock_index in range(kMAX_NUM_DCLKS):
vclk_index = f"vclk_{clock_index}"
dclk_index = f"dclk_{clock_index}"
vclk_clock_info_dict = {"min_clk": "N/A", "max_clk": "N/A"}
dclk_clock_info_dict = {"min_clk": "N/A", "max_clk": "N/A"}
if clock_index == 0:
try:
vclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
amdsmi_interface.AmdSmiClkType.VCLK0)
dclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
amdsmi_interface.AmdSmiClkType.DCLK0)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get vclk0 and/or dclk0 clock info for gpu %s | %s", gpu_id, e.get_error_info())
if clock_index == 1:
try:
vclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
amdsmi_interface.AmdSmiClkType.VCLK1)
dclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
amdsmi_interface.AmdSmiClkType.DCLK1)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get vclk1 and/or dclk1 clock info for gpu %s | %s", gpu_id, e.get_error_info())
# VCLK min and max clocks
# Only VCLK entries whose current clock was populated get min/max values.
try:
    for index in range(kMAX_NUM_VCLKS):
        vclk_index = f"vclk_{index}"
        # if the current clock is N/A then we shouldn't populate the max and min values
        if clocks[vclk_index]["clk"] != "N/A":
            vclk_type = amdsmi_interface.AmdSmiClkType.__dict__[f'VCLK{index}']
            vclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, vclk_type)
            clocks[vclk_index]["min_clk"] = self.helpers.unit_format(self.logger,
                                                                     vclk_clock_info_dict["min_clk"],
                                                                     clock_unit)
            clocks[vclk_index]["max_clk"] = self.helpers.unit_format(self.logger,
                                                                     vclk_clock_info_dict["max_clk"],
                                                                     clock_unit)
except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e:
    # KeyError does not provide get_error_info(); calling it unconditionally would
    # raise AttributeError from inside this handler.
    if isinstance(e, amdsmi_exception.AmdSmiLibraryException):
        error_info = e.get_error_info()
    else:
        error_info = e
    logging.debug("Failed to get vclk clock info for gpu %s | %s", gpu_id, error_info)
# if the current clock is N/A then we shouldn't populate the max and min values
if vclk_clock_info_dict["min_clk"] != "N/A" and clock_index == 0:
clocks[vclk_index]["min_clk"] = self.helpers.unit_format(self.logger,
vclk_clock_info_dict["min_clk"],
clock_unit)
if vclk_clock_info_dict["max_clk"] != "N/A" and clock_index == 0:
clocks[vclk_index]["max_clk"] = self.helpers.unit_format(self.logger,
vclk_clock_info_dict["max_clk"],
clock_unit)
if dclk_clock_info_dict["min_clk"] != "N/A" and clock_index == 1:
clocks[dclk_index]["min_clk"] = self.helpers.unit_format(self.logger,
dclk_clock_info_dict["min_clk"],
clock_unit)
if dclk_clock_info_dict["max_clk"] != "N/A" and clock_index == 1:
clocks[dclk_index]["max_clk"] = self.helpers.unit_format(self.logger,
dclk_clock_info_dict["max_clk"],
clock_unit)
# DCLK min and max clocks
# Only DCLK entries whose current clock was populated get min/max values.
try:
    for index in range(kMAX_NUM_DCLKS):
        dclk_index = f"dclk_{index}"
        # if the current clock is N/A then we shouldn't populate the max and min values
        if clocks[dclk_index]["clk"] != "N/A":
            dclk_type = amdsmi_interface.AmdSmiClkType.__dict__[f'DCLK{index}']
            dclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, dclk_type)
            clocks[dclk_index]["min_clk"] = self.helpers.unit_format(self.logger,
                                                                     dclk_clock_info_dict["min_clk"],
                                                                     clock_unit)
            clocks[dclk_index]["max_clk"] = self.helpers.unit_format(self.logger,
                                                                     dclk_clock_info_dict["max_clk"],
                                                                     clock_unit)
except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e:
    # KeyError does not provide get_error_info(); calling it unconditionally would
    # raise AttributeError from inside this handler.
    if isinstance(e, amdsmi_exception.AmdSmiLibraryException):
        error_info = e.get_error_info()
    else:
        error_info = e
    logging.debug("Failed to get dclk clock info for gpu %s | %s", gpu_id, error_info)
# FCLK min and max clocks
try:
@@ -2147,13 +2101,14 @@ class AMDSMICommands():
if clocks[clock]["clk"] != "N/A" and clocks[clock]["min_clk"] != "N/A":
# Default to Disabled
clocks[clock]["deep_sleep"] = "DISABLED"
if self.logger.is_json_format():
if clocks[clock]["clk"]["value"] != "N/A" and clocks[clock]["min_clk"]["value"] != "N/A":
if clocks[clock]["clk"]["value"] < clocks[clock]["min_clk"]["value"]:
clocks[clock]["deep_sleep"] = "ENABLED"
else:
if clocks[clock]["clk"] < clocks[clock]["min_clk"]:
# Strip the unit from the formatted strings so the clocks compare numerically
try:
    # unit_unformat takes the logger first; it decides how the value was formatted
    clk = int(self.helpers.unit_unformat(self.logger, clocks[clock]["clk"]))
    # Compare against the minimum clock, not the current clock itself
    min_clk = int(self.helpers.unit_unformat(self.logger, clocks[clock]["min_clk"]))
    if clk < min_clk:
        clocks[clock]["deep_sleep"] = "ENABLED"
except (ValueError, TypeError) as e:
    logging.debug("Failed to unformat clock values for gpu %s | %s", gpu_id, e)
else:
clocks[clock]["deep_sleep"] = "N/A"
+19
Dosyayı Görüntüle
@@ -957,12 +957,31 @@ class AMDSMIHelpers():
return:
str or dict : formatted output
"""
if value == "N/A":
return "N/A"
if logger.is_json_format():
return {"value": value, "unit": unit}
if logger.is_human_readable_format():
return f"{value} {unit}".rstrip()
return f"{value}"
def unit_unformat(self, logger, formatted_value):
    """
    This function strips the unit from a formatted value, based on the logger output format
    params:
        logger (AMDSMILogger) - Logger whose output format decides how the value is unpacked
        formatted_value - the value to be unformatted
    return:
        JSON format: the 'value' entry when formatted_value is a dict, otherwise formatted_value
        human readable format: the first whitespace-separated token of a non-blank string,
            otherwise formatted_value unchanged
        any other format: formatted_value unchanged
    """
    if logger.is_json_format():
        if isinstance(formatted_value, dict):
            return formatted_value['value']
        return formatted_value
    if logger.is_human_readable_format():
        # Only strings can carry a " <unit>" suffix; pass anything else through
        if not isinstance(formatted_value, str):
            return formatted_value
        tokens = formatted_value.split()
        # A blank string has no first token; return it as-is instead of raising IndexError
        return tokens[0] if tokens else formatted_value
    return formatted_value
class SI_Unit(float, Enum):
GIGA = 1000000000 # 10^9
+1 -2
Dosyayı Görüntüle
@@ -1675,8 +1675,7 @@ def amdsmi_get_hsmp_metrics_table(
"mtbl_socket_thm_residency_acc": mtbl.socket_thm_residency_acc,
"mtbl_vr_thm_residency_acc": mtbl.vr_thm_residency_acc,
"mtbl_hbm_thm_residency_acc": mtbl.hbm_thm_residency_acc,
"mtbl_gfx_clk_below_host_residency_acc": mtbl.gfx_clk_below_host_residency_acc,
"mtbl_low_utilization_residency_acc": mtbl.low_utilization_residency_acc
"mtbl_gfxclk_frequency": f"{[round(x*fraction_uq10 ,3) for x in list(mtbl.gfxclk_frequency)]} MHz"
}
def amdsmi_first_online_core_on_cpu_socket(
+1 -1
Dosyayı Görüntüle
@@ -1178,7 +1178,7 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
violation_status->active_gfx_clk_below_host_limit = 0;
}
ss << __PRETTY_FUNCTION__ << " | "
<< "ENTERED gfx_clk_below_host_residency_acc | per_gfx_clk_below_host_limit: " << std::dec
<< "ENTERED gfx_below_host_limit_acc | per_gfx_clk_below_host_limit: " << std::dec
<< violation_status->per_gfx_clk_below_host_limit
<< "%; active_ppt_pwr = " << std::dec
<< violation_status->active_gfx_clk_below_host_limit << "\n";
+26 -8
Dosyayı Görüntüle
@@ -313,13 +313,14 @@ amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_
return AMDSMI_STATUS_NOT_SUPPORTED;
}
unsigned int max, min, dpm, sleep_freq;
unsigned int max, min, dpm, sleep_freq, current_freq;
char str[10];
char single_char;
max = 0;
min = UINT_MAX;
dpm = 0;
sleep_freq = UINT_MAX;
current_freq = 0;
for (std::string line; getline(ranges, line);) {
unsigned int dpm_level, freq;
@@ -331,21 +332,38 @@ amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_
return AMDSMI_STATUS_NO_DATA;
}
} else {
// skip this line if it contains a * which indicates the current level
const char *current_indicator = strstr(line.c_str(), "*");
if (current_indicator != nullptr){
continue;
}
/**
* if the first line contains '*', then
* we are saving that value as current_freq then checking
* for other dpm levels if none are found then we
* set min and max to current_freq as per Driver
* We then skip to the next line to avoid getting
* incorrect min value.
*/
if (sscanf(line.c_str(), "%u: %d%c", &dpm_level, &freq, str) <= 2){
ranges.close();
return AMDSMI_STATUS_IO;
}
char lastChar = line.back();
if (lastChar == '*'){
current_freq = freq;
continue;
}
// not * was detected so check for the min max
max = freq > max ? freq : max;
min = freq < min ? freq: min;
min = freq < min ? freq : min;
dpm = dpm_level > dpm ? dpm_level : dpm;
}
}
if (dpm == 0 && current_freq > 0) {
// if the dpm level is 0, then the current frequency is the min/max frequency
max = current_freq;
min = current_freq;
}
if (num_dpm)
*num_dpm = dpm;
if (max_freq)