[SWDEV-523794] Update to fix MIN_CLK and MAX_CLK incorrect values (#280)

- Fixed potential issue with min/max values when only one frequency is available
- Improved error handling in GPU frequency range detection
- Refactored clock frequency range detection for better readability
- Added special handling for the current frequency indicator (*) in DPM output
- Added comments explaining the special-case handling for the current frequency
- Cleaned up incorrect definitions in the HSMP metric table definition

---------

Signed-off-by: Juan Castillo <juan.castillo@amd.com>
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Co-authored-by: Maisam Arif <Maisam.Arif@amd.com>
Bu işleme şunda yer alıyor:
işlemeyi yapan:
GitHub
ebeveyn
581ad75729
işleme
4d92dea079
@@ -1912,27 +1912,15 @@ class AMDSMICommands():
|
||||
# Populate GFX clock values
|
||||
try:
|
||||
current_gfx_clocks = gpu_metric["current_gfxclks"]
|
||||
if current_gfx_clocks == "N/A":
|
||||
# If the current gfx clocks are not available, we cannot proceed further
|
||||
for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_GFX_CLKS):
|
||||
gfx_index = f"gfx_{clock_index}"
|
||||
clocks[gfx_index]["clk"] = "N/A"
|
||||
clocks[gfx_index]["min_clk"] = "N/A"
|
||||
clocks[gfx_index]["max_clk"] = "N/A"
|
||||
clocks[gfx_index]["clk_locked"] = "N/A"
|
||||
clocks[gfx_index]["deep_sleep"] = "N/A" # assume deep sleep if no clocks are available
|
||||
|
||||
else:
|
||||
if current_gfx_clocks != "N/A":
|
||||
for clock_index, current_gfx_clock in enumerate(current_gfx_clocks):
|
||||
# If the current clock is N/A then nothing else applies
|
||||
if current_gfx_clock == "N/A":
|
||||
continue
|
||||
|
||||
gfx_index = f"gfx_{clock_index}"
|
||||
clocks[gfx_index]["clk"] = self.helpers.unit_format(self.logger,
|
||||
current_gfx_clock,
|
||||
clock_unit)
|
||||
|
||||
# Populate clock locked status
|
||||
if gpu_metric["gfxclk_lock_status"] != "N/A":
|
||||
gfx_clock_lock_flag = 1 << clock_index # This is the position of the clock lock flag
|
||||
@@ -1940,7 +1928,7 @@ class AMDSMICommands():
|
||||
clocks[gfx_index]["clk_locked"] = "ENABLED"
|
||||
else:
|
||||
clocks[gfx_index]["clk_locked"] = "DISABLED"
|
||||
except KeyError as e:
|
||||
except Exception as e:
|
||||
logging.debug("Failed to get current_gfxclks for gpu %s | %s", gpu_id, e)
|
||||
|
||||
# Populate MEM clock value
|
||||
@@ -1956,51 +1944,33 @@ class AMDSMICommands():
|
||||
# Populate VCLK clock values
|
||||
try:
|
||||
current_vclk_clocks = gpu_metric["current_vclk0s"]
|
||||
if current_vclk_clocks == "N/A":
|
||||
# If the current vclk clocks are not available, we cannot proceed further
|
||||
for clock_index in range(kMAX_NUM_VCLKS):
|
||||
vclk_index = f"vclk_{clock_index}"
|
||||
clocks[vclk_index]["clk"] = "N/A"
|
||||
clocks[vclk_index]["min_clk"] = "N/A"
|
||||
clocks[vclk_index]["max_clk"] = "N/A"
|
||||
clocks[vclk_index]["clk_locked"] = "N/A"
|
||||
clocks[vclk_index]["deep_sleep"] = "N/A"
|
||||
else:
|
||||
# If the current vclk clocks are not available, we cannot proceed further
|
||||
if current_vclk_clocks != "N/A":
|
||||
for clock_index, current_vclk_clock in enumerate(current_vclk_clocks):
|
||||
# If the current clock is N/A then nothing else applies
|
||||
if current_vclk_clock == "N/A":
|
||||
continue
|
||||
|
||||
vclk_index = f"vclk_{clock_index}"
|
||||
clocks[vclk_index]["clk"] = self.helpers.unit_format(self.logger,
|
||||
current_vclk_clock,
|
||||
clock_unit)
|
||||
except KeyError as e:
|
||||
except Exception as e:
|
||||
logging.debug("Failed to get current_vclk0s for gpu %s | %s", gpu_id, e)
|
||||
|
||||
# Populate DCLK clock values
|
||||
try:
|
||||
current_dclk_clocks = gpu_metric["current_dclk0s"]
|
||||
if current_dclk_clocks == "N/A":
|
||||
# If the current dclk clocks are not available, we cannot proceed further
|
||||
for clock_index in range(kMAX_NUM_DCLKS):
|
||||
dclk_index = f"dclk_{clock_index}"
|
||||
clocks[dclk_index]["clk"] = "N/A"
|
||||
clocks[dclk_index]["min_clk"] = "N/A"
|
||||
clocks[dclk_index]["max_clk"] = "N/A"
|
||||
clocks[dclk_index]["clk_locked"] = "N/A"
|
||||
clocks[dclk_index]["deep_sleep"] = "N/A"
|
||||
else:
|
||||
# If the current dclk clocks are not available, we cannot proceed further
|
||||
if current_dclk_clocks != "N/A":
|
||||
for clock_index, current_dclk_clock in enumerate(current_dclk_clocks):
|
||||
# If the current clock is N/A then nothing else applies
|
||||
if current_dclk_clock == "N/A":
|
||||
continue
|
||||
|
||||
dclk_index = f"dclk_{clock_index}"
|
||||
clocks[dclk_index]["clk"] = self.helpers.unit_format(self.logger,
|
||||
current_dclk_clock,
|
||||
clock_unit)
|
||||
except KeyError as e:
|
||||
except Exception as e:
|
||||
logging.debug("Failed to get current_dclk0s for gpu %s | %s", gpu_id, e)
|
||||
|
||||
# Populate FCLK clock value; fclk not present in gpu_metrics so use amdsmi_get_clk_freq
|
||||
@@ -2017,15 +1987,8 @@ class AMDSMICommands():
|
||||
# Populate SOCCLK clock value
|
||||
try:
|
||||
current_socclk_clock = gpu_metric["current_socclk"]
|
||||
if current_socclk_clock == "N/A":
|
||||
# If the current socclk clocks are not available, we cannot proceed further
|
||||
clocks["socclk_0"]["clk"] = "N/A"
|
||||
clocks["socclk_0"]["min_clk"] = "N/A"
|
||||
clocks["socclk_0"]["max_clk"] = "N/A"
|
||||
clocks["socclk_0"]["clk_locked"] = "N/A"
|
||||
clocks["socclk_0"]["deep_sleep"] = "N/A"
|
||||
else:
|
||||
# If the current clock is N/A then nothing else applies
|
||||
# If the current socclk clocks are not available, we cannot proceed further
|
||||
if current_socclk_clock != "N/A":
|
||||
clocks["socclk_0"]["clk"] = self.helpers.unit_format(self.logger,
|
||||
current_socclk_clock,
|
||||
clock_unit)
|
||||
@@ -2039,27 +2002,25 @@ class AMDSMICommands():
|
||||
try:
|
||||
gfx_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
|
||||
amdsmi_interface.AmdSmiClkType.GFX)
|
||||
|
||||
for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_GFX_CLKS):
|
||||
gfx_index = f"gfx_{clock_index}"
|
||||
|
||||
if clocks[gfx_index]["clk"] == "N/A":
|
||||
# if the current clock is N/A then we shouldn't populate the max and min values
|
||||
continue
|
||||
|
||||
clocks[gfx_index]["min_clk"] = self.helpers.unit_format(self.logger,
|
||||
gfx_clock_info_dict["min_clk"],
|
||||
clock_unit)
|
||||
clocks[gfx_index]["max_clk"] = self.helpers.unit_format(self.logger,
|
||||
gfx_clock_info_dict["max_clk"],
|
||||
clock_unit)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e:
|
||||
logging.debug("Failed to get gfx clock info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
# MEM min and max clocks
|
||||
try:
|
||||
mem_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
|
||||
amdsmi_interface.AmdSmiClkType.MEM)
|
||||
|
||||
# if the current clock is N/A then we shouldn't populate the max and min values
|
||||
if clocks["mem_0"]["clk"] != "N/A":
|
||||
clocks["mem_0"]["min_clk"] = self.helpers.unit_format(self.logger,
|
||||
@@ -2068,49 +2029,42 @@ class AMDSMICommands():
|
||||
clocks["mem_0"]["max_clk"] = self.helpers.unit_format(self.logger,
|
||||
mem_clock_info_dict["max_clk"],
|
||||
clock_unit)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e:
|
||||
logging.debug("Failed to get mem clock info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
# VCLK & DCLK min and max clocks
|
||||
for clock_index in range(kMAX_NUM_DCLKS):
|
||||
vclk_index = f"vclk_{clock_index}"
|
||||
dclk_index = f"dclk_{clock_index}"
|
||||
vclk_clock_info_dict = {"min_clk": "N/A", "max_clk": "N/A"}
|
||||
dclk_clock_info_dict = {"min_clk": "N/A", "max_clk": "N/A"}
|
||||
if clock_index == 0:
|
||||
try:
|
||||
vclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
|
||||
amdsmi_interface.AmdSmiClkType.VCLK0)
|
||||
dclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
|
||||
amdsmi_interface.AmdSmiClkType.DCLK0)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get vclk0 and/or dclk0 clock info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
if clock_index == 1:
|
||||
try:
|
||||
vclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
|
||||
amdsmi_interface.AmdSmiClkType.VCLK1)
|
||||
dclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
|
||||
amdsmi_interface.AmdSmiClkType.DCLK1)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get vclk1 and/or dclk1 clock info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
# VCLK min and max clocks
|
||||
try:
|
||||
for index in range(kMAX_NUM_VCLKS):
|
||||
vclk_index = f"vclk_{index}"
|
||||
if clocks[vclk_index]["clk"] != "N/A":
|
||||
# if the current clock is N/A then we shouldn't populate the max and min values
|
||||
vclk_type = amdsmi_interface.AmdSmiClkType.__dict__[f'VCLK{index}']
|
||||
vclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, vclk_type)
|
||||
clocks[vclk_index]["min_clk"] = self.helpers.unit_format(self.logger,
|
||||
vclk_clock_info_dict["min_clk"],
|
||||
clock_unit)
|
||||
clocks[vclk_index]["max_clk"] = self.helpers.unit_format(self.logger,
|
||||
vclk_clock_info_dict["max_clk"],
|
||||
clock_unit)
|
||||
except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e:
|
||||
logging.debug("Failed to get vclk clock info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
# if the current clock is N/A then we shouldn't populate the max and min values
|
||||
if vclk_clock_info_dict["min_clk"] != "N/A" and clock_index == 0:
|
||||
clocks[vclk_index]["min_clk"] = self.helpers.unit_format(self.logger,
|
||||
vclk_clock_info_dict["min_clk"],
|
||||
clock_unit)
|
||||
if vclk_clock_info_dict["max_clk"] != "N/A" and clock_index == 0:
|
||||
clocks[vclk_index]["max_clk"] = self.helpers.unit_format(self.logger,
|
||||
vclk_clock_info_dict["max_clk"],
|
||||
clock_unit)
|
||||
if dclk_clock_info_dict["min_clk"] != "N/A" and clock_index == 1:
|
||||
clocks[dclk_index]["min_clk"] = self.helpers.unit_format(self.logger,
|
||||
dclk_clock_info_dict["min_clk"],
|
||||
clock_unit)
|
||||
if dclk_clock_info_dict["max_clk"] != "N/A" and clock_index == 1:
|
||||
clocks[dclk_index]["max_clk"] = self.helpers.unit_format(self.logger,
|
||||
dclk_clock_info_dict["max_clk"],
|
||||
clock_unit)
|
||||
# DCLK min and max clocks
|
||||
try:
|
||||
for index in range(kMAX_NUM_DCLKS):
|
||||
dclk_index = f"dclk_{index}"
|
||||
if clocks[dclk_index]["clk"] != "N/A":
|
||||
# if the current clock is N/A then we shouldn't populate the max and min values
|
||||
dclk_type = amdsmi_interface.AmdSmiClkType.__dict__[f'DCLK{index}']
|
||||
dclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, dclk_type)
|
||||
clocks[dclk_index]["min_clk"] = self.helpers.unit_format(self.logger,
|
||||
dclk_clock_info_dict["min_clk"],
|
||||
clock_unit)
|
||||
clocks[dclk_index]["max_clk"] = self.helpers.unit_format(self.logger,
|
||||
dclk_clock_info_dict["max_clk"],
|
||||
clock_unit)
|
||||
except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e:
|
||||
logging.debug("Failed to get dclk clock info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
# FCLK min and max clocks
|
||||
try:
|
||||
@@ -2147,13 +2101,14 @@ class AMDSMICommands():
|
||||
if clocks[clock]["clk"] != "N/A" and clocks[clock]["min_clk"] != "N/A":
|
||||
# Default to Disabled
|
||||
clocks[clock]["deep_sleep"] = "DISABLED"
|
||||
if self.logger.is_json_format():
|
||||
if clocks[clock]["clk"]["value"] != "N/A" and clocks[clock]["min_clk"]["value"] != "N/A":
|
||||
if clocks[clock]["clk"]["value"] < clocks[clock]["min_clk"]["value"]:
|
||||
clocks[clock]["deep_sleep"] = "ENABLED"
|
||||
else:
|
||||
if clocks[clock]["clk"] < clocks[clock]["min_clk"]:
|
||||
# Try unformat the values back
|
||||
try:
|
||||
clk = int(self.helpers.unit_unformat(clocks[clock]["clk"]))
|
||||
min_clk = int(self.helpers.unit_unformat(clocks[clock]["clk"]))
|
||||
if clk < min_clk:
|
||||
clocks[clock]["deep_sleep"] = "ENABLED"
|
||||
except (ValueError, TypeError) as e:
|
||||
logging.debug("Failed to unformat clock values for gpu %s | %s", gpu_id, e)
|
||||
else:
|
||||
clocks[clock]["deep_sleep"] = "N/A"
|
||||
|
||||
|
||||
@@ -957,12 +957,31 @@ class AMDSMIHelpers():
|
||||
return:
|
||||
str or dict : formatted output
|
||||
"""
|
||||
if value == "N/A":
|
||||
return "N/A"
|
||||
if logger.is_json_format():
|
||||
return {"value": value, "unit": unit}
|
||||
if logger.is_human_readable_format():
|
||||
return f"{value} {unit}".rstrip()
|
||||
return f"{value}"
|
||||
|
||||
def unit_unformat(self, logger, formatted_value):
    """
    Reverse unit_format: strip the unit from a formatted value based on the
    logger output format.

    params:
        logger (AMDSMILogger) - Logger whose output format selects how the
            value is unpacked
        formatted_value - the value to be unformatted; a dict with a
            "value" key in JSON format, a "<value> <unit>" string in
            human-readable format, or a plain value otherwise
    return:
        JSON format: the dict's "value" entry, or the input unchanged when
            it is not a dict (e.g. "N/A")
        human-readable format: the first whitespace-separated token of the
            string; empty/whitespace-only strings and non-string inputs are
            returned unchanged
        any other format: the input unchanged
    """
    if logger.is_json_format():
        if isinstance(formatted_value, dict):
            return formatted_value['value']
        return formatted_value

    if logger.is_human_readable_format():
        # Only strings carry a trailing unit; anything else has nothing to strip.
        if not isinstance(formatted_value, str):
            return formatted_value
        tokens = formatted_value.split()
        # An empty or whitespace-only string has no first token; indexing
        # would raise IndexError, which callers do not expect.
        if not tokens:
            return formatted_value
        return tokens[0]

    return formatted_value
|
||||
|
||||
|
||||
class SI_Unit(float, Enum):
|
||||
GIGA = 1000000000 # 10^9
|
||||
|
||||
@@ -1675,8 +1675,7 @@ def amdsmi_get_hsmp_metrics_table(
|
||||
"mtbl_socket_thm_residency_acc": mtbl.socket_thm_residency_acc,
|
||||
"mtbl_vr_thm_residency_acc": mtbl.vr_thm_residency_acc,
|
||||
"mtbl_hbm_thm_residency_acc": mtbl.hbm_thm_residency_acc,
|
||||
"mtbl_gfx_clk_below_host_residency_acc": mtbl.gfx_clk_below_host_residency_acc,
|
||||
"mtbl_low_utilization_residency_acc": mtbl.low_utilization_residency_acc
|
||||
"mtbl_gfxclk_frequency": f"{[round(x*fraction_uq10 ,3) for x in list(mtbl.gfxclk_frequency)]} MHz"
|
||||
}
|
||||
|
||||
def amdsmi_first_online_core_on_cpu_socket(
|
||||
|
||||
@@ -1178,7 +1178,7 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
|
||||
violation_status->active_gfx_clk_below_host_limit = 0;
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__ << " | "
|
||||
<< "ENTERED gfx_clk_below_host_residency_acc | per_gfx_clk_below_host_limit: " << std::dec
|
||||
<< "ENTERED gfx_below_host_limit_acc | per_gfx_clk_below_host_limit: " << std::dec
|
||||
<< violation_status->per_gfx_clk_below_host_limit
|
||||
<< "%; active_ppt_pwr = " << std::dec
|
||||
<< violation_status->active_gfx_clk_below_host_limit << "\n";
|
||||
|
||||
@@ -313,13 +313,14 @@ amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_
|
||||
return AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
unsigned int max, min, dpm, sleep_freq;
|
||||
unsigned int max, min, dpm, sleep_freq, current_freq;
|
||||
char str[10];
|
||||
char single_char;
|
||||
max = 0;
|
||||
min = UINT_MAX;
|
||||
dpm = 0;
|
||||
sleep_freq = UINT_MAX;
|
||||
current_freq = 0;
|
||||
|
||||
for (std::string line; getline(ranges, line);) {
|
||||
unsigned int dpm_level, freq;
|
||||
@@ -331,21 +332,38 @@ amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_
|
||||
return AMDSMI_STATUS_NO_DATA;
|
||||
}
|
||||
} else {
|
||||
// skip this line if it contains a * which indicates the current level
|
||||
const char *current_indicator = strstr(line.c_str(), "*");
|
||||
if (current_indicator != nullptr){
|
||||
continue;
|
||||
}
|
||||
/**
|
||||
* if the first line contains '*', then
|
||||
* we are saving that value as current_freq then checking
|
||||
* for other dpm levels if none are found then we
|
||||
* set min and max to current_freq as per Driver
|
||||
* We then skip to the next line to avoid getting
|
||||
* incorrect min value.
|
||||
*/
|
||||
|
||||
if (sscanf(line.c_str(), "%u: %d%c", &dpm_level, &freq, str) <= 2){
|
||||
ranges.close();
|
||||
return AMDSMI_STATUS_IO;
|
||||
}
|
||||
|
||||
char lastChar = line.back();
|
||||
if (lastChar == '*'){
|
||||
current_freq = freq;
|
||||
continue;
|
||||
}
|
||||
|
||||
// not * was detected so check for the min max
|
||||
max = freq > max ? freq : max;
|
||||
min = freq < min ? freq: min;
|
||||
min = freq < min ? freq : min;
|
||||
dpm = dpm_level > dpm ? dpm_level : dpm;
|
||||
}
|
||||
|
||||
}
|
||||
if (dpm == 0 && current_freq > 0) {
|
||||
// if the dpm level is 0, then the current frequency is the min/max frequency
|
||||
max = current_freq;
|
||||
min = current_freq;
|
||||
}
|
||||
|
||||
if (num_dpm)
|
||||
*num_dpm = dpm;
|
||||
if (max_freq)
|
||||
|
||||
Yeni konuda referans
Bir kullanıcı engelle