replace rocm-smi with amd-smi cmd (#612)

[ROCm/rocprofiler-compute commit: 0c6cec5671]
This commit is contained in:
ywang103-amd
2025-03-17 16:20:41 -04:00
committed by GitHub
parent eba173de5e
commit 83edd97f78
4 changed files with 28 additions and 13 deletions
@@ -16,6 +16,7 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
### Changed
* Change normal_unit default to per_kernel
* Change dependency from rocm-smi to amd-smi
### Resolved issues
@@ -164,8 +164,14 @@ class OmniSoC_Base:
)
# we get the max mclk from amd-smi static (previously rocm-smi --showmclkrange)
rocm_smi_mclk = run(["rocm-smi", "--showmclkrange"], exit_on_error=True)
self._mspec.max_mclk = search(r"(\d+)Mhz\s*$", rocm_smi_mclk)
# Regular expression to extract the max memory clock (third frequency level in MEM)
memory_clock_pattern = (
r"MEM:\s*[^:]*FREQUENCY_LEVELS:\s*(?:\d+: \d+ MHz\s*){2}(\d+)\s*MHz"
)
amd_smi_mclk = run(["amd-smi", "static"], exit_on_error=True)
self._mspec.max_mclk = search(memory_clock_pattern, amd_smi_mclk)
console_debug("max mem clock is {}".format(self._mspec.max_mclk))
# these are just max's now, because the parsing was broken and this was inconsistent
# with how we use the clocks elsewhere (all max, all the time)
@@ -39,6 +39,7 @@ import pandas as pd
import config
from utils.tty import get_table_string
from utils.utils import (
console_debug,
console_error,
console_log,
console_warning,
@@ -135,18 +136,25 @@ def generate_machine_specs(args, sysinfo: dict = None):
linux_distro = ""
rocm_version = get_rocm_ver().strip()
# FIXME: use device
vbios = search(r"VBIOS version: (.*?)$", run(["rocm-smi", "-v"], exit_on_error=True))
compute_partition = search(
r"Compute Partition:\s*(\w+)", run(["rocm-smi", "--showcomputepartition"])
)
vbios_pattern = r"PART_NUMBER:\s*(\S+)"
compute_partition_pattern = r"COMPUTE_PARTITION:\s*(\S+)"
memory_partition_pattern = r"MEMORY_PARTITION:\s*(\S+)"
vbios = search(vbios_pattern, run(["amd-smi", "static"], exit_on_error=True))
compute_partition = search(compute_partition_pattern, run(["amd-smi", "static"]))
if compute_partition is None:
compute_partition = "NA"
memory_partition = search(
r"Memory Partition:\s*(\w+)", run(["rocm-smi", "--showmemorypartition"])
)
memory_partition = search(memory_partition_pattern, run(["amd-smi", "static"]))
if memory_partition is None:
memory_partition = "NA"
console_debug(
"vbios is {}, compute partition is {}, memory partition is {}".format(
vbios, compute_partition, memory_partition
)
)
##########################################
## B. SoC Specs
##########################################
@@ -628,9 +636,9 @@ def run(cmd, exit_on_error=False):
)
if exit_on_error:
if cmd[0] == "rocm-smi":
if cmd[0] == "amd-smi":
if p.returncode != 2 and p.returncode != 0:
console_error("No GPU detected. Unable to load rocm-smi")
console_error("No GPU detected. Unable to load amd-smi")
elif p.returncode != 0:
console_error("Command [%s] failed with non-zero exit code" % cmd)
return p.stdout.decode("utf-8")
@@ -263,8 +263,8 @@ def counter_compare(test_name, errors_pd, baseline_df, run_df, threshold=5):
def run(cmd):
p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if cmd[0] == "rocm-smi" and p.returncode == 8:
print("ERROR: No GPU detected. Unable to load rocm-smi")
if cmd[0] == "amd-smi" and p.returncode == 8:
print("ERROR: No GPU detected. Unable to load amd-smi")
assert 0
return p.stdout.decode("ascii")