From 83edd97f7824ace76ef4df3d9826ed21381e80aa Mon Sep 17 00:00:00 2001 From: ywang103-amd Date: Mon, 17 Mar 2025 16:20:41 -0400 Subject: [PATCH] replace rocm-smi with amd-smi cmd (#612) [ROCm/rocprofiler-compute commit: 0c6cec567102cf861948018890065b697d7e9f64] --- projects/rocprofiler-compute/CHANGELOG.md | 1 + .../src/rocprof_compute_soc/soc_base.py | 10 +++++-- .../rocprofiler-compute/src/utils/specs.py | 26 ++++++++++++------- .../tests/test_profile_general.py | 4 +-- 4 files changed, 28 insertions(+), 13 deletions(-) diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index 71e1af9296..4b874b464f 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -16,6 +16,7 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. ### Changed * Change normal_unit default to per_kernel +* Change dependency from rocm-smi to amd-smi ### Resolved issues diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py index eb7612f0cd..ac91a673fc 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py @@ -164,8 +164,14 @@ class OmniSoC_Base: ) # we get the max mclk from rocm-smi --showmclkrange - rocm_smi_mclk = run(["rocm-smi", "--showmclkrange"], exit_on_error=True) - self._mspec.max_mclk = search(r"(\d+)Mhz\s*$", rocm_smi_mclk) + # Regular expression to extract the max memory clock (third frequency level in MEM) + memory_clock_pattern = ( + r"MEM:\s*[^:]*FREQUENCY_LEVELS:\s*(?:\d+: \d+ MHz\s*){2}(\d+)\s*MHz" + ) + amd_smi_mclk = run(["amd-smi", "static"], exit_on_error=True) + self._mspec.max_mclk = search(memory_clock_pattern, amd_smi_mclk) + + console_debug("max mem clock is {}".format(self._mspec.max_mclk)) # these are just max's now, because the parsing was broken and this was 
inconsistent # with how we use the clocks elsewhere (all max, all the time) diff --git a/projects/rocprofiler-compute/src/utils/specs.py b/projects/rocprofiler-compute/src/utils/specs.py index 6144b377da..1a457e0aa0 100644 --- a/projects/rocprofiler-compute/src/utils/specs.py +++ b/projects/rocprofiler-compute/src/utils/specs.py @@ -39,6 +39,7 @@ import pandas as pd import config from utils.tty import get_table_string from utils.utils import ( + console_debug, console_error, console_log, console_warning, @@ -135,18 +136,25 @@ def generate_machine_specs(args, sysinfo: dict = None): linux_distro = "" rocm_version = get_rocm_ver().strip() # FIXME: use device - vbios = search(r"VBIOS version: (.*?)$", run(["rocm-smi", "-v"], exit_on_error=True)) - compute_partition = search( - r"Compute Partition:\s*(\w+)", run(["rocm-smi", "--showcomputepartition"]) - ) + + vbios_pattern = r"PART_NUMBER:\s*(\S+)" + compute_partition_pattern = r"COMPUTE_PARTITION:\s*(\S+)" + memory_partition_pattern = r"MEMORY_PARTITION:\s*(\S+)" + + vbios = search(vbios_pattern, run(["amd-smi", "static"], exit_on_error=True)) + compute_partition = search(compute_partition_pattern, run(["amd-smi", "static"])) if compute_partition is None: compute_partition = "NA" - memory_partition = search( - r"Memory Partition:\s*(\w+)", run(["rocm-smi", "--showmemorypartition"]) - ) + memory_partition = search(memory_partition_pattern, run(["amd-smi", "static"])) if memory_partition is None: memory_partition = "NA" + console_debug( + "vbios is {}, compute partition is {}, memory partition is {}".format( + vbios, compute_partition, memory_partition + ) + ) + ########################################## ## B. SoC Specs ########################################## @@ -628,9 +636,9 @@ def run(cmd, exit_on_error=False): ) if exit_on_error: - if cmd[0] == "rocm-smi": + if cmd[0] == "amd-smi": if p.returncode != 2 and p.returncode != 0: - console_error("No GPU detected. 
Unable to load rocm-smi") + console_error("No GPU detected. Unable to load amd-smi") elif p.returncode != 0: console_error("Command [%s] failed with non-zero exit code" % cmd) return p.stdout.decode("utf-8") diff --git a/projects/rocprofiler-compute/tests/test_profile_general.py b/projects/rocprofiler-compute/tests/test_profile_general.py index a05a46ae45..eed22c4df9 100644 --- a/projects/rocprofiler-compute/tests/test_profile_general.py +++ b/projects/rocprofiler-compute/tests/test_profile_general.py @@ -263,8 +263,8 @@ def counter_compare(test_name, errors_pd, baseline_df, run_df, threshold=5): def run(cmd): p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if cmd[0] == "rocm-smi" and p.returncode == 8: - print("ERROR: No GPU detected. Unable to load rocm-smi") + if cmd[0] == "amd-smi" and p.returncode == 8: + print("ERROR: No GPU detected. Unable to load amd-smi") assert 0 return p.stdout.decode("ascii")