From 2e7f82aa13aa0584fee7a129ea2130e38ba07fd6 Mon Sep 17 00:00:00 2001 From: xuchen-amd Date: Fri, 21 Mar 2025 02:02:58 -0400 Subject: [PATCH] Improve chip info logic. (#581) * Clean up unused functions. * Fix number of XCDs for MI300X CPX (core partition). * Add support for memory partition mode. * Modify total_xcd to adapt to all gpu models. * Run black and isort. * Make gpu_arch regex more generic. * Add error checking for compute partition mode num xcds. * Set gpu_chip_id as optional. * Fix get_gpu_model. --------- Signed-off-by: xuchen-amd --- src/rocprof_compute_base.py | 23 +- src/rocprof_compute_soc/soc_base.py | 33 ++- src/utils/mi_gpu_spec.py | 338 ++++++++++++++++++++++++++++ src/utils/mi_gpu_spec.yaml | 164 ++++++++++++++ src/utils/specs.py | 59 +++-- src/utils/utils.py | 51 +++-- 6 files changed, 594 insertions(+), 74 deletions(-) create mode 100644 src/utils/mi_gpu_spec.py create mode 100644 src/utils/mi_gpu_spec.yaml diff --git a/src/rocprof_compute_base.py b/src/rocprof_compute_base.py index cd97676fde..e81615db1d 100644 --- a/src/rocprof_compute_base.py +++ b/src/rocprof_compute_base.py @@ -24,7 +24,6 @@ import argparse import importlib -import logging import os import shutil import socket @@ -43,6 +42,10 @@ from utils.logger import ( setup_file_handler, setup_logging_priority, ) +from utils.mi_gpu_spec import ( + get_gpu_series_dict, + parse_mi_gpu_spec, +) from utils.specs import MachineSpecs, generate_machine_specs from utils.utils import ( console_debug, @@ -57,21 +60,6 @@ from utils.utils import ( set_locale_encoding, ) -SUPPORTED_ARCHS = { - "gfx906": {"mi50": ["MI50", "MI60"]}, - "gfx908": {"mi100": ["MI100"]}, - "gfx90a": {"mi200": ["MI210", "MI250", "MI250X"]}, - "gfx940": {"mi300": ["MI300A_A0"]}, - "gfx941": {"mi300": ["MI300X_A0"]}, - "gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]}, -} - -MI300_CHIP_IDS = { - "29856": "MI300A_A1", - "29857": "MI300X_A1", - "29858": "MI308X", -} - class RocProfCompute: def __init__(self): @@ -87,7 +75,8 @@ class RocProfCompute: "ver_pretty": None, } self.__options = {} - self.__supported_archs = SUPPORTED_ARCHS + parse_mi_gpu_spec() + self.__supported_archs = get_gpu_series_dict() self.__mspec: MachineSpecs = None # to be initalized in load_soc_specs() setup_console_handler() self.set_version() diff --git a/src/rocprof_compute_soc/soc_base.py b/src/rocprof_compute_soc/soc_base.py index 817f1c94b6..fea2901111 100644 --- a/src/rocprof_compute_soc/soc_base.py +++ b/src/rocprof_compute_soc/soc_base.py @@ -27,6 +27,7 @@ import math import os import re import shutil +import sys import threading from abc import ABC, abstractmethod from collections import OrderedDict @@ -36,7 +37,7 @@ import numpy as np import pandas as pd import yaml -from rocprof_compute_base import MI300_CHIP_IDS, SUPPORTED_ARCHS +from utils.mi_gpu_spec import get_gpu_model, get_gpu_series from utils.parser import build_in_vars, supported_denom from utils.utils import ( console_debug, @@ -45,6 +46,7 @@ from utils.utils import ( convert_metric_id_to_panel_idx, demarcate, get_default_accumulate_counter_file_ymal, + total_xcds, using_v3, ) @@ -53,6 +55,7 @@ class OmniSoC_Base: def __init__( self, args, mspec ): # new info field will contain rocminfo or sysinfo to populate properties + console_debug("[omnisoc init]") self.__args = args self.__arch = None self._mspec = mspec @@ -102,7 +105,8 @@ class OmniSoC_Base: @demarcate def populate_mspec(self): - from utils.specs import run, search, total_sqc, total_xcds + console_debug("[populate_mspec]") + from utils.specs import run, search, total_sqc if not hasattr(self._mspec, "_rocminfo") or self._mspec._rocminfo is None: return @@ -151,11 +155,6 @@ class OmniSoC_Base: self._mspec.workgroup_max_size = key continue - key = search(r"^\s*Chip ID:\s+ ([a-zA-Z0-9]+)\s*", linetext) - if key != None: - self._mspec.chip_id = key - continue - key = search(r"^\s*Max Waves Per CU:\s+ ([a-zA-Z0-9]+)\s*", linetext) if key != None: self._mspec.max_waves_per_cu = key @@ -182,18 +181,11 @@ class OmniSoC_Base: self._mspec.cur_sclk = self._mspec.max_sclk self._mspec.cur_mclk = self._mspec.max_mclk - self._mspec.gpu_series = list(SUPPORTED_ARCHS[self._mspec.gpu_arch].keys())[ - 0 - ].upper() - # specify gpu name for gfx942 hardware - self._mspec.gpu_model = list(SUPPORTED_ARCHS[self._mspec.gpu_arch].keys())[ - 0 - ].upper() - if self._mspec.gpu_model == "MI300": - # Use Chip ID to distinguish MI300 gpu model using the built-in dictionary - if self._mspec.chip_id in MI300_CHIP_IDS: - self._mspec.gpu_model = MI300_CHIP_IDS[self._mspec.chip_id] - + self._mspec.gpu_series = get_gpu_series(self._mspec.gpu_arch).upper() + # specify gpu model name for gfx942 hardware + self._mspec.gpu_model = get_gpu_model( + self._mspec.gpu_arch, self._mspec.gpu_chip_id + ).upper() self._mspec.num_xcd = str( total_xcds(self._mspec.gpu_model, self._mspec.compute_partition) ) @@ -593,7 +585,8 @@ def perfmon_coalesce( # TODO: more error checking if len(spatial_multiplexing) != 3: console_error( - "profiling", "multiplexing need provide node_idx node_count and gpu_count" + "profiling", + "multiplexing need provide node_idx node_count and gpu_count", ) node_idx = int(spatial_multiplexing[0]) diff --git a/src/utils/mi_gpu_spec.py b/src/utils/mi_gpu_spec.py new file mode 100644 index 0000000000..2682a585c8 --- /dev/null +++ b/src/utils/mi_gpu_spec.py @@ -0,0 +1,338 @@ +import logging +import os +import sys +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Union + +import yaml + +# Constants for MI series +# NOTE: Currently supports MI50, MI100, MI200, MI300 +MI50 = 0 +MI100 = 1 +MI200 = 2 +MI300 = 3 + +MI_CONSTANS = {MI50: "mi50", MI100: "mi100", MI200: "mi200", MI300: "mi300"} + +gpu_series_dict = {} # key: gpu arch +gpu_model_dict = {} # key: gpu_arch +mi300_archs_dict = {} # key: gpu model +mi300_num_xcds_dict = {} # key: gpu model +mi300_nps_dict = {} # key: gpu model (NOTE: key can also be architecture) +mi300_chip_id_dict = {} # key: chip id (int) + + +# ---------------------------- +# Data Class handling to preserve the hierarchical gpu information +# ---------------------------- + + +@dataclass +class ComputePartitionMode: + """ + Represents the compute partition mode. + """ + + def __init__(self, num_xcds=None): + self.__num_xcds = num_xcds + + def get_num_xcds(self): + return self.__num_xcds + + +class Singleton(object): + _instances = {} + + def __new__(class_, *args, **kwargs): + if class_ not in class_._instances: + class_._instances[class_] = super(Singleton, class_).__new__( + class_, *args, **kwargs + ) + return class_._instances[class_] + + +@dataclass +class MIGPU(Singleton): + """ + Singleton class representing the detected MI GPU of current system. + Ensures only one instance exists. + """ + + _instance = None # Class variable to hold the single instance + + def __new__(cls, *args, **kwargs): + if cls._instance is None: + cls._instance = super(MIGPU, cls).__new__(cls) + cls._instance.mi_gpu_spec = [] # Initialize the instance attribute + return cls._instance + + def __init__( + self, + gpu_series, + gpu_arch, + gpu_model, + chip_id=None, + mi300_arch=None, + num_xcds=None, + ): + """ + gpu series, gpu_arch and gpu_model information must be available for a given MI GPU. + gpu series (str) + gpu_arch (str) + gpu_model (str) + """ + # gpu_series (str): The GPU series name (e.g., 'mi50', 'mi100', 'mi200', 'mi300') + self.gpu_series = gpu_series + self.gpu_arch = gpu_arch + self.gpu_model = gpu_model + self.chip_id = chip_id + self.mi300_arch = mi300_arch + self.compute_partition = ComputePartitionMode(num_xcds) + + self.is_mi300 = True if self.mi300_arch is not None else False + + def __post_init__(self): + if self.is_mi300: + # NOTE: currently, all mi300 series gpus shall have compute partition information + if self.compute_partition is None: + logging.warning( + "[MIGPU post init] mi300 gpu detected, but no num_xcd/compute partition data detected!!!" + ) + + def set_chip_id(self, chip_id): + self.chip_id = chip_id + + def set_mi300_arch(self, mi300_arch, num_xcds): + """ + All mi300 series gpus shall have compute partition information. + """ + if num_xcds is None: + logging.warning( + "[MIGPU post init] mi300 gpu detected, but no num_xcd/compute partition data detected!!!" + ) + + self.mi300_arch = mi300_arch + self.compute_partition = ComputePartitionMode(num_xcds) + + def get_gpu_series(self): + return self.gpu_series + + def get_gpu_arch(self): + return self.gpu_arch + + def get_gpu_model(self): + return self.gpu_model + + def get_chip_id(self): + return self.chip_id + + def get_mi300_arch(self): + return self.mi300_arch + + def get_compute_partition(self): + return self.compute_partition + + +# ---------------------------- +# YAML Parsing and Data Handling +# ---------------------------- + + +def load_yaml(file_path: str) -> Dict[str, Any]: + """ + Loads MI GPU YAML data /util into a Python dictionary. + + Args: + file_path (str): The path to the YAML file. + + Returns: + Dict[str, Any]: Parsed YAML data as a nested dictionary. + Exit with console error if an error occurs. + """ + logging.debug("[load_yaml]") + try: + with open(file_path, "r") as file: + data = yaml.safe_load(file) + return data + except FileNotFoundError: + logging.error(f"Error: The file '{file_path}' was not found.") + except yaml.YAMLError as exc: + logging.error(f"Error parsing YAML file '{file_path}': {exc}") + except Exception as e: + logging.error( + f"An unexpected error occurred while loading YAML file '{file_path}': {e}" + ) + + +def parse_mi_gpu_spec(): + """ + Parse out mi gpu data from yaml file and store in memory. + MI GPUs + |-- series + |-- architecture (list) + |-- models + |-- chip_ids + |-- mi300_arch + |-- partition_mode + """ + + current_dir = os.path.dirname(__file__) + yaml_file_path = os.path.join(current_dir, "mi_gpu_spec.yaml") + + # Load the YAML data + yaml_data = load_yaml(yaml_file_path) + mi300_models_dict = {} + + for mi_index, mi_series in MI_CONSTANS.items(): + if mi_series != MI_CONSTANS[MI300]: + logging.debug("[parse_mi_gpu_spec] Processing series: %s" % mi_series) + for key, value in yaml_data.items(): + # parse out gpu series and gpu model information for mi50, 100, 200 + curr_gpu_arch = value[mi_index]["gpu_archs"][0]["gpu_arch"] + gpu_series_dict[curr_gpu_arch] = mi_series + gpu_model_dict[curr_gpu_arch] = [] + for models in value[mi_index]["gpu_archs"][0]["models"]: + gpu_model_dict[curr_gpu_arch].append(models["gpu_model"]) + elif mi_series == MI_CONSTANS[MI300]: + # MI300 requires specific processing + for key, value in yaml_data.items(): + mi300_gpu_archs_list = [] + # NOTE: only MI300 have multiple architectures + for archs in value[MI300]["gpu_archs"]: + curr_gpu_arch = archs["gpu_arch"] + mi300_gpu_archs_list.append(curr_gpu_arch) + gpu_series_dict[curr_gpu_arch] = mi_series + + for idx, arch in enumerate(mi300_gpu_archs_list): + mi300_models_dict[arch] = [] + for models in value[MI300]["gpu_archs"][idx]["models"]: + gpu_model = models["gpu_model"] + + # NOTE: mi300 architecture is available for all mi300 gpu models + mi300_archs_dict[gpu_model] = models["mi300_arch"]["architecture"] + + # NOTE: compute partition mode num xcds is available for all mi300 gpu models + mi300_num_xcds_dict[gpu_model] = models["mi300_arch"][ + "partition_mode" + ]["compute_partition_mode"]["num_xcds"] + + # NOTE: memory partition mode nps is available for all mi300 gpu models + mi300_nps_dict[gpu_model] = models["mi300_arch"][ + "partition_mode" + ]["memory_partition_mode"] + + if not models["chip_ids"]["local"] is None: + # save chip_id, gpu_model pair if chip id is available + # NOTE: chip id is available for all gfx942 machines + mi300_chip_id_dict[models["chip_ids"]["local"]] = models[ + "gpu_model" + ] + mi300_models_dict[arch].append(gpu_model) + + gpu_model_dict.update(mi300_models_dict) + + +def get_gpu_series_dict(): + if not gpu_series_dict: + logging.error( + "gpu_series_dict not yet populated, did you run parse_mi_gpu_spec()?" + ) + return None + return gpu_series_dict + + +def get_gpu_series(gpu_arch_): + if not gpu_series_dict: + logging.error( + "gpu_series_dict not yet populated, did you run parse_mi_gpu_spec()?" + ) + return None + + # Normalize the key by checking both the raw and lowercase versions + gpu_series = gpu_series_dict.get(gpu_arch_) or gpu_series_dict.get(gpu_arch_.lower()) + if gpu_series: + return gpu_series + + logging.warning(f"No matching gpu series found for gpu arch: {gpu_arch_}") + return None + + +def get_gpu_model(gpu_arch_, chip_id_): + # Check that gpu_model_dict is populated first + if not gpu_model_dict: + logging.error( + "gpu_model_dict not yet populated. Did you run parse_mi_gpu_spec()?" + ) + return None + + gpu_arch_lower = gpu_arch_.lower() + + # Handle gfx942 with chip_id mapping + if gpu_arch_lower == "gfx942": + if chip_id_ and int(chip_id_) in mi300_chip_id_dict: + gpu_model = mi300_chip_id_dict.get(int(chip_id_)) + else: + logging.warning(f"No gpu model found for chip id: {chip_id_}") + return None + + # Otherwise use gpu_model_dict mapping for other mi architectures + elif gpu_arch_lower in gpu_model_dict: + # NOTE: take the first element works for now + gpu_model = gpu_model_dict[gpu_arch_lower][0] + else: + logging.warning(f"No gpu model found for gpu arch: {gpu_arch_lower}") + return None + + if not gpu_model: + logging.warning(f"No gpu model found for gpu arch: {gpu_arch_lower}") + return None + + return gpu_model + + +def get_mi300_archs_dict(): + if not mi300_archs_dict: + logging.error( + "mi300_archs_dict not yet populated, did you run parse_mi_gpu_spec()?" + ) + return None + return mi300_archs_dict + + +def get_mi300_num_xcds(gpu_model_, compute_partition_): + if not mi300_num_xcds_dict: + logging.error( + "mi300_num_xcds_dict not yet populated, did you run parse_mi_gpu_spec()?" + ) + return None + + gpu_model_lower = gpu_model_.lower() + partition_lower = compute_partition_.lower() + + if gpu_model_lower not in mi300_num_xcds_dict: + logging.info(f"Current system is not a mi300 system: {gpu_model_}") + return None + + model_dict = mi300_num_xcds_dict[gpu_model_lower] + if partition_lower not in model_dict: + logging.info(f"Unknown compute partition: {compute_partition_}") + return None + + num_xcds = model_dict[partition_lower] + if not num_xcds: + logging.warning( + "Unknown compute partition found for %s / %s", compute_partition_, gpu_model_ + ) + return None + + return num_xcds + + +def get_mi300_chip_id_dict(): + if mi300_chip_id_dict: + return mi300_chip_id_dict + else: + logging.error( + "mi300_chip_id_dict not yet populated, did you run parse_mi_gpu_spec()?" + ) diff --git a/src/utils/mi_gpu_spec.yaml b/src/utils/mi_gpu_spec.yaml new file mode 100644 index 0000000000..c481ffd963 --- /dev/null +++ b/src/utils/mi_gpu_spec.yaml @@ -0,0 +1,164 @@ +# -------------------------------------------------------------------------------- +# +# This yaml file tracks MI gpu spec in a tree structure. +# +# *It is important to note that this file only tracks the common specs for MI GPU series.* +# *For example, the CU #s are based on information retrieved from other tools.* +# ** +# +# MI GPUs +# |-- series: the specific MI series; mi50, mi100, mi200, mi300 +# |-- architecture: currently, only mi300 gpus hold different architectures +# |-- models +# |-- chip_ids: chip id is specific to the environment the gpu is being used on +# |-- mi300_arch: mi300 specific architectures; mi300a, mi300x +# |-- partition_mode: currently, only mi300 gpus hold partition mode information +# two types: compute partition mode, memory partition mode, +# currently only mi300 gpus contains compute partition mode information on number of xcds +# +# -------------------------------------------------------------------------------- + +mi_gpu_spec: + - gpu_series: mi50 + gpu_archs: + - gpu_arch: gfx906 + models: + - gpu_model: mi50 + mi300_arch: + architecture: null + partition_mode: null + chip_ids: + local: null + - gpu_model: mi60 + mi300_arch: + architecture: null + partition_mode: null + chip_ids: + local: null + + - gpu_series: mi100 + gpu_archs: + - gpu_arch: gfx908 + models: + - gpu_model: mi100 + mi300_arch: + architecture: null + partition_mode: null + chip_ids: + local: null + + - gpu_series: mi200 + gpu_archs: + - gpu_arch: gfx90a + models: + - gpu_model: mi210 + mi300_arch: + architecture: null + partition_mode: null + chip_ids: + local: null + - gpu_model: mi250 + mi300_arch: + architecture: null + partition_mode: null + chip_ids: + local: null + - gpu_model: mi250x + mi300_arch: + architecture: null + partition_mode: null + chip_ids: + local: null + + - gpu_series: mi300 + gpu_archs: + - gpu_arch: gfx940 + models: + - gpu_model: mi300a_a0 + mi300_arch: + architecture: mi300a + partition_mode: + compute_partition_mode: + num_xcds: + spx: 6 + dpx: null + tpx: 2 + qpx: null + cpx: null + memory_partition_mode: + nps4: {tpx} + nps1: {spx, tpx} + chip_ids: + local: null + + - gpu_arch: gfx941 + models: + - gpu_model: mi300x_a0 + mi300_arch: + architecture: mi300x + partition_mode: + compute_partition_mode: + num_xcds: + spx: 8 + dpx: 4 + tpx: null + qpx: 2 + cpx: 1 + memory_partition_mode: + nps4: {qpx, cpx} + nps1: {spx, qpx, cpx} + chip_ids: + local: null + + - gpu_arch: gfx942 + models: + - gpu_model: mi300a_a1 + mi300_arch: + architecture: mi300a + partition_mode: + compute_partition_mode: + num_xcds: + spx: 6 + dpx: null + tpx: 2 + qpx: null + cpx: null + memory_partition_mode: + nps4: {tpx} + nps1: {spx, tpx} + chip_ids: + local: 29856 + + - gpu_model: mi300x_a1 + mi300_arch: + architecture: mi300x + partition_mode: + compute_partition_mode: + num_xcds: + spx: 8 + dpx: 4 + tpx: null + qpx: 2 + cpx: 1 + memory_partition_mode: + nps4: {qpx, cpx} + nps1: {spx, qpx, cpx} + chip_ids: + local: 29857 + + - gpu_model: mi308x + mi300_arch: + architecture: mi308x + partition_mode: + compute_partition_mode: + num_xcds: + spx: 4 + dpx: 2 + tpx: null + qpx: null + cpx: 1 + memory_partition_mode: + nps4: {cpx} + nps1: {spx, dpx, cpx} + chip_ids: + local: 29858 diff --git a/src/utils/specs.py b/src/utils/specs.py index 1a457e0aa0..f29e724f5a 100644 --- a/src/utils/specs.py +++ b/src/utils/specs.py @@ -29,6 +29,7 @@ import os import re import socket import subprocess +import sys from dataclasses import dataclass, field, fields from datetime import datetime from math import ceil @@ -37,6 +38,7 @@ from pathlib import Path as path import pandas as pd import config +from utils.mi_gpu_spec import get_gpu_series_dict, get_mi300_chip_id_dict from utils.tty import get_table_string from utils.utils import ( console_debug, @@ -60,21 +62,40 @@ VERSION_LOC = [ def detect_arch(_rocminfo): - from rocprof_compute_base import SUPPORTED_ARCHS - for idx1, linetext in enumerate(_rocminfo): - gpu_arch = search(r"^\s*Name\s*:\s+ ([a-zA-Z0-9]+)\s*$", linetext) - if gpu_arch in SUPPORTED_ARCHS.keys(): + # NOTE: currently supported socs are gfx archs only + gpu_arch = search(r"^\s*Name\s*:\s* ([Gg][Ff][Xx][a-zA-Z0-9]+).*\s*$", linetext) + if gpu_arch in get_gpu_series_dict().keys(): break - if str(gpu_arch) in SUPPORTED_ARCHS.keys(): + if str(gpu_arch) in get_gpu_series_dict().keys(): gpu_arch = str(gpu_arch) break - if not gpu_arch in SUPPORTED_ARCHS.keys(): - console_error("Cannot find a supported arch in rocminfo") + if not gpu_arch in get_gpu_series_dict().keys(): + console_error("Cannot find a supported arch in rocminfo: " + str(gpu_arch)) else: return (gpu_arch, idx1) +def detect_gpu_chip_id(_rocminfo): + for idx1, linetext in enumerate(_rocminfo): + # NOTE: current supported socs only have numbers in Chip ID + gpu_chip_id = search(r"^\s*Chip ID\s*:\s* ([0-9]+).*\s*$", linetext) + if gpu_chip_id and int(gpu_chip_id) in get_mi300_chip_id_dict().keys(): + gpu_chip_id = str(gpu_chip_id) + break + if str(gpu_chip_id) in get_mi300_chip_id_dict().keys(): + gpu_chip_id = str(gpu_chip_id) + break + if not gpu_chip_id: + console_warning("No Chip ID detected: " + str(gpu_chip_id)) + elif ( + gpu_chip_id not in get_mi300_chip_id_dict().keys() + and int(gpu_chip_id) not in get_mi300_chip_id_dict().keys() + ): + console_warning("Unknown Chip ID detected: " + str(gpu_chip_id)) + return gpu_chip_id + + # Custom decorator to mimic the behavior of kw_only found in Python 3.10 def kw_only(cls): def __init__(self, *args, **kwargs): @@ -163,6 +184,7 @@ def generate_machine_specs(args, sysinfo: dict = None): _rocminfo = rocminfo_full.split("\n") gpu_arch, idx = detect_arch(_rocminfo) _rocminfo = _rocminfo[idx + 1 :] # update rocminfo for target section + gpu_chip_id = detect_gpu_chip_id(_rocminfo) specs = MachineSpecs( version=specs_version, timestamp=timestamp, @@ -180,7 +202,9 @@ def generate_machine_specs(args, sysinfo: dict = None): compute_partition=compute_partition, memory_partition=memory_partition, gpu_arch=gpu_arch, + gpu_chip_id=gpu_chip_id, ) + # Load above SoC specs via module import try: soc_module = importlib.import_module("rocprof_compute_soc.soc_" + specs.gpu_arch) @@ -367,6 +391,14 @@ class MachineSpecs: "name": "GPU Arch", }, ) + gpu_chip_id: str = field( + default=None, + metadata={ + "doc": "The Chip ID of the accelerators/GPUs in the system.", + "name": "Chip ID", + "optional": True, + }, + ) gpu_l1: str = field( default=None, metadata={ @@ -420,13 +452,6 @@ class MachineSpecs: "name": "Workgroup Max Size", }, ) - chip_id: str = field( - default=None, - metadata={ - "doc": "The Chip ID of the accelerators/GPUs in the system.", - "name": "Chip ID", - }, - ) max_waves_per_cu: str = field( default=None, metadata={ @@ -661,11 +686,9 @@ def total_sqc(archname, numCUs, numSEs): def total_l2_banks(archname, L2Banks, compute_partition): - # Fixme: support all supported partitioning mode - # Fixme: "name" is a bad name! - totalL2Banks = L2Banks xcds = total_xcds(archname, compute_partition) - return L2Banks * xcds + totalL2Banks = L2Banks * xcds + return totalL2Banks if __name__ == "__main__": diff --git a/src/utils/utils.py b/src/utils/utils.py index 2cbe942cf9..c50c532a82 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -42,6 +42,7 @@ from pathlib import Path as path import pandas as pd import config +from utils.mi_gpu_spec import get_mi300_num_xcds rocprof_cmd = "" rocprof_args = "" @@ -686,7 +687,7 @@ def run_prof( if new_env and not using_v3(): # flatten tcc for applicable mi300 input f = path(workload_dir + "/out/pmc_1/results_" + fbase + ".csv") - xcds = total_xcds(mspec.gpu_model, mspec.compute_partition) + xcds = get_mi300_num_xcds(mspec.gpu_model, mspec.compute_partition) df = flatten_tcc_info_across_xcds(f, xcds, int(mspec._l2_banks)) df.to_csv(f, index=False) @@ -835,6 +836,7 @@ def replace_timestamps(workload_dir): def gen_sysinfo( workload_name, workload_dir, ip_blocks, app_cmd, skip_roof, roof_only, mspec, soc ): + console_debug("[gen_sysinfo]") df = mspec.get_class_members() # Append workload information to machine specs @@ -1051,47 +1053,58 @@ def flatten_tcc_info_across_xcds(file, xcds, tcc_channel_per_xcd): return df -def total_xcds(archname, compute_partition): +def total_xcds(gpu_model, compute_partition): + """ + Returns the number of xcds for a gpu model and compute_partition pair. + """ + + # For mi300 chips, return result from mi_gpu_spec + result = get_mi300_num_xcds(gpu_model, compute_partition) + if result: + return result + + # For other systems, use manual check # check MI300 has a valid compute partition - mi300a_archs = ["mi300a_a0", "mi300a_a1"] - mi300x_archs = ["mi300x_a0", "mi300x_a1"] - mi308x_archs = ["mi308x"] + mi300a_model = ["mi300a_a0", "mi300a_a1"] + mi300x_model = ["mi300x_a0", "mi300x_a1"] + mi308x_model = ["mi308x"] if ( - archname.lower() in mi300a_archs + mi300x_archs + mi308x_archs + gpu_model.lower() in mi300a_model + mi300x_model + mi308x_model and compute_partition == "NA" ): - console_error("Invalid compute partition found for {}".format(archname)) - if archname.lower() not in mi300a_archs + mi300x_archs + mi308x_archs: + console_error("Invalid compute partition found for {}".format(gpu_model)) + + if gpu_model.lower() not in mi300a_model + mi300x_model + mi308x_model: return 1 # from the whitepaper # https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf if compute_partition.lower() == "spx": - if archname.lower() in mi300a_archs: + if gpu_model.lower() in mi300a_model: return 6 - if archname.lower() in mi300x_archs: + if gpu_model.lower() in mi300x_model: return 8 - if archname.lower() in mi308x_archs: + if gpu_model.lower() in mi308x_model: return 4 if compute_partition.lower() == "tpx": - if archname.lower() in mi300a_archs: + if gpu_model.lower() in mi300a_model: return 2 if compute_partition.lower() == "dpx": - if archname.lower() in mi300x_archs: + if gpu_model.lower() in mi300x_model: return 4 - if archname.lower() in mi308x_archs: + if gpu_model.lower() in mi308x_model: return 2 if compute_partition.lower() == "qpx": - if archname.lower() in mi300x_archs: + if gpu_model.lower() in mi300x_model: return 2 if compute_partition.lower() == "cpx": - if archname.lower() in mi300x_archs: - return 2 - if archname.lower() in mi308x_archs: + if gpu_model.lower() in mi300x_model: + return 1 + if gpu_model.lower() in mi308x_model: return 1 # TODO implement other archs here as needed console_error( "Unknown compute partition / arch found for {} / {}".format( - compute_partition, archname + compute_partition, gpu_model ) )