Improve chip info logic. (#581)
* Clean up unused functions. * Fix number of XCDs for MI300X CPX (core partition). * Add support for memory partition mode. * Modify total_xcd to adapt to all gpu models. * Run black and isort. * Make gpu_arch regex more generic. * Add error checking for compute partition mode num xcds. * Set gpu_chip_id as optional. * Fix get_gpu_model. --------- Signed-off-by: xuchen-amd <xuchen@amd.com>
Этот коммит содержится в:
@@ -24,7 +24,6 @@
|
||||
|
||||
import argparse
|
||||
import importlib
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import socket
|
||||
@@ -43,6 +42,10 @@ from utils.logger import (
|
||||
setup_file_handler,
|
||||
setup_logging_priority,
|
||||
)
|
||||
from utils.mi_gpu_spec import (
|
||||
get_gpu_series_dict,
|
||||
parse_mi_gpu_spec,
|
||||
)
|
||||
from utils.specs import MachineSpecs, generate_machine_specs
|
||||
from utils.utils import (
|
||||
console_debug,
|
||||
@@ -57,21 +60,6 @@ from utils.utils import (
|
||||
set_locale_encoding,
|
||||
)
|
||||
|
||||
SUPPORTED_ARCHS = {
|
||||
"gfx906": {"mi50": ["MI50", "MI60"]},
|
||||
"gfx908": {"mi100": ["MI100"]},
|
||||
"gfx90a": {"mi200": ["MI210", "MI250", "MI250X"]},
|
||||
"gfx940": {"mi300": ["MI300A_A0"]},
|
||||
"gfx941": {"mi300": ["MI300X_A0"]},
|
||||
"gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]},
|
||||
}
|
||||
|
||||
MI300_CHIP_IDS = {
|
||||
"29856": "MI300A_A1",
|
||||
"29857": "MI300X_A1",
|
||||
"29858": "MI308X",
|
||||
}
|
||||
|
||||
|
||||
class RocProfCompute:
|
||||
def __init__(self):
|
||||
@@ -87,7 +75,8 @@ class RocProfCompute:
|
||||
"ver_pretty": None,
|
||||
}
|
||||
self.__options = {}
|
||||
self.__supported_archs = SUPPORTED_ARCHS
|
||||
parse_mi_gpu_spec()
|
||||
self.__supported_archs = get_gpu_series_dict()
|
||||
self.__mspec: MachineSpecs = None # to be initalized in load_soc_specs()
|
||||
setup_console_handler()
|
||||
self.set_version()
|
||||
|
||||
@@ -27,6 +27,7 @@ import math
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
import threading
|
||||
from abc import ABC, abstractmethod
|
||||
from collections import OrderedDict
|
||||
@@ -36,7 +37,7 @@ import numpy as np
|
||||
import pandas as pd
|
||||
import yaml
|
||||
|
||||
from rocprof_compute_base import MI300_CHIP_IDS, SUPPORTED_ARCHS
|
||||
from utils.mi_gpu_spec import get_gpu_model, get_gpu_series
|
||||
from utils.parser import build_in_vars, supported_denom
|
||||
from utils.utils import (
|
||||
console_debug,
|
||||
@@ -45,6 +46,7 @@ from utils.utils import (
|
||||
convert_metric_id_to_panel_idx,
|
||||
demarcate,
|
||||
get_default_accumulate_counter_file_ymal,
|
||||
total_xcds,
|
||||
using_v3,
|
||||
)
|
||||
|
||||
@@ -53,6 +55,7 @@ class OmniSoC_Base:
|
||||
def __init__(
|
||||
self, args, mspec
|
||||
): # new info field will contain rocminfo or sysinfo to populate properties
|
||||
console_debug("[omnisoc init]")
|
||||
self.__args = args
|
||||
self.__arch = None
|
||||
self._mspec = mspec
|
||||
@@ -102,7 +105,8 @@ class OmniSoC_Base:
|
||||
|
||||
@demarcate
|
||||
def populate_mspec(self):
|
||||
from utils.specs import run, search, total_sqc, total_xcds
|
||||
console_debug("[populate_mspec]")
|
||||
from utils.specs import run, search, total_sqc
|
||||
|
||||
if not hasattr(self._mspec, "_rocminfo") or self._mspec._rocminfo is None:
|
||||
return
|
||||
@@ -151,11 +155,6 @@ class OmniSoC_Base:
|
||||
self._mspec.workgroup_max_size = key
|
||||
continue
|
||||
|
||||
key = search(r"^\s*Chip ID:\s+ ([a-zA-Z0-9]+)\s*", linetext)
|
||||
if key != None:
|
||||
self._mspec.chip_id = key
|
||||
continue
|
||||
|
||||
key = search(r"^\s*Max Waves Per CU:\s+ ([a-zA-Z0-9]+)\s*", linetext)
|
||||
if key != None:
|
||||
self._mspec.max_waves_per_cu = key
|
||||
@@ -182,18 +181,11 @@ class OmniSoC_Base:
|
||||
self._mspec.cur_sclk = self._mspec.max_sclk
|
||||
self._mspec.cur_mclk = self._mspec.max_mclk
|
||||
|
||||
self._mspec.gpu_series = list(SUPPORTED_ARCHS[self._mspec.gpu_arch].keys())[
|
||||
0
|
||||
].upper()
|
||||
# specify gpu name for gfx942 hardware
|
||||
self._mspec.gpu_model = list(SUPPORTED_ARCHS[self._mspec.gpu_arch].keys())[
|
||||
0
|
||||
].upper()
|
||||
if self._mspec.gpu_model == "MI300":
|
||||
# Use Chip ID to distinguish MI300 gpu model using the built-in dictionary
|
||||
if self._mspec.chip_id in MI300_CHIP_IDS:
|
||||
self._mspec.gpu_model = MI300_CHIP_IDS[self._mspec.chip_id]
|
||||
|
||||
self._mspec.gpu_series = get_gpu_series(self._mspec.gpu_arch).upper()
|
||||
# specify gpu model name for gfx942 hardware
|
||||
self._mspec.gpu_model = get_gpu_model(
|
||||
self._mspec.gpu_arch, self._mspec.gpu_chip_id
|
||||
).upper()
|
||||
self._mspec.num_xcd = str(
|
||||
total_xcds(self._mspec.gpu_model, self._mspec.compute_partition)
|
||||
)
|
||||
@@ -593,7 +585,8 @@ def perfmon_coalesce(
|
||||
# TODO: more error checking
|
||||
if len(spatial_multiplexing) != 3:
|
||||
console_error(
|
||||
"profiling", "multiplexing need provide node_idx node_count and gpu_count"
|
||||
"profiling",
|
||||
"multiplexing need provide node_idx node_count and gpu_count",
|
||||
)
|
||||
|
||||
node_idx = int(spatial_multiplexing[0])
|
||||
|
||||
@@ -0,0 +1,338 @@
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
import yaml
|
||||
|
||||
# Constants for MI series
# NOTE: Currently supports MI50, MI100, MI200, MI300
# Each constant is a key of MI_CONSTANS; parse_mi_gpu_spec() also uses it as a
# positional index into the series list loaded from mi_gpu_spec.yaml.
MI50 = 0
MI100 = 1
MI200 = 2
MI300 = 3

# Maps each series constant to its lowercase series name.
MI_CONSTANS = {MI50: "mi50", MI100: "mi100", MI200: "mi200", MI300: "mi300"}

# Module-level lookup tables; they start empty and are filled by parse_mi_gpu_spec().
gpu_series_dict = {}  # key: gpu arch
gpu_model_dict = {}  # key: gpu_arch
mi300_archs_dict = {}  # key: gpu model
mi300_num_xcds_dict = {}  # key: gpu model
mi300_nps_dict = {}  # key: gpu model (NOTE: key can also be architecture)
mi300_chip_id_dict = {}  # key: chip id (int)
|
||||
|
||||
|
||||
# ----------------------------
|
||||
# Data Class handling to preserve the hierarchical gpu information
|
||||
# ----------------------------
|
||||
|
||||
|
||||
@dataclass
class ComputePartitionMode:
    """
    Represents the compute partition mode.

    Attributes:
        num_xcds: XCD information for the compute partition mode, or None
            when it is not known.
    """

    # Declared as a real dataclass field. With a hand-written __init__ and no
    # fields, @dataclass generated an __eq__ that considered every instance
    # equal and a __repr__ that never showed the stored value.
    num_xcds: Any = None

    def get_num_xcds(self):
        """Return the stored XCD information (None when it was never set)."""
        return self.num_xcds
|
||||
|
||||
|
||||
class Singleton(object):
    """
    Base class that hands out one shared instance per subclass.

    The first construction of a subclass creates and caches the instance in
    `_instances`; later constructions return the cached object. `__init__`
    still runs on every construction, as usual for Python.
    """

    _instances = {}

    def __new__(class_, *args, **kwargs):
        if class_ not in class_._instances:
            # Constructor arguments are meant for __init__ only. Forwarding
            # them to object.__new__() raises TypeError whenever a subclass
            # is constructed with arguments.
            class_._instances[class_] = super(Singleton, class_).__new__(class_)
        return class_._instances[class_]
|
||||
|
||||
|
||||
@dataclass
class MIGPU(Singleton):
    """
    Singleton class representing the detected MI GPU of current system.
    Ensures only one instance exists.
    """

    _instance = None  # Class variable to hold the single instance

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super(MIGPU, cls).__new__(cls)
            cls._instance.mi_gpu_spec = []  # Initialize the instance attribute
        return cls._instance

    def __init__(
        self,
        gpu_series,
        gpu_arch,
        gpu_model,
        chip_id=None,
        mi300_arch=None,
        num_xcds=None,
    ):
        """
        gpu series, gpu_arch and gpu_model information must be available for a given MI GPU.

        Args:
            gpu_series (str): The GPU series name (e.g., 'mi50', 'mi100', 'mi200', 'mi300').
            gpu_arch (str): The GPU architecture name.
            gpu_model (str): The GPU model name.
            chip_id: Optional chip id of the GPU.
            mi300_arch: Optional mi300 specific architecture; its presence marks
                the GPU as an mi300 GPU.
            num_xcds: Optional XCD information wrapped in a ComputePartitionMode.
        """
        self.gpu_series = gpu_series
        self.gpu_arch = gpu_arch
        self.gpu_model = gpu_model
        self.chip_id = chip_id
        self.mi300_arch = mi300_arch
        self.compute_partition = ComputePartitionMode(num_xcds)

        self.is_mi300 = self.mi300_arch is not None

        # @dataclass never replaces a hand-written __init__, so the generated
        # call to __post_init__ does not exist; run the validation explicitly.
        self.__post_init__()

    def __post_init__(self):
        """Warn when an mi300 GPU was described without XCD information."""
        # NOTE: currently, all mi300 series gpus shall have compute partition information.
        # compute_partition is always a ComputePartitionMode object, so the
        # wrapped value is what has to be checked.
        if self.is_mi300 and self.compute_partition.get_num_xcds() is None:
            logging.warning(
                "[MIGPU post init] mi300 gpu detected, but no num_xcd/compute partition data detected!!!"
            )

    def set_chip_id(self, chip_id):
        """Store the chip id of the GPU."""
        self.chip_id = chip_id

    def set_mi300_arch(self, mi300_arch, num_xcds):
        """
        All mi300 series gpus shall have compute partition information.

        Updates the mi300 architecture, the compute partition and the
        `is_mi300` flag; logs a warning when `num_xcds` is None.
        """
        if num_xcds is None:
            logging.warning(
                "[MIGPU set_mi300_arch] mi300 gpu detected, but no num_xcd/compute partition data detected!!!"
            )

        self.mi300_arch = mi300_arch
        self.compute_partition = ComputePartitionMode(num_xcds)
        # Keep the flag in sync with the architecture that was just set.
        self.is_mi300 = self.mi300_arch is not None

    def get_gpu_series(self):
        """Return the GPU series name."""
        return self.gpu_series

    def get_gpu_arch(self):
        """Return the GPU architecture name."""
        return self.gpu_arch

    def get_gpu_model(self):
        """Return the GPU model name."""
        return self.gpu_model

    def get_chip_id(self):
        """Return the chip id (None when not set)."""
        return self.chip_id

    def get_mi300_arch(self):
        """Return the mi300 specific architecture (None for non-mi300 GPUs)."""
        return self.mi300_arch

    def get_compute_partition(self):
        """Return the ComputePartitionMode object of this GPU."""
        return self.compute_partition
|
||||
|
||||
|
||||
# ----------------------------
|
||||
# YAML Parsing and Data Handling
|
||||
# ----------------------------
|
||||
|
||||
|
||||
def load_yaml(file_path: str) -> Dict[str, Any]:
    """
    Loads the MI GPU YAML data into a Python dictionary.

    Args:
        file_path (str): The path to the YAML file.

    Returns:
        Dict[str, Any]: Parsed YAML data as a nested dictionary.

    Exits with status 1 after logging an error if the file is missing,
    cannot be parsed, or any other error occurs while loading it.
    """
    logging.debug("[load_yaml]")
    try:
        # YAML streams are UTF-8 by default; do not depend on the locale.
        with open(file_path, "r", encoding="utf-8") as file:
            return yaml.safe_load(file)
    except FileNotFoundError:
        logging.error(f"Error: The file '{file_path}' was not found.")
    except yaml.YAMLError as exc:
        logging.error(f"Error parsing YAML file '{file_path}': {exc}")
    except Exception as e:
        logging.error(
            f"An unexpected error occurred while loading YAML file '{file_path}': {e}"
        )

    # Only reached after an error was logged. Falling through used to return
    # None silently, contradicting the documented contract and failing later
    # with an unrelated AttributeError in the caller.
    sys.exit(1)
|
||||
|
||||
|
||||
def _record_mi300_model(model):
    """Store the mi300-only details of one YAML model entry in the module tables."""
    gpu_model = model["gpu_model"]
    mi300_arch = model["mi300_arch"]
    partition_mode = mi300_arch["partition_mode"]

    # NOTE: mi300 architecture is available for all mi300 gpu models
    mi300_archs_dict[gpu_model] = mi300_arch["architecture"]

    # NOTE: compute partition mode num xcds is available for all mi300 gpu models
    mi300_num_xcds_dict[gpu_model] = partition_mode["compute_partition_mode"][
        "num_xcds"
    ]

    # NOTE: memory partition mode nps is available for all mi300 gpu models
    mi300_nps_dict[gpu_model] = partition_mode["memory_partition_mode"]

    # Save the chip_id -> gpu_model pair only when a chip id is available.
    chip_id = model["chip_ids"]["local"]
    if chip_id is not None:
        mi300_chip_id_dict[chip_id] = gpu_model


def parse_mi_gpu_spec():
    """
    Parse out mi gpu data from yaml file and store in memory.

    MI GPUs
    |-- series
        |-- architecture (list)
            |-- models
                |-- chip_ids
                |-- mi300_arch
                    |-- partition_mode

    Fills the module-level tables gpu_series_dict, gpu_model_dict,
    mi300_archs_dict, mi300_num_xcds_dict, mi300_nps_dict and
    mi300_chip_id_dict. Exits with status 1 when no spec data could be loaded.
    """
    current_dir = os.path.dirname(__file__)
    yaml_file_path = os.path.join(current_dir, "mi_gpu_spec.yaml")

    # Load the YAML data
    yaml_data = load_yaml(yaml_file_path)
    if not yaml_data:
        # Covers both a failed load and an empty file; iterating None would
        # otherwise fail with an unhelpful AttributeError.
        logging.error("No MI GPU spec data could be loaded from '%s'.", yaml_file_path)
        sys.exit(1)

    mi300_series = MI_CONSTANS[MI300]

    for mi_index, mi_series in MI_CONSTANS.items():
        logging.debug("[parse_mi_gpu_spec] Processing series: %s", mi_series)
        for series_list in yaml_data.values():
            # The series constants double as positions in the YAML series list.
            for arch_entry in series_list[mi_index]["gpu_archs"]:
                curr_gpu_arch = arch_entry["gpu_arch"]
                models = arch_entry["models"]

                gpu_series_dict[curr_gpu_arch] = mi_series
                gpu_model_dict[curr_gpu_arch] = [m["gpu_model"] for m in models]

                # MI300 requires specific processing
                if mi_series == mi300_series:
                    for model in models:
                        _record_mi300_model(model)
|
||||
|
||||
|
||||
def get_gpu_series_dict():
    """
    Return the gpu arch -> gpu series table.

    Logs an error and returns None while the table is still empty.
    """
    if gpu_series_dict:
        return gpu_series_dict

    logging.error("gpu_series_dict not yet populated, did you run parse_mi_gpu_spec()?")
    return None
|
||||
|
||||
|
||||
def get_gpu_series(gpu_arch_):
    """
    Look up the gpu series for a gpu architecture name.

    The name is tried as given first and in lowercase second. Returns None
    when the table is empty (error logged) or no series matches (warning logged).
    """
    if not gpu_series_dict:
        logging.error(
            "gpu_series_dict not yet populated, did you run parse_mi_gpu_spec()?"
        )
        return None

    # Raw key first; fall back to the lowercase spelling only when needed.
    series = gpu_series_dict.get(gpu_arch_)
    if not series:
        series = gpu_series_dict.get(gpu_arch_.lower())

    if not series:
        logging.warning(f"No matching gpu series found for gpu arch: {gpu_arch_}")
        return None

    return series
|
||||
|
||||
|
||||
def get_gpu_model(gpu_arch_, chip_id_):
    """
    Resolve the gpu model name for a gpu architecture.

    Args:
        gpu_arch_: GPU architecture name; matched case-insensitively.
        chip_id_: Chip id (int or numeric string); only used for gfx942.

    Returns:
        The gpu model name, or None (with a logged message) when the tables
        are empty, the architecture is unknown, or no model matches the chip id.
    """
    # Check that gpu_model_dict is populated first
    if not gpu_model_dict:
        logging.error(
            "gpu_model_dict not yet populated. Did you run parse_mi_gpu_spec()?"
        )
        return None

    # A missing or non-string arch falls through to the "unknown arch" warning
    # instead of raising AttributeError.
    gpu_arch_lower = gpu_arch_.lower() if isinstance(gpu_arch_, str) else None

    # Handle gfx942 with chip_id mapping
    if gpu_arch_lower == "gfx942":
        try:
            chip_id = int(chip_id_) if chip_id_ else None
        except (TypeError, ValueError):
            # A non-numeric chip id cannot match any known id.
            chip_id = None

        if chip_id is None or chip_id not in mi300_chip_id_dict:
            logging.warning(f"No gpu model found for chip id: {chip_id_}")
            return None
        gpu_model = mi300_chip_id_dict[chip_id]

    # Otherwise use gpu_model_dict mapping for other mi architectures
    elif gpu_arch_lower in gpu_model_dict:
        # NOTE: take the first element works for now
        models = gpu_model_dict[gpu_arch_lower]
        gpu_model = models[0] if models else None
    else:
        logging.warning(f"No gpu model found for gpu arch: {gpu_arch_lower}")
        return None

    if not gpu_model:
        logging.warning(f"No gpu model found for gpu arch: {gpu_arch_lower}")
        return None

    return gpu_model
|
||||
|
||||
|
||||
def get_mi300_archs_dict():
    """
    Return the gpu model -> mi300 architecture table.

    Logs an error and returns None while the table is still empty.
    """
    if mi300_archs_dict:
        return mi300_archs_dict

    logging.error("mi300_archs_dict not yet populated, did you run parse_mi_gpu_spec()?")
    return None
|
||||
|
||||
|
||||
def get_mi300_num_xcds(gpu_model_, compute_partition_):
    """
    Return the number of xcds for a gpu model / compute partition pair.

    Both arguments are matched in lowercase. Returns None (with a logged
    message) when the table is empty, the model is not in the table, the
    partition is unknown for the model, or no xcd count is recorded for it.
    """
    if not mi300_num_xcds_dict:
        logging.error(
            "mi300_num_xcds_dict not yet populated, did you run parse_mi_gpu_spec()?"
        )
        return None

    model_key, partition_key = gpu_model_.lower(), compute_partition_.lower()

    try:
        partitions = mi300_num_xcds_dict[model_key]
    except KeyError:
        logging.info(f"Current system is not a mi300 system: {gpu_model_}")
        return None

    if partition_key not in partitions:
        logging.info(f"Unknown compute partition: {compute_partition_}")
        return None

    xcd_count = partitions[partition_key]
    if xcd_count:
        return xcd_count

    # The partition is listed for this model but carries no xcd count.
    logging.warning(
        "Unknown compute partition found for %s / %s", compute_partition_, gpu_model_
    )
    return None
|
||||
|
||||
|
||||
def get_mi300_chip_id_dict():
    """
    Return the chip id -> gpu model table.

    Logs an error and returns None while the table is still empty.
    """
    if not mi300_chip_id_dict:
        logging.error(
            "mi300_chip_id_dict not yet populated, did you run parse_mi_gpu_spec()?"
        )
        return None

    return mi300_chip_id_dict
|
||||
@@ -0,0 +1,164 @@
|
||||
# --------------------------------------------------------------------------------
|
||||
#
|
||||
# This yaml file tracks MI gpu spec in a tree structure.
|
||||
#
|
||||
# *It is important to note that this file only tracks the common specs for MI GPU series.*
|
||||
# *For example, the CU #s are based on information retrieved from other tools.*
|
||||
# **
|
||||
#
|
||||
# MI GPUs
|
||||
# |-- series: the specific MI series; mi50, mi100, mi200, mi300
|
||||
# |-- architecture: currently, only mi300 gpus hold different architectures
|
||||
# |-- models
|
||||
# |-- chip_ids: chip id is specific to the environment the gpu is being used on
|
||||
# |-- mi300_arch: mi300 specific architectures; mi300a, mi300x
|
||||
# |-- partition_mode: currently, only mi300 gpus hold partition mode information
|
||||
# two types: compute partition mode, memory partition mode,
|
||||
# currently only mi300 gpus contain compute partition mode information on the number of xcds
|
||||
#
|
||||
# --------------------------------------------------------------------------------
|
||||
|
||||
mi_gpu_spec:
|
||||
- gpu_series: mi50
|
||||
gpu_archs:
|
||||
- gpu_arch: gfx906
|
||||
models:
|
||||
- gpu_model: mi50
|
||||
mi300_arch:
|
||||
architecture: null
|
||||
partition_mode: null
|
||||
chip_ids:
|
||||
local: null
|
||||
- gpu_model: mi60
|
||||
mi300_arch:
|
||||
architecture: null
|
||||
partition_mode: null
|
||||
chip_ids:
|
||||
local: null
|
||||
|
||||
- gpu_series: mi100
|
||||
gpu_archs:
|
||||
- gpu_arch: gfx908
|
||||
models:
|
||||
- gpu_model: mi100
|
||||
mi300_arch:
|
||||
architecture: null
|
||||
partition_mode: null
|
||||
chip_ids:
|
||||
local: null
|
||||
|
||||
- gpu_series: mi200
|
||||
gpu_archs:
|
||||
- gpu_arch: gfx90a
|
||||
models:
|
||||
- gpu_model: mi210
|
||||
mi300_arch:
|
||||
architecture: null
|
||||
partition_mode: null
|
||||
chip_ids:
|
||||
local: null
|
||||
- gpu_model: mi250
|
||||
mi300_arch:
|
||||
architecture: null
|
||||
partition_mode: null
|
||||
chip_ids:
|
||||
local: null
|
||||
- gpu_model: mi250x
|
||||
mi300_arch:
|
||||
architecture: null
|
||||
partition_mode: null
|
||||
chip_ids:
|
||||
local: null
|
||||
|
||||
- gpu_series: mi300
|
||||
gpu_archs:
|
||||
- gpu_arch: gfx940
|
||||
models:
|
||||
- gpu_model: mi300a_a0
|
||||
mi300_arch:
|
||||
architecture: mi300a
|
||||
partition_mode:
|
||||
compute_partition_mode:
|
||||
num_xcds:
|
||||
spx: 6
|
||||
dpx: null
|
||||
tpx: 2
|
||||
qpx: null
|
||||
cpx: null
|
||||
memory_partition_mode:
|
||||
nps4: {tpx}
|
||||
nps1: {spx, tpx}
|
||||
chip_ids:
|
||||
local: null
|
||||
|
||||
- gpu_arch: gfx941
|
||||
models:
|
||||
- gpu_model: mi300x_a0
|
||||
mi300_arch:
|
||||
architecture: mi300x
|
||||
partition_mode:
|
||||
compute_partition_mode:
|
||||
num_xcds:
|
||||
spx: 8
|
||||
dpx: 4
|
||||
tpx: null
|
||||
qpx: 2
|
||||
cpx: 1
|
||||
memory_partition_mode:
|
||||
nps4: {qpx, cpx}
|
||||
nps1: {spx, qpx, cpx}
|
||||
chip_ids:
|
||||
local: null
|
||||
|
||||
- gpu_arch: gfx942
|
||||
models:
|
||||
- gpu_model: mi300a_a1
|
||||
mi300_arch:
|
||||
architecture: mi300a
|
||||
partition_mode:
|
||||
compute_partition_mode:
|
||||
num_xcds:
|
||||
spx: 6
|
||||
dpx: null
|
||||
tpx: 2
|
||||
qpx: null
|
||||
cpx: null
|
||||
memory_partition_mode:
|
||||
nps4: {tpx}
|
||||
nps1: {spx, tpx}
|
||||
chip_ids:
|
||||
local: 29856
|
||||
|
||||
- gpu_model: mi300x_a1
|
||||
mi300_arch:
|
||||
architecture: mi300x
|
||||
partition_mode:
|
||||
compute_partition_mode:
|
||||
num_xcds:
|
||||
spx: 8
|
||||
dpx: 4
|
||||
tpx: null
|
||||
qpx: 2
|
||||
cpx: 1
|
||||
memory_partition_mode:
|
||||
nps4: {qpx, cpx}
|
||||
nps1: {spx, qpx, cpx}
|
||||
chip_ids:
|
||||
local: 29857
|
||||
|
||||
- gpu_model: mi308x
|
||||
mi300_arch:
|
||||
architecture: mi308x
|
||||
partition_mode:
|
||||
compute_partition_mode:
|
||||
num_xcds:
|
||||
spx: 4
|
||||
dpx: 2
|
||||
tpx: null
|
||||
qpx: null
|
||||
cpx: 1
|
||||
memory_partition_mode:
|
||||
nps4: {cpx}
|
||||
nps1: {spx, dpx, cpx}
|
||||
chip_ids:
|
||||
local: 29858
|
||||
+41
-18
@@ -29,6 +29,7 @@ import os
|
||||
import re
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
from dataclasses import dataclass, field, fields
|
||||
from datetime import datetime
|
||||
from math import ceil
|
||||
@@ -37,6 +38,7 @@ from pathlib import Path as path
|
||||
import pandas as pd
|
||||
|
||||
import config
|
||||
from utils.mi_gpu_spec import get_gpu_series_dict, get_mi300_chip_id_dict
|
||||
from utils.tty import get_table_string
|
||||
from utils.utils import (
|
||||
console_debug,
|
||||
@@ -60,21 +62,40 @@ VERSION_LOC = [
|
||||
|
||||
|
||||
def detect_arch(_rocminfo):
|
||||
from rocprof_compute_base import SUPPORTED_ARCHS
|
||||
|
||||
for idx1, linetext in enumerate(_rocminfo):
|
||||
gpu_arch = search(r"^\s*Name\s*:\s+ ([a-zA-Z0-9]+)\s*$", linetext)
|
||||
if gpu_arch in SUPPORTED_ARCHS.keys():
|
||||
# NOTE: currently supported socs are gfx archs only
|
||||
gpu_arch = search(r"^\s*Name\s*:\s* ([Gg][Ff][Xx][a-zA-Z0-9]+).*\s*$", linetext)
|
||||
if gpu_arch in get_gpu_series_dict().keys():
|
||||
break
|
||||
if str(gpu_arch) in SUPPORTED_ARCHS.keys():
|
||||
if str(gpu_arch) in get_gpu_series_dict().keys():
|
||||
gpu_arch = str(gpu_arch)
|
||||
break
|
||||
if not gpu_arch in SUPPORTED_ARCHS.keys():
|
||||
console_error("Cannot find a supported arch in rocminfo")
|
||||
if not gpu_arch in get_gpu_series_dict().keys():
|
||||
console_error("Cannot find a supported arch in rocminfo: " + str(gpu_arch))
|
||||
else:
|
||||
return (gpu_arch, idx1)
|
||||
|
||||
|
||||
def detect_gpu_chip_id(_rocminfo):
    """
    Scan rocminfo lines for a Chip ID that is present in the mi300 chip id table.

    Args:
        _rocminfo: Iterable of rocminfo output lines.

    Returns:
        The chip id as a string when a known id is found; otherwise the
        result of the last line's search (a console warning is emitted).
    """
    # The getter returns None while its table is empty; look it up once and
    # fall back to an empty mapping so the membership tests stay valid.
    known_chip_ids = get_mi300_chip_id_dict() or {}

    gpu_chip_id = None  # keeps the name bound when _rocminfo is empty
    for linetext in _rocminfo:
        # NOTE: current supported socs only have numbers in Chip ID
        gpu_chip_id = search(r"^\s*Chip ID\s*:\s* ([0-9]+).*\s*$", linetext)
        if not gpu_chip_id:
            continue
        if int(gpu_chip_id) in known_chip_ids or str(gpu_chip_id) in known_chip_ids:
            gpu_chip_id = str(gpu_chip_id)
            break

    # NOTE(review): when no known id is found, gpu_chip_id holds whatever the
    # final line yielded, so an unknown id seen earlier is reported as
    # "No Chip ID detected" - confirm whether the first id seen should be kept.
    if not gpu_chip_id:
        console_warning("No Chip ID detected: " + str(gpu_chip_id))
    elif (
        gpu_chip_id not in known_chip_ids
        and int(gpu_chip_id) not in known_chip_ids
    ):
        console_warning("Unknown Chip ID detected: " + str(gpu_chip_id))
    return gpu_chip_id
|
||||
|
||||
|
||||
# Custom decorator to mimic the behavior of kw_only found in Python 3.10
|
||||
def kw_only(cls):
|
||||
def __init__(self, *args, **kwargs):
|
||||
@@ -163,6 +184,7 @@ def generate_machine_specs(args, sysinfo: dict = None):
|
||||
_rocminfo = rocminfo_full.split("\n")
|
||||
gpu_arch, idx = detect_arch(_rocminfo)
|
||||
_rocminfo = _rocminfo[idx + 1 :] # update rocminfo for target section
|
||||
gpu_chip_id = detect_gpu_chip_id(_rocminfo)
|
||||
specs = MachineSpecs(
|
||||
version=specs_version,
|
||||
timestamp=timestamp,
|
||||
@@ -180,7 +202,9 @@ def generate_machine_specs(args, sysinfo: dict = None):
|
||||
compute_partition=compute_partition,
|
||||
memory_partition=memory_partition,
|
||||
gpu_arch=gpu_arch,
|
||||
gpu_chip_id=gpu_chip_id,
|
||||
)
|
||||
|
||||
# Load above SoC specs via module import
|
||||
try:
|
||||
soc_module = importlib.import_module("rocprof_compute_soc.soc_" + specs.gpu_arch)
|
||||
@@ -367,6 +391,14 @@ class MachineSpecs:
|
||||
"name": "GPU Arch",
|
||||
},
|
||||
)
|
||||
gpu_chip_id: str = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"doc": "The Chip ID of the accelerators/GPUs in the system.",
|
||||
"name": "Chip ID",
|
||||
"optional": True,
|
||||
},
|
||||
)
|
||||
gpu_l1: str = field(
|
||||
default=None,
|
||||
metadata={
|
||||
@@ -420,13 +452,6 @@ class MachineSpecs:
|
||||
"name": "Workgroup Max Size",
|
||||
},
|
||||
)
|
||||
chip_id: str = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"doc": "The Chip ID of the accelerators/GPUs in the system.",
|
||||
"name": "Chip ID",
|
||||
},
|
||||
)
|
||||
max_waves_per_cu: str = field(
|
||||
default=None,
|
||||
metadata={
|
||||
@@ -661,11 +686,9 @@ def total_sqc(archname, numCUs, numSEs):
|
||||
|
||||
|
||||
def total_l2_banks(archname, L2Banks, compute_partition):
|
||||
# Fixme: support all supported partitioning mode
|
||||
# Fixme: "name" is a bad name!
|
||||
totalL2Banks = L2Banks
|
||||
xcds = total_xcds(archname, compute_partition)
|
||||
return L2Banks * xcds
|
||||
totalL2Banks = L2Banks * xcds
|
||||
return totalL2Banks
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
+32
-19
@@ -42,6 +42,7 @@ from pathlib import Path as path
|
||||
import pandas as pd
|
||||
|
||||
import config
|
||||
from utils.mi_gpu_spec import get_mi300_num_xcds
|
||||
|
||||
rocprof_cmd = ""
|
||||
rocprof_args = ""
|
||||
@@ -686,7 +687,7 @@ def run_prof(
|
||||
if new_env and not using_v3():
|
||||
# flatten tcc for applicable mi300 input
|
||||
f = path(workload_dir + "/out/pmc_1/results_" + fbase + ".csv")
|
||||
xcds = total_xcds(mspec.gpu_model, mspec.compute_partition)
|
||||
xcds = get_mi300_num_xcds(mspec.gpu_model, mspec.compute_partition)
|
||||
df = flatten_tcc_info_across_xcds(f, xcds, int(mspec._l2_banks))
|
||||
df.to_csv(f, index=False)
|
||||
|
||||
@@ -835,6 +836,7 @@ def replace_timestamps(workload_dir):
|
||||
def gen_sysinfo(
|
||||
workload_name, workload_dir, ip_blocks, app_cmd, skip_roof, roof_only, mspec, soc
|
||||
):
|
||||
console_debug("[gen_sysinfo]")
|
||||
df = mspec.get_class_members()
|
||||
|
||||
# Append workload information to machine specs
|
||||
@@ -1051,47 +1053,58 @@ def flatten_tcc_info_across_xcds(file, xcds, tcc_channel_per_xcd):
|
||||
return df
|
||||
|
||||
|
||||
def total_xcds(archname, compute_partition):
|
||||
def total_xcds(gpu_model, compute_partition):
|
||||
"""
|
||||
Returns the number of xcds for a gpu model and compute_partition pair.
|
||||
"""
|
||||
|
||||
# For mi300 chips, return result from mi_gpu_spec
|
||||
result = get_mi300_num_xcds(gpu_model, compute_partition)
|
||||
if result:
|
||||
return result
|
||||
|
||||
# For other systems, use manual check
|
||||
# check MI300 has a valid compute partition
|
||||
mi300a_archs = ["mi300a_a0", "mi300a_a1"]
|
||||
mi300x_archs = ["mi300x_a0", "mi300x_a1"]
|
||||
mi308x_archs = ["mi308x"]
|
||||
mi300a_model = ["mi300a_a0", "mi300a_a1"]
|
||||
mi300x_model = ["mi300x_a0", "mi300x_a1"]
|
||||
mi308x_model = ["mi308x"]
|
||||
if (
|
||||
archname.lower() in mi300a_archs + mi300x_archs + mi308x_archs
|
||||
gpu_model.lower() in mi300a_model + mi300x_model + mi308x_model
|
||||
and compute_partition == "NA"
|
||||
):
|
||||
console_error("Invalid compute partition found for {}".format(archname))
|
||||
if archname.lower() not in mi300a_archs + mi300x_archs + mi308x_archs:
|
||||
console_error("Invalid compute partition found for {}".format(gpu_model))
|
||||
|
||||
if gpu_model.lower() not in mi300a_model + mi300x_model + mi308x_model:
|
||||
return 1
|
||||
# from the whitepaper
|
||||
# https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf
|
||||
if compute_partition.lower() == "spx":
|
||||
if archname.lower() in mi300a_archs:
|
||||
if gpu_model.lower() in mi300a_model:
|
||||
return 6
|
||||
if archname.lower() in mi300x_archs:
|
||||
if gpu_model.lower() in mi300x_model:
|
||||
return 8
|
||||
if archname.lower() in mi308x_archs:
|
||||
if gpu_model.lower() in mi308x_model:
|
||||
return 4
|
||||
if compute_partition.lower() == "tpx":
|
||||
if archname.lower() in mi300a_archs:
|
||||
if gpu_model.lower() in mi300a_model:
|
||||
return 2
|
||||
if compute_partition.lower() == "dpx":
|
||||
if archname.lower() in mi300x_archs:
|
||||
if gpu_model.lower() in mi300x_model:
|
||||
return 4
|
||||
if archname.lower() in mi308x_archs:
|
||||
if gpu_model.lower() in mi308x_model:
|
||||
return 2
|
||||
if compute_partition.lower() == "qpx":
|
||||
if archname.lower() in mi300x_archs:
|
||||
if gpu_model.lower() in mi300x_model:
|
||||
return 2
|
||||
if compute_partition.lower() == "cpx":
|
||||
if archname.lower() in mi300x_archs:
|
||||
return 2
|
||||
if archname.lower() in mi308x_archs:
|
||||
if gpu_model.lower() in mi300x_model:
|
||||
return 1
|
||||
if gpu_model.lower() in mi308x_model:
|
||||
return 1
|
||||
# TODO implement other archs here as needed
|
||||
console_error(
|
||||
"Unknown compute partition / arch found for {} / {}".format(
|
||||
compute_partition, archname
|
||||
compute_partition, gpu_model
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user