Files
rocm-systems/src/utils/specs.py
T
Karl W Schulz ca59cbe3a3 apply formatter
Signed-off-by: Karl W Schulz <karl.schulz@amd.com>
2024-03-02 12:28:34 -06:00

668 строки
23 KiB
Python

"""Get host/gpu specs."""
##############################################################################bl
# MIT License
#
# Copyright (c) 2021 - 2024 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
##############################################################################el
import os
import re
import sys
import socket
import subprocess
import importlib
import logging
import config
import pandas as pd
from datetime import datetime
from math import ceil
from dataclasses import dataclass, field, asdict, fields
from pathlib import Path as path
from textwrap import dedent
from utils.utils import error, get_hbm_stack_num, get_version
from utils.tty import get_table_string
# Candidate file names that get_rocm_ver() probes, in this order, underneath
# "<ROCM_PATH>/.info/"; the first one that exists supplies the ROCm version.
VERSION_LOC = [
    "version",
    "version-dev",
    "version-hip-libraries",
    "version-hiprt",
    "version-hiprt-devel",
    "version-hip-sdk",
    "version-libs",
    "version-utils",
]
def detect_arch(_rocminfo):
    """Find the first supported GPU architecture named in rocminfo output.

    Args:
        _rocminfo: Iterable of text lines (rocminfo output split on newlines).

    Returns:
        A ``(gpu_arch, index)`` tuple, where ``gpu_arch`` is a key of
        ``SUPPORTED_ARCHS`` and ``index`` is the position of the matching
        line within ``_rocminfo``.

    If no line names a supported architecture (including when ``_rocminfo``
    is empty) ``error()`` is called with a diagnostic message.
    """
    # Imported lazily, as in the rest of this function's history, so that
    # importing this module does not require omniperf_base up front.
    from omniperf_base import SUPPORTED_ARCHS

    for idx, linetext in enumerate(_rocminfo):
        # search() yields the captured name as a str, or None when the line
        # is not a "Name:" line; None is never a supported-arch key.
        gpu_arch = search(r"^\s*Name\s*:\s+ ([a-zA-Z0-9]+)\s*$", linetext)
        if gpu_arch in SUPPORTED_ARCHS:
            return (gpu_arch, idx)

    # Reached when the loop found nothing, which also covers empty input
    # (no loop iteration ever binds gpu_arch in that case).
    error("[profiling] Cannot find a supported arch in rocminfo")
# Stand-in for the kw_only behaviour that dataclasses only gained in Python 3.10
def kw_only(cls):
    """Class decorator that swaps in a keyword-driven ``__init__``.

    The replacement initializer stores every keyword argument on the instance
    under the same name. Positional arguments are accepted but not used, and
    attributes that are not passed keep whatever class-level value exists.

    Returns:
        The same class object, with ``__init__`` replaced.
    """

    def __init__(self, *args, **kwargs):
        for attribute in kwargs:
            setattr(self, attribute, kwargs[attribute])

    cls.__init__ = __init__
    return cls
def generate_machine_specs(args, sysinfo: dict = None):
    """Build the MachineSpecs record for this host, or rebuild one from saved data.

    Args:
        args: Forwarded unchanged to the architecture-specific SoC class
            constructor.
        sysinfo: Optional mapping of previously saved spec fields. When given,
            nothing on the system is probed: the mapping is expanded directly
            into MachineSpecs after a version sanity check (a mismatch only
            logs a warning).

    Returns:
        A MachineSpecs instance.
    """
    if sysinfo is not None:
        recorded_major = str(sysinfo["version"])
        tool_version = get_version(config.omniperf_home)["version"]
        current_major = tool_version[: tool_version.find(".")]
        if recorded_major != current_major:
            logging.warning(
                "WARNING: Detected mismatch in sysinfo versioning. You may need to reprofile to update data."
            )
        return MachineSpecs(**sysinfo)

    # Timestamp in local time, tagged with the local timezone name
    now = datetime.now()
    aware_now = now.astimezone()
    tz_label = aware_now.tzinfo.tzname(aware_now)
    timestamp = f"{now.strftime('%c')} ({tz_label})"

    hostname = socket.gethostname()

    # NB: Only the major component is used as the specs version for now;
    # the tool version is expected to follow 'major.minor.patch'.
    tool_version = get_version(config.omniperf_home)["version"]
    specs_version = tool_version[: tool_version.find(".")]

    ##########################################
    ## A. Machine Specs
    ##########################################
    cpuinfo = path("/proc/cpuinfo").read_text()
    meminfo = path("/proc/meminfo").read_text()
    proc_version = path("/proc/version").read_text()
    os_release = path("/etc/os-release").read_text()

    cpu_model = search(r"^model name\s*: (.*?)$", cpuinfo)

    bios_vendor = path("/sys/class/dmi/id/bios_vendor").read_text().strip()
    bios_version = path("/sys/class/dmi/id/bios_version").read_text().strip()
    sbios = bios_vendor + bios_version

    linux_kernel_version = search(r"version (\S*)", proc_version)
    cpu_memory = search(r"MemTotal:\s*(\S*)", meminfo)

    distro_match = search(r'PRETTY_NAME="(.*?)"', os_release)
    linux_distro = "" if distro_match is None else distro_match

    rocm_version = get_rocm_ver().strip()

    # FIXME: use device
    smi_version_output = run(["rocm-smi", "-v"], exit_on_error=True)
    vbios = search(r"VBIOS version: (.*?)$", smi_version_output)

    compute_output = run(["rocm-smi", "--showcomputepartition"])
    compute_match = search(r"Compute Partition:\s*(\w+)", compute_output)
    compute_partition = "NA" if compute_match is None else compute_match

    memory_output = run(["rocm-smi", "--showmemorypartition"])
    memory_match = search(r"Memory Partition:\s*(\w+)", memory_output)
    memory_partition = "NA" if memory_match is None else memory_match

    ##########################################
    ## B. SoC Specs
    ##########################################
    rocminfo_lines = run(["rocminfo"]).split("\n")
    gpu_arch, arch_line = detect_arch(rocminfo_lines)
    # keep only the lines that follow the detected architecture's Name line
    rocminfo_lines = rocminfo_lines[arch_line + 1 :]

    specs = MachineSpecs(
        version=specs_version,
        timestamp=timestamp,
        _rocminfo=rocminfo_lines,
        hostname=hostname,
        cpu_model=cpu_model,
        sbios=sbios,
        linux_kernel_version=linux_kernel_version,
        amd_gpu_kernel_version="",  # TODO: Extract amdgpu kernel version
        cpu_memory=cpu_memory,
        gpu_memory="",  # TODO: Extract gpu memory
        linux_distro=linux_distro,
        rocm_version=rocm_version,
        vbios=vbios,
        compute_partition=compute_partition,
        memory_partition=memory_partition,
        gpu_arch=gpu_arch,
    )

    # Resolve the architecture-specific SoC implementation by module name
    soc_module_name = "omniperf_soc.soc_" + specs.gpu_arch
    try:
        soc_module = importlib.import_module(soc_module_name)
    except ModuleNotFoundError as e:
        error(
            "Arch %s marked as supported, but couldn't find class implementation %s."
            % (specs.gpu_arch, e)
        )
    soc_class = getattr(soc_module, specs.gpu_arch + "_soc")
    # NOTE(review): presumably the SoC constructor fills in the arch-specific
    # fields on `specs` (e.g. gpu_model, _l2_banks, max_mclk) that are read
    # just below -- confirm against the omniperf_soc modules.
    soc_obj = soc_class(args, specs)

    # Derived, architecture-dependent values
    specs.total_l2_chan = total_l2_banks(
        specs.gpu_model, int(specs._l2_banks), specs.memory_partition
    )
    mclk_scaled = int(specs.max_mclk) / 1000
    specs.hbm_bw = str(mclk_scaled * 32 * specs.get_hbm_channels())
    return specs
@kw_only
@dataclass
class MachineSpecs:
    """Description of the machine (host and accelerator) data was collected on.

    Every field defaults to None. Field ``metadata`` drives presentation in
    ``__repr__``: ``name`` is the display label, ``doc`` the description,
    ``unit`` the unit column, ``intable=False`` keeps a field out of the
    table, and ``optional=True`` lets ``get_class_members`` accept None.
    Fields whose names start with an underscore are skipped by both methods.
    """

    ##########################################
    ## A. Workload / Spec info
    ##########################################

    # these three fields are special in that they're not included
    # when you use (e.g.,) --specs to view the machinespecs, but they
    # _are_ included in profiling/analysis, so we mark them as 'optional'
    # in the metadata to avoid erroring out on missing fields on
    # serialization
    workload_name: str = field(
        default=None,
        metadata={
            "doc": "The name of the workload data was collected for.",
            "name": "Workload Name",
            "optional": True,
        },
    )
    command: str = field(
        default=None,
        metadata={
            "doc": "The command the workload was executed with.",
            "name": "Command",
            "optional": True,
        },
    )
    ip_blocks: str = field(
        default=None,
        metadata={
            "doc": "The hardware blocks profiling information was collected for.",
            "name": "IP Blocks",
            "optional": True,
        },
    )
    # Declared exactly once: a second identical declaration further down only
    # re-bound the same name and added nothing.
    timestamp: str = field(
        default=None,
        metadata={
            "doc": "The time (in local system time) when data was collected",
            "name": "Timestamp",
        },
    )
    version: str = field(
        default=None,
        metadata={
            "doc": "The version of the machine specification file format.",
            "name": "MachineSpecs Version",
            "intable": False,
        },
    )
    # Private: leading underscore keeps it out of get_class_members/__repr__.
    _rocminfo: list = field(default=None)

    ##########################################
    ## A. Machine Specs
    ##########################################
    hostname: str = field(
        default=None, metadata={"doc": "The hostname of the machine.", "name": "Hostname"}
    )
    cpu_model: str = field(
        default=None,
        metadata={"doc": "The model name of the CPU used.", "name": "CPU Model"},
    )
    sbios: str = field(
        default=None,
        metadata={
            "doc": "The system management bios version and vendor.",
            "name": "SBIOS",
        },
    )
    linux_distro: str = field(
        default=None,
        metadata={
            "doc": "The Linux distribution installed on the machine.",
            "name": "Linux Distribution",
        },
    )
    linux_kernel_version: str = field(
        default=None,
        metadata={
            "doc": "The Linux kernel version running on the machine.",
            "name": "Linux Kernel Version",
        },
    )
    amd_gpu_kernel_version: str = field(
        default=None,
        metadata={
            "doc": "[RESERVED] The version of the AMDGPU driver installed on the machine. Unimplemented.",
            "name": "AMD GPU Kernel Version",
        },
    )
    cpu_memory: str = field(
        default=None,
        metadata={
            "doc": "The total amount of memory available to the CPU.",
            "unit": "KB",
            "name": "CPU Memory",
        },
    )
    gpu_memory: str = field(
        default=None,
        metadata={
            "doc": "[RESERVED] The total amount of memory available to accelerators/GPUs in the system. Unimplemented.",
            "unit": "KB",
            "name": "GPU Memory",
        },
    )
    rocm_version: str = field(
        default=None,
        metadata={
            "doc": "The ROCm version used during data-collection.",
            "name": "ROCm Version",
        },
    )
    vbios: str = field(
        default=None,
        metadata={
            "doc": "The version of the accelerators/GPUs video bios in the system.",
            "name": "VBIOS",
        },
    )
    compute_partition: str = field(
        default=None,
        metadata={
            "doc": "The compute partitioning mode active on the accelerators/GPUs in the system (MI300 only).",
            "name": "Compute Partition",
        },
    )
    memory_partition: str = field(
        default=None,
        metadata={
            "doc": "The memory partitioning mode active on the accelerators/GPUs in the system (MI300 only).",
            "name": "Memory Partition",
        },
    )

    ##########################################
    ## B. SoC Specs
    ##########################################
    gpu_model: str = field(
        default=None,
        metadata={
            "doc": "The product name of the accelerators/GPUs in the system.",
            "name": "GPU Model",
        },
    )
    gpu_arch: str = field(
        default=None,
        metadata={
            "doc": "The architecture name of the accelerators/GPUs in the system,\n"
            "as used by (e.g.,) the AMDGPU backend of LLVM.",
            "name": "GPU Arch",
        },
    )
    gpu_l1: str = field(
        default=None,
        metadata={
            "doc": "The size of the vL1D cache (per compute-unit) on the accelerators/GPUs in the system in KiB",
            "name": "GPU L1",
        },
    )
    # The description previously duplicated the vL1D text of gpu_l1.
    gpu_l2: str = field(
        default=None,
        metadata={
            "doc": "The size of the L2 cache on the accelerators/GPUs in the system in KiB",
            "name": "GPU L2",
        },
    )
    cu_per_gpu: str = field(
        default=None,
        metadata={
            "doc": "The total number of compute units per accelerator/GPU in the system. On systems with configurable\n"
            "partitioning, (e.g., MI300) this is the total number of compute units in a partition.",
            "name": "CU per GPU",
        },
    )
    simd_per_cu: str = field(
        default=None,
        metadata={
            "doc": "The number of SIMD processors in a compute unit for the accelerators/GPUs in the system.",
            "name": "SIMD per CU",
        },
    )
    se_per_gpu: str = field(
        default=None,
        metadata={
            "doc": "The number of shader engines on the accelerators/GPUs in the system. On systems with configurable\n"
            "partitioning, (e.g., MI300) this is the total number of shader engines in a partition.",
            "name": "SE per GPU",
        },
    )
    wave_size: str = field(
        default=None,
        metadata={
            "doc": "The number of work-items in a wavefront on the accelerators/GPUs in the system.",
            "name": "Wave Size",
        },
    )
    workgroup_max_size: str = field(
        default=None,
        metadata={
            "doc": "The maximum number of work-items in a workgroup on the accelerators/GPUs in the system.",
            "name": "Workgroup Max Size",
        },
    )
    max_waves_per_cu: str = field(
        default=None,
        metadata={
            "doc": "The maximum number of wavefronts that can be resident on a compute unit on the\n"
            "accelerators/GPUs in the system",
            "name": "Max Waves per CU",
        },
    )
    max_sclk: str = field(
        default=None,
        metadata={
            "doc": "The maximum engine (compute-unit) clock rate of the accelerators/GPUs in the system.",
            "name": "Max SCLK",
            "unit": "MHz",
        },
    )
    max_mclk: str = field(
        default=None,
        metadata={
            "doc": "The maximum memory clock rate of the accelerators/GPUs in the system.",
            "name": "Max MCLK",
            "unit": "MHz",
        },
    )
    cur_sclk: str = field(
        default=None,
        metadata={
            "doc": "[RESERVED] The current engine (compute unit) clock rate of the accelerators/GPUs in the system. Unused.",
            "name": "Cur SCLK",
            "unit": "MHz",
        },
    )
    cur_mclk: str = field(
        default=None,
        metadata={
            "doc": "[RESERVED] The current memory clock rate of the accelerators/GPUs in the system. Unused.",
            "name": "Cur MCLK",
            "unit": "MHz",
        },
    )
    _l2_banks: str = None  # NB: This only used in flatten_tcc_info_across_hbm_stacks()
    total_l2_chan: str = field(
        default=None,
        metadata={
            "doc": "The maximum number of L2 cache channels on the accelerators/GPUs in the system. On systems with\n"
            "configurable partitioning, (e.g., MI300) this is the total number of L2 cache channels in a partition.",
            "name": "Total L2 Channels",
        },
    )
    lds_banks_per_cu: str = field(
        default=None,
        metadata={
            "doc": "The number of banks in the LDS for a compute unit on the accelerators/GPUs in the system.",
            "name": "LDS Banks per CU",
        },
    )
    sqc_per_gpu: str = field(
        default=None,
        metadata={
            "doc": "The number of L1I/sL1D caches on the accelerators/GPUs in the system. On systems with\n"
            "configurable partitioning, (e.g., MI300) this is the total number of L1I/sL1D caches in a partition.",
            "name": "SQC per GPU",
        },
    )
    pipes_per_gpu: str = field(
        default=None,
        metadata={
            "doc": "The number of scheduler-pipes on the accelerators/GPUs in the system.",
            "name": "Pipes per GPU",
        },
    )
    # NOTE(review): generate_machine_specs() computes this value as
    # max_mclk / 1000 * 32 * channels, which looks like GB/s rather than the
    # MB/s declared here -- confirm before changing the unit.
    hbm_bw: str = field(
        default=None,
        metadata={
            "doc": "The peak theoretical HBM bandwidth for the accelerators/GPUs in the system. On systems with\n"
            "configurable partitioning, (e.g., MI300) this is the peak theoretical HBM bandwidth for a partition.",
            "name": "HBM BW",
            "unit": "MB/s",
        },
    )
    # A die count is dimensionless; the "MB/s" unit that used to be attached
    # here was carried over from hbm_bw by mistake.
    num_xcd: str = field(
        default=None,
        metadata={
            "doc": "The total number of accelerator complex dies in a compute partition on the accelerators/GPUs in the\n"
            "system. For accelerators without partitioning (i.e., pre-MI300), this is considered to be one.",
            "name": "Num XCDs",
        },
    )

    def get_hbm_channels(self):
        """Return the HBM channel count derived from ``total_l2_chan``.

        Starts from ``int(self.total_l2_chan)`` and adds 32 for MI300A
        (a0/a1) when the memory partition is NPS1.
        """
        hbmchannels = int(self.total_l2_chan)
        is_mi300a = self.gpu_model.lower() in ("mi300a_a0", "mi300a_a1")
        if is_mi300a and self.memory_partition.lower() == "nps1":
            # we have an extra 32 channels for the CCD
            hbmchannels += 32
        return hbmchannels

    def get_class_members(self):
        """Return the public fields as a single-row pandas DataFrame.

        Fields whose name starts with "_" are skipped. A field that is None
        and not marked ``optional`` in its metadata triggers a warning; if
        any such field exists, ``error()`` is called once after the scan.
        """
        all_populated = True
        data = {}
        # fields() preserves declaration order, keeping column order stable
        for spec_field in fields(self):
            name = spec_field.name
            if name.startswith("_"):
                continue
            value = getattr(self, name)
            if value is None and not spec_field.metadata.get("optional"):
                # TODO: use proper logging function when that's merged
                logging.warning(
                    f"WARNING: Incomplete class definition for {self.gpu_arch}. "
                    f"Expecting populated {name} but detected None."
                )
                all_populated = False
            data[name] = value

        if not all_populated:
            error("Missing specs fields for %s" % self.gpu_arch)
        return pd.DataFrame(data, index=[0])

    def __repr__(self):
        """Render the specs as a header line plus a Spec/Value table.

        Fields flagged ``intable=False`` are printed above the table (only
        ``version`` is supported there); all other public fields become rows,
        with Description and Unit columns when any field supplies them.
        """
        topstr = "Machine Specifications: describing the state of the machine that Omniperf data was collected on.\n"
        data = []
        for spec_field in fields(self):
            name = spec_field.name
            if name.startswith("_"):
                continue
            _data = {}
            value = getattr(self, name)
            metadata = spec_field.metadata
            if metadata:
                # check out of table before any re-naming for pretty-printing
                if "intable" in metadata and not metadata["intable"]:
                    if name == "version":
                        topstr += f"Output version: {value}\n"
                    else:
                        error(f"Unknown out of table printing field: {name}")
                    continue
                if "name" in metadata:
                    name = metadata["name"]
                if "unit" in metadata:
                    _data["Unit"] = metadata["unit"]
                if "doc" in metadata:
                    _data["Description"] = metadata["doc"]
            _data["Spec"] = name
            _data["Value"] = value
            data.append(_data)

        df = pd.DataFrame(data)
        columns = ["Spec", "Value"]
        if "Description" in df.columns:
            columns += ["Description"]
        if "Unit" in df.columns:
            columns += ["Unit"]
        df = df[columns]
        # rows without a unit/description show an empty cell rather than NaN
        df = df.fillna("")
        return topstr + get_table_string(df, transpose=False, decimal=2)
def get_rocm_ver():
    """Return the ROCm version string for the local installation.

    Looks under ``$ROCM_PATH/.info/`` (``/opt/rocm`` when ROCM_PATH is unset)
    for the first existing file named in VERSION_LOC and returns its raw
    text. If none exists, the ROCM_VER environment variable is used instead;
    if that is unset as well, ``error()`` is called.
    """
    rocm_root = os.getenv("ROCM_PATH", "/opt/rocm")
    for entry in VERSION_LOC:
        candidate = os.path.join(rocm_root, ".info", entry)
        if os.path.exists(candidate):
            rocm_ver = path(candidate).read_text()
            break
    else:
        # no version file found: fall back to a user-supplied ROCM_VER
        user_override = os.getenv("ROCM_VER")
        if user_override is None:
            error(
                "Unable to detect a complete local ROCm installation.\nThe expected %s/.info/ versioning directory is missing. Please ensure you have valid ROCm installation."
                % rocm_root
            )
        else:
            logging.info(
                "Overriding missing ROCm version detection with ROCM_VER = %s"
                % user_override
            )
            rocm_ver = user_override
    return rocm_ver
def run(cmd, exit_on_error=False):
    """Execute ``cmd`` and return its standard output decoded as UTF-8.

    Args:
        cmd: Argument list handed to ``subprocess.run``.
        exit_on_error: When true, a failing exit status terminates the
            process with ``sys.exit(1)``. For ``rocm-smi`` the statuses 0
            and 2 are both tolerated; for any other command only 0 is.

    A missing executable is reported through ``error()``.
    """
    try:
        completed = subprocess.run(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
    except FileNotFoundError as e:
        error(
            f"Unable to parse specs. Can't find ROCm asset: {e.filename}\nTry passing a path to an existing workload results in 'analyze' mode."
        )

    if exit_on_error:
        status = completed.returncode
        if cmd[0] == "rocm-smi":
            if status not in (0, 2):
                logging.error("ERROR: No GPU detected. Unable to load rocm-smi")
                sys.exit(1)
        elif status != 0:
            logging.error("ERROR: command [%s] failed with non-zero exit code" % cmd)
            sys.exit(1)

    return completed.stdout.decode("utf-8")
def search(pattern, string):
    """Return capture group 1 of the first multiline match, or None.

    ``pattern`` is searched in ``string`` with ``re.MULTILINE``, so ``^`` and
    ``$`` anchor at line boundaries.
    """
    found = re.search(pattern, string, re.MULTILINE)
    return None if found is None else found.group(1)
def total_l2_banks(archname, L2Banks, memory_partition):
    """Return the total L2 bank count as a string.

    For the MI300A/MI300X a0/a1 models the per-stack bank count is multiplied
    by ``get_hbm_stack_num(archname, memory_partition)``; for every other
    model ``L2Banks`` is returned unchanged (stringified).
    """
    # Fixme: support all supported partitioning mode
    # Fixme: "name" is a bad name!
    multi_stack_models = ("mi300a_a0", "mi300a_a1", "mi300x_a0", "mi300x_a1")
    if archname.lower() in multi_stack_models:
        return str(L2Banks * get_hbm_stack_num(archname, memory_partition))
    return str(L2Banks)
def total_sqc(archname, numCUs, numSEs):
    """Return the total SQC count across all shader engines.

    The compute units per shader engine are divided by 3 on MI50/MI100 and
    by 2 on every other model, rounded up, then multiplied by the number of
    shader engines.
    """
    cus_per_engine = float(numCUs) / float(numSEs)
    cus_per_sqc = 3 if archname.lower() in ("mi50", "mi100") else 2
    sqc_per_engine = ceil(cus_per_engine / cus_per_sqc)
    return int(sqc_per_engine) * int(numSEs)
def total_xcds(archname, compute_partition):
    """Return the number of XCDs that make up one compute partition.

    Args:
        archname: GPU model name; matched case-insensitively against the
            MI300A/MI300X a0/a1 model names.
        compute_partition: Compute partition mode (e.g. "SPX"), matched
            case-insensitively, or "NA" when none was reported.

    Returns:
        1 for any non-MI300 model; otherwise the XCD count for the given
        model/partition combination.

    ``error()`` is called when an MI300 model reports "NA" or a combination
    that is not in the table.
    """
    mi300a_archs = ["mi300a_a0", "mi300a_a1"]
    mi300x_archs = ["mi300x_a0", "mi300x_a1"]

    arch = archname.lower()
    if arch in mi300a_archs:
        family = "mi300a"
    elif arch in mi300x_archs:
        family = "mi300x"
    else:
        # accelerators without partitioning are treated as a single XCD
        return 1

    # check MI300 has a valid compute partition
    if compute_partition == "NA":
        error("Invalid compute partition found for {}".format(archname))

    # from the whitepaper
    # https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf
    # MI300A has 6 XCDs, MI300X has 8; each mode splits them evenly.
    xcds_per_partition = {
        ("mi300a", "spx"): 6,
        ("mi300x", "spx"): 8,
        ("mi300a", "tpx"): 2,
        ("mi300x", "dpx"): 4,
        ("mi300x", "qpx"): 2,
        # CPX exposes every XCD as its own partition (8 partitions of 1 XCD)
        ("mi300x", "cpx"): 1,
    }
    count = xcds_per_partition.get((family, compute_partition.lower()))
    if count is not None:
        return count

    error(
        "Unknown compute partition / arch found for {} / {}".format(
            compute_partition, archname
        )
    )
if __name__ == "__main__":
    # generate_machine_specs() takes `args` as a required positional
    # parameter; no parsed CLI arguments exist when this module is run
    # directly, so None is passed explicitly.
    # NOTE(review): confirm the SoC classes tolerate args=None.
    print(generate_machine_specs(None))