diff --git a/src/omniperf_base.py b/src/omniperf_base.py index 527e9da926..5f67f6003d 100644 --- a/src/omniperf_base.py +++ b/src/omniperf_base.py @@ -29,32 +29,34 @@ import os from pathlib import Path import shutil from utils.specs import get_machine_specs -from utils.utils import demarcate, trace_logger, get_version, get_version_display, detect_rocprof, error +from utils.utils import demarcate, trace_logger, get_version, get_version_display, detect_rocprof, error, get_submodules from argparser import omniarg_parser import config import pandas as pd import importlib +SUPPORTED_ARCHS = { + "gfx906": {"mi50": ["MI50", "MI60"]}, + "gfx908": {"mi100": ["MI100"]}, + "gfx90a": {"mi200": ["MI210", "MI250", "MI250X"]}, + "gfx940": {"mi300": ["MI300A_A0"]}, + "gfx941": {"mi300": ["MI300X_A0"]}, + "gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]}, +} + class Omniperf: def __init__(self): self.__args = None self.__profiler_mode = None self.__analyze_mode = None - self.__soc_name = set() #TODO: Should we make this a list? To accommodate analyze mode + self.__soc_name = set() # gpu name, or in case of analyze mode, all loaded gpu name(s) self.__soc = dict() # set of key, value pairs. 
Where arch->OmniSoc() obj self.__version = { "ver": None, "ver_pretty": None, } self.__options = {} - self.__supported_archs = { - "gfx906": {"mi50": ["MI50", "MI60"]}, - "gfx908": {"mi100": ["MI100"]}, - "gfx90a": {"mi200": ["MI210", "MI250", "MI250X"]}, - "gfx940": {"mi300": ["MI300A_A0"]}, - "gfx941": {"mi300": ["MI300X_A0"]}, - "gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]}, - } + self.__supported_archs = SUPPORTED_ARCHS self.setup_logging() self.set_version() @@ -115,7 +117,7 @@ class Omniperf: self.__version["ver"] = vData["version"] self.__version["ver_pretty"] = get_version_display(vData["version"], vData["sha"], vData["mode"]) return - + def detect_profiler(self): #TODO: # Currently this will only be called in profile mode @@ -135,7 +137,7 @@ class Omniperf: elif str(rocprof_cmd).endswith("rocprofv2"): self.__profiler_mode = "rocprofv2" else: - error("Incompatible profiler. Please review documentation.") + error("Incompatible profiler: %s. Supported profilers include: %s" % (rocprof_cmd, get_submodules('omniperf_profile'))) return diff --git a/src/omniperf_profile/profiler_base.py b/src/omniperf_profile/profiler_base.py index 7e21ab5f63..9fe3d1264e 100644 --- a/src/omniperf_profile/profiler_base.py +++ b/src/omniperf_profile/profiler_base.py @@ -81,7 +81,6 @@ class OmniProfiler_Base(): # Remove old pmc_perf.txt input from perfmon dir os.remove(workload_perfmon_dir + "/pmc_perf.txt") - # joins disparate runs less dumbly than rocprof @demarcate def join_prof(self, out=None): """Manually join separated rocprof runs @@ -150,7 +149,7 @@ class OmniProfiler_Base(): logging.info(msg) # now, we can: - #   A) throw away any of the "boring" duplicats + #   A) throw away any of the "boring" duplicates df = df[ [ k diff --git a/src/omniperf_soc/soc_gfx906.py b/src/omniperf_soc/soc_gfx906.py index 661f742e26..a7fb73090b 100644 --- a/src/omniperf_soc/soc_gfx906.py +++ b/src/omniperf_soc/soc_gfx906.py @@ -27,6 +27,18 @@ import config from omniperf_soc.soc_base import 
OmniSoC_Base from utils.utils import demarcate, error +SOC_PARAM = { + "numSE": 4, + "numCU": 60, + "numSIMD": 240, + "numWavesPerCU": 40, + "numSQC": 15, + "L2Banks": 16, + "LDSBanks": 32, + "Freq": 1725, + "mclk": 1000 +} + class gfx906_soc (OmniSoC_Base): def __init__(self,args): super().__init__(args) @@ -49,19 +61,7 @@ class gfx906_soc (OmniSoC_Base): "TCC_channels": 16, } ) - self.set_soc_param( - { - "numSE": 4, - "numCU": 60, - "numSIMD": 240, - "numWavesPerCU": 40, - "numSQC": 15, - "L2Banks": 16, - "LDSBanks": 32, - "Freq": 1725, - "mclk": 1000 - } - ) + self.set_soc_param(SOC_PARAM) #----------------------- # Required child methods diff --git a/src/omniperf_soc/soc_gfx908.py b/src/omniperf_soc/soc_gfx908.py index dbbc259916..3b194ff429 100644 --- a/src/omniperf_soc/soc_gfx908.py +++ b/src/omniperf_soc/soc_gfx908.py @@ -27,6 +27,18 @@ import config from omniperf_soc.soc_base import OmniSoC_Base from utils.utils import demarcate, error +SOC_PARAM = { + "numSE": 8, + "numCU": 120, + "numSIMD": 480, + "numWavesPerCU": 40, + "numSQC": 30, + "L2Banks": 32, + "LDSBanks": 32, + "Freq": 1502, + "mclk": 1200 +} + class gfx908_soc (OmniSoC_Base): def __init__(self,args): super().__init__(args) @@ -49,19 +61,7 @@ class gfx908_soc (OmniSoC_Base): "TCC_channels": 32, } ) - self.set_soc_param( - { - "numSE": 8, - "numCU": 120, - "numSIMD": 480, - "numWavesPerCU": 40, - "numSQC": 30, - "L2Banks": 32, - "LDSBanks": 32, - "Freq": 1502, - "mclk": 1200 - } - ) + self.set_soc_param(SOC_PARAM) @demarcate def get_profiler_options(self): diff --git a/src/omniperf_soc/soc_gfx90a.py b/src/omniperf_soc/soc_gfx90a.py index 649724bfec..6d731ecc78 100644 --- a/src/omniperf_soc/soc_gfx90a.py +++ b/src/omniperf_soc/soc_gfx90a.py @@ -29,6 +29,18 @@ from utils.utils import demarcate, mibench from roofline import Roofline import logging +SOC_PARAM = { + "numSE": 8, + "numCU": 110, + "numSIMD": 440, + "numWavesPerCU": 32, + "numSQC": 56, + "L2Banks": 32, + "LDSBanks": 32, + "Freq": 1700, + 
"mclk": 1600 +} + class gfx90a_soc (OmniSoC_Base): def __init__(self,args): super().__init__(args) @@ -54,19 +66,7 @@ class gfx90a_soc (OmniSoC_Base): "TCC_channels": 32 } ) - self.set_soc_param( - { - "numSE": 8, - "numCU": 110, - "numSIMD": 440, - "numWavesPerCU": 32, - "numSQC": 56, - "L2Banks": 32, - "LDSBanks": 32, - "Freq": 1700, - "mclk": 1600 - } - ) + self.set_soc_param(SOC_PARAM) self.roofline_obj = Roofline(args) #----------------------- diff --git a/src/omniperf_soc/soc_gfx940.py b/src/omniperf_soc/soc_gfx940.py index 23be108977..bb15e49015 100644 --- a/src/omniperf_soc/soc_gfx940.py +++ b/src/omniperf_soc/soc_gfx940.py @@ -29,6 +29,18 @@ from utils.utils import demarcate, mibench from roofline import Roofline import logging +SOC_PARAM = { + "numSE": 8, + "numCU": 38, + "numSIMD": 4, + "numWavesPerCU": 32, + "numSQC": 56, + "L2Banks": 16, + "LDSBanks": 32, + "Freq": 1950, + "mclk": 1300 +} + class gfx940_soc (OmniSoC_Base): def __init__(self,args): super().__init__(args) @@ -55,19 +67,7 @@ class gfx940_soc (OmniSoC_Base): "TCC_channels": 32 } ) - self.set_soc_param( - { - "numSE": 8, - "numCU": 38, - "numSIMD": 4, - "numWavesPerCU": 32, - "numSQC": 56, - "L2Banks": 16, - "LDSBanks": 32, - "Freq": 1950, - "mclk": 1300 - } - ) + self.set_soc_param(SOC_PARAM) self.roofline_obj = Roofline(args) #----------------------- diff --git a/src/omniperf_soc/soc_gfx941.py b/src/omniperf_soc/soc_gfx941.py index 5dc248531f..6cd3201d69 100644 --- a/src/omniperf_soc/soc_gfx941.py +++ b/src/omniperf_soc/soc_gfx941.py @@ -29,6 +29,18 @@ from utils.utils import demarcate, mibench from roofline import Roofline import logging +SOC_PARAM = { + "numSE": 8, + "numCU": 38, + "numSIMD": 4, + "numWavesPerCU": 32, + "numSQC": 56, + "L2Banks": 16, + "LDSBanks": 32, + "Freq": 1950, + "mclk": 1300 +} + class gfx941_soc (OmniSoC_Base): def __init__(self,args): super().__init__(args) @@ -55,19 +67,7 @@ class gfx941_soc (OmniSoC_Base): "TCC_channels": 32 } ) - self.set_soc_param( - { - 
"numSE": 8, - "numCU": 38, - "numSIMD": 4, - "numWavesPerCU": 32, - "numSQC": 56, - "L2Banks": 16, - "LDSBanks": 32, - "Freq": 1950, - "mclk": 1300 - } - ) + self.set_soc_param(SOC_PARAM) self.roofline_obj = Roofline(args) #----------------------- diff --git a/src/omniperf_soc/soc_gfx942.py b/src/omniperf_soc/soc_gfx942.py index 2c6cfe69d1..87f63fe01e 100644 --- a/src/omniperf_soc/soc_gfx942.py +++ b/src/omniperf_soc/soc_gfx942.py @@ -29,6 +29,18 @@ from utils.utils import demarcate, mibench from roofline import Roofline import logging +SOC_PARAM = { + "numSE": 8, + "numCU": 38, + "numSIMD": 4, + "numWavesPerCU": 32, + "numSQC": 56, + "L2Banks": 16, + "LDSBanks": 32, + "Freq": 1950, + "mclk": 1300 +} + class gfx942_soc (OmniSoC_Base): def __init__(self,args): super().__init__(args) @@ -55,19 +67,7 @@ class gfx942_soc (OmniSoC_Base): "TCC_channels": 32 } ) - self.set_soc_param( - { - "numSE": 8, - "numCU": 38, - "numSIMD": 4, - "numWavesPerCU": 32, - "numSQC": 56, - "L2Banks": 16, - "LDSBanks": 32, - "Freq": 1950, - "mclk": 1300 - } - ) + self.set_soc_param(SOC_PARAM) self.roofline_obj = Roofline(args) #----------------------- diff --git a/src/utils/specs.py b/src/utils/specs.py index e52b9f2ecb..ff4897bee7 100644 --- a/src/utils/specs.py +++ b/src/utils/specs.py @@ -29,10 +29,13 @@ import re import sys import socket import subprocess +import importlib +import logging from dataclasses import dataclass from pathlib import Path as path from textwrap import dedent +from utils.utils import error @dataclass class MachineSpecs: @@ -101,8 +104,26 @@ class MachineSpecs: def gpuinfo(): - # Local var only for rocminfo searching - gpu_list = {"gfx906", "gfx908", "gfx90a", "gfx940", "gfx941", "gfx942"} + from omniperf_base import SUPPORTED_ARCHS + + gpu_info = { + "gpu_name": None, + "gpu_arch": None, + "L1": None, + "L2": None, + "max_sclk": None, + "num_CU": None, + "num_SIMD": None, + "num_SE": None, + "wave_size": None, + "grp_size": None, + "max_waves_per_cu": None, + 
"L2Banks": None, + "LDSBanks": None, + "numSQC": None, + "compute_partition": None, + "memory_partition": None, + } # Fixme: find better way to differentiate cards, GPU vs APU, etc. rocminfo_full = run(["rocminfo"]) @@ -110,135 +131,95 @@ def gpuinfo(): for idx1, linetext in enumerate(rocminfo): gpu_arch = search(r"^\s*Name\s*:\s+ ([a-zA-Z0-9]+)\s*$", linetext) - if gpu_arch in gpu_list: + if gpu_arch in SUPPORTED_ARCHS.keys(): break - if str(gpu_arch) in gpu_list: + if str(gpu_arch) in SUPPORTED_ARCHS.keys(): gpu_arch = str(gpu_arch) break - if not gpu_arch in gpu_list: - return ( - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - ) + if not gpu_arch in SUPPORTED_ARCHS.keys(): + return gpu_info - L1, L2 = "", "" + gpu_info['L1'], gpu_info['L2'] = "", "" for idx2, linetext in enumerate(rocminfo[idx1 + 1 :]): key = search(r"^\s*L1:\s+ ([a-zA-Z0-9]+)\s*", linetext) if key != None: - L1 = key + gpu_info['L1'] = key continue key = search(r"^\s*L2:\s+ ([a-zA-Z0-9]+)\s*", linetext) if key != None: - L2 = key + gpu_info['L2'] = key continue key = search(r"^\s*Max Clock Freq\. 
\(MHz\):\s+([0-9]+)", linetext) if key != None: - max_sclk = key + gpu_info['max_sclk'] = key continue key = search(r"^\s*Compute Unit:\s+ ([a-zA-Z0-9]+)\s*", linetext) if key != None: - num_CU = key + gpu_info['num_CU'] = key continue key = search(r"^\s*SIMDs per CU:\s+ ([a-zA-Z0-9]+)\s*", linetext) if key != None: - num_SIMD = key + gpu_info['num_SIMD'] = key continue key = search(r"^\s*Shader Engines:\s+ ([a-zA-Z0-9]+)\s*", linetext) if key != None: - num_SE = key + gpu_info['num_SE'] = key continue key = search(r"^\s*Wavefront Size:\s+ ([a-zA-Z0-9]+)\s*", linetext) if key != None: - wave_size = key + gpu_info['wave_size'] = key continue key = search(r"^\s*Workgroup Max Size:\s+ ([a-zA-Z0-9]+)\s*", linetext) if key != None: - grp_size = key + gpu_info['grp_size'] = key continue key = search(r"^\s*Max Waves Per CU:\s+ ([a-zA-Z0-9]+)\s*", linetext) if key != None: - max_waves_per_cu = key + gpu_info['max_waves_per_cu'] = key break - gpu_name = "" - L2Banks = "" - LDSBanks = "32" - numSQC = "" - - if gpu_arch == "gfx906": - gpu_name = "MI50" - L2Banks = "16" - numSQC = str(int(num_CU) // 4) - elif gpu_arch == "gfx908": - gpu_name = "MI100" - L2Banks = "32" - numSQC = "48" - elif gpu_arch == "gfx90a": - L2Banks = "32" - gpu_name = "MI200" - numSQC = "56" - elif gpu_arch == "gfx940": - gpu_name = "MI300A_A0" - L2Banks = "16" - numSQC = "56" - elif gpu_arch == "gfx941": - gpu_name = "MI300X_A0" - L2Banks = "16" - numSQC = "56" - elif (gpu_arch == "gfx942") and ("MI300A" in rocminfo_full): + try: + soc_module = importlib.import_module('omniperf_soc.soc_'+gpu_arch) + except ModuleNotFoundError as e: + error("Arch %s marked as supported, but couldn't find class implementation %s." 
% (gpu_arch, e)) + + # load arch specific info + try: + gpu_name = list(SUPPORTED_ARCHS[gpu_arch].keys())[0].upper() + gpu_info['L2Banks'] = str(soc_module.SOC_PARAM['L2Banks']) + gpu_info['numSQC'] = str(soc_module.SOC_PARAM['numSQC']) + gpu_info['LDSBanks'] = str(soc_module.SOC_PARAM['LDSBanks']) + except KeyError as e: + error("Incomplete class definition for %s. Expected a field for %s in SOC_PARAM." % (gpu_arch, e)) + + # resolve the concrete gpu name (a string, not the model list) for MI300 archs + if gpu_name == "MI300": + gpu_name = list(SUPPORTED_ARCHS[gpu_arch].values())[0][0] + if (gpu_arch == "gfx942") and ("MI300A" in rocminfo_full): gpu_name = "MI300A_A1" - L2Banks = "16" - numSQC = "56" - elif (gpu_arch == "gfx942") and ("MI300A" not in rocminfo_full): + if (gpu_arch == "gfx942") and ("MI300A" not in rocminfo_full): gpu_name = "MI300X_A1" - L2Banks = "16" - numSQC = "56" - else: - print("\nInvalid SoC") - sys.exit(0) + - compute_partition = "" - memory_partition = "" - return ( - gpu_name, - gpu_arch, - L1, - L2, - max_sclk, - num_CU, - num_SIMD, - num_SE, - wave_size, - grp_size, - max_waves_per_cu, - L2Banks, - LDSBanks, - numSQC, - compute_partition, - memory_partition, - ) + gpu_info['gpu_name'] = gpu_name + gpu_info['gpu_arch'] = gpu_arch + gpu_info['compute_partition'] = "" + gpu_info['memory_partition'] = "" + + # verify all fields are filled + for key, value in gpu_info.items(): + if value is None: + logging.info("Warning: %s is missing from gpu_info dictionary." 
% key) + + return gpu_info def run(cmd): @@ -300,24 +281,7 @@ def get_machine_specs(devicenum): print("ensure you have valid ROCm installation.") sys.exit(1) - ( - gpu_name, - gpu_arch, - L1, - L2, - max_sclk, - num_CU, - num_SIMD, - num_SE, - wave_size, - grp_size, - max_waves_per_cu, - L2Banks, - LDSBanks, - numSQC, - compute_partition, - memory_partition, - ) = gpuinfo() + gpu_info = gpuinfo() rocm_smi = run(["rocm-smi"]) @@ -372,23 +336,23 @@ def get_machine_specs(devicenum): ram, distro, rocm_version, - gpu_name, - gpu_arch, + gpu_info['gpu_name'], + gpu_info['gpu_arch'], vbios, - L1, - L2, - num_CU, - num_SIMD, - num_SE, - wave_size, - grp_size, - max_sclk, + gpu_info['L1'], + gpu_info['L2'], + gpu_info['num_CU'], + gpu_info['num_SIMD'], + gpu_info['num_SE'], + gpu_info['wave_size'], + gpu_info['grp_size'], + gpu_info['max_sclk'], cur_sclk, cur_mclk, - max_waves_per_cu, - L2Banks, - LDSBanks, - numSQC, + gpu_info['max_waves_per_cu'], + gpu_info['L2Banks'], + gpu_info['LDSBanks'], + gpu_info['numSQC'], hbmBW, compute_partition, memory_partition, diff --git a/src/utils/utils.py b/src/utils/utils.py index 403a13acdb..64004b46dc 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -556,4 +556,22 @@ def get_hbm_stack_num(gpu_name, memory_partition): else: # Fixme: add proper numbers for other archs return -1 + +def get_submodules(package_name): + """List all submodules for a target package + """ + import importlib + import pkgutil + + submodules = [] + + # walk all submodules in target package + package = importlib.import_module(package_name) + for _, name, _ in pkgutil.walk_packages(package.__path__): + pretty_name = name.split("_", 1)[1].replace("_", "") + # ignore base submodule, add all other + if pretty_name != "base": + submodules.append(pretty_name) + + return submodules \ No newline at end of file