##############################################################################bl # MIT License # # Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. ##############################################################################el import sys, os, pathlib, shutil, subprocess, argparse, glob, re import numpy as np import math import warnings import pandas as pd prog = "omniperf" # Per IP block max number of simulutaneous counters # GFX IP Blocks perfmon_config = { "vega10": { "SQ": 8, "TA": 2, "TD": 2, "TCP": 4, "TCC": 4, "CPC": 2, "CPF": 2, "SPI": 2, "GRBM": 2, "GDS": 4, "TCC_channels": 16, }, "mi50": { "SQ": 8, "TA": 2, "TD": 2, "TCP": 4, "TCC": 4, "CPC": 2, "CPF": 2, "SPI": 2, "GRBM": 2, "GDS": 4, "TCC_channels": 16, }, "mi100": { "SQ": 8, "TA": 2, "TD": 2, "TCP": 4, "TCC": 4, "CPC": 2, "CPF": 2, "SPI": 2, "GRBM": 2, "GDS": 4, "TCC_channels": 32, }, "mi200": { "SQ": 8, "TA": 2, "TD": 2, "TCP": 4, "TCC": 4, "CPC": 2, "CPF": 2, "SPI": 2, "GRBM": 2, "GDS": 4, "TCC_channels": 32, }, } def test_df_column_equality(df): return df.eq(df.iloc[:, 0], axis=0).all(1).all() # joins disparate runs less dumbly than rocprof def join_prof(workload_dir, join_type, log_file, verbose, out=None): # Set default output directory if not specified if out == None: out = workload_dir + "/pmc_perf.csv" files = glob.glob(workload_dir + "/" + "pmc_perf_*.csv") df = None for i, file in enumerate(files): _df = pd.read_csv(file) if join_type == "kernel": key = _df.groupby("KernelName").cumcount() elif join_type == "grid": key = _df.groupby(["KernelName", "grd"]).cumcount() else: print("ERROR: Unrecognized --join-type") sys.exit(1) _df["key"] = _df.KernelName + " - " + key.astype(str) if df is None: df = _df else: # join by unique index of kernel df = pd.merge(df, _df, how="inner", on="key", suffixes=("", f"_{i}")) # TODO: check for any mismatch in joins duplicate_cols = { "gpu": [col for col in df.columns if "gpu" in col], "grd": [col for col in df.columns if "grd" in col], "wpr": [col for col in df.columns if "wgr" in col], "lds": [col for col in df.columns if "lds" in col], "scr": [col for col in df.columns if "scr" in col], "arch_vgpr": [col for col in df.columns if "arch_vgpr" in col], "accum_vgpr": [col for col in df.columns if "accum_vgpr" in col], "spgr": [col for col in df.columns if "sgpr" in col], } for key, cols in duplicate_cols.items(): _df = df[cols] if not test_df_column_equality(_df): msg = ( "WARNING: Detected differing {} values while joining pmc_perf.csv".format( key ) ) warnings.warn(msg) log_file.write(msg + "\n") if test_df_column_equality(_df) and verbose: msg = "Successfully joined {} in pmc_perf.csv".format(key) print(msg) log_file.write(msg + "\n") # now, we can: #   A) throw away any of the "boring" duplicats df = df[ [ k for k in df.keys() if not any( check in k for check in [ # removed merged counters, keep original "gpu-id_", "grd_", "wgr_", "lds_", "scr_", "vgpr_", "sgpr_", "Index_", # un-mergable, remove all "queue-id", "queue-index", "pid", "tid", "fbar", "sig", "obj", ] ) ] ] #   B) any timestamps that are _not_ the duration, which is the one we care #   about df = df[ [ k for k in df.keys() if not any(check in k for check in ["DispatchNs", "CompleteNs"]) ] ] #   C) sanity check the name and key namekeys = [k for k in df.keys() if "KernelName" in k] assert len(namekeys) for k in namekeys[1:]: assert (df[namekeys[0]] == df[k]).all() df = df.drop(columns=namekeys[1:]) # now take the median of the durations bkeys = [] ekeys = [] for k in df.keys(): if "Begin" in k: bkeys.append(k) if "End" in k: ekeys.append(k) # compute mean begin and end timestamps endNs = df[ekeys].mean(axis=1) beginNs = df[bkeys].mean(axis=1) # and replace df = df.drop(columns=bkeys) df = df.drop(columns=ekeys) df["BeginNs"] = beginNs df["EndNs"] = endNs # finally, join the drop key df = df.drop(columns=["key"]) # and save to file df.to_csv(out, index=False) # and delete old file(s) if not verbose: for file in files: os.remove(file) def pmc_perf_split(workload_dir): workload_perfmon_dir = workload_dir + "/perfmon" lines = open(workload_perfmon_dir + "/pmc_perf.txt", "r").read().splitlines() # Iterate over each line in pmc_perf.txt mpattern = r"^pmc:(.*)" i = 0 for line in lines: # Verify no comments stext = line.split("#")[0].strip() if not stext: continue # all pmc counters start with "pmc:" m = re.match(mpattern, stext) if m is None: continue # Create separate file for each line fd = open(workload_perfmon_dir + "/pmc_perf_" + str(i) + ".txt", "w") fd.write(stext + "\n\n") fd.write("gpu:\n") fd.write("range:\n") fd.write("kernel:\n") fd.close() i += 1 # Remove old pmc_perf.txt input from perfmon dir os.remove(workload_perfmon_dir + "/pmc_perf.txt") def perfmon_coalesce(pmc_files_list, workload_dir, soc): workload_perfmon_dir = workload_dir + "/perfmon" # match pattern for pmc counters mpattern = r"^pmc:(.*)" pmc_list = dict( [ ("SQ", []), ("GRBM", []), ("TCP", []), ("TA", []), ("TD", []), ("TCC", []), ("SPI", []), ("CPC", []), ("CPF", []), ("GDS", []), ("TCC2", {}), # per-channel TCC perfmon ] ) for ch in range(perfmon_config[soc]["TCC_channels"]): pmc_list["TCC2"][str(ch)] = [] # Extract all PMC counters and store in separate buckets for fname in pmc_files_list: lines = open(fname, "r").read().splitlines() for line in lines: # Strip all comements, skip empty lines stext = line.split("#")[0].strip() if not stext: continue # all pmc counters start with "pmc:" m = re.match(mpattern, stext) if m is None: continue # we have found all the counters, store them in buckets counters = m.group(1).split() if "SQ_ACCUM_PREV_HIRES" in counters: # save all level counters separately nindex = counters.index("SQ_ACCUM_PREV_HIRES") level_counter = counters[nindex - 1] # Save to level counter file, file name = level counter name fd = open(workload_perfmon_dir + "/" + level_counter + ".txt", "w") fd.write(stext + "\n\n") fd.write("gpu:\n") fd.write("range:\n") fd.write("kernel:\n") fd.close() continue # save normal pmc counters in matching buckets for counter in counters: IP_block = counter.split(sep="_")[0].upper() # SQC and SQ belong to the IP block, coalesce them if IP_block == "SQC": IP_block = "SQ" if IP_block != "TCC": # Insert unique pmc counters into its bucket if counter not in pmc_list[IP_block]: pmc_list[IP_block].append(counter) else: # TCC counters processing m = re.match(r"[\s\S]+\[(\d+)\]", counter) if m is None: # Aggregated TCC counters if counter not in pmc_list[IP_block]: pmc_list[IP_block].append(counter) else: # TCC channel ID ch = m.group(1) # fake IP block for per channel TCC if str(ch) in pmc_list["TCC2"]: # append unique counter into the channel if counter not in pmc_list["TCC2"][str(ch)]: pmc_list["TCC2"][str(ch)].append(counter) else: # initial counter in this channel pmc_list["TCC2"][str(ch)] = [counter] # sort the per channel counter, so that same counter in all channels can be aligned for ch in range(perfmon_config[soc]["TCC_channels"]): pmc_list["TCC2"][str(ch)].sort() return pmc_list def perfmon_emit(pmc_list, workload_dir, soc): workload_perfmon_dir = workload_dir + "/perfmon" # Calculate the minimum number of iteration to save the pmc counters # non-TCC counters pmc_cnt = [ len(pmc_list[key]) / perfmon_config[soc][key] for key in pmc_list if key not in ["TCC", "TCC2"] ] # TCC counters tcc_channels = perfmon_config[soc]["TCC_channels"] tcc_cnt = len(pmc_list["TCC"]) / perfmon_config[soc]["TCC"] tcc2_cnt = ( np.array([len(pmc_list["TCC2"][str(ch)]) for ch in range(tcc_channels)]) / perfmon_config[soc]["TCC"] ) # Total number iterations to write pmc: counters line niter = max(math.ceil(max(pmc_cnt)), math.ceil(tcc_cnt) + math.ceil(max(tcc2_cnt))) # Emit PMC counters into pmc config file fd = open(workload_perfmon_dir + "/pmc_perf.txt", "w") tcc2_index = 0 for iter in range(niter): # Prefix line = "pmc: " # Add all non-TCC counters for key in pmc_list: if key not in ["TCC", "TCC2"]: N = perfmon_config[soc][key] ip_counters = pmc_list[key][iter * N : iter * N + N] if ip_counters: line = line + " " + " ".join(ip_counters) # Add TCC counters N = perfmon_config[soc]["TCC"] tcc_counters = pmc_list["TCC"][iter * N : iter * N + N] if not tcc_counters: # TCC per-channel counters for ch in range(perfmon_config[soc]["TCC_channels"]): tcc_counters += pmc_list["TCC2"][str(ch)][ tcc2_index * N : tcc2_index * N + N ] tcc2_index += 1 # TCC aggregated counters line = line + " " + " ".join(tcc_counters) fd.write(line + "\n") fd.write("\ngpu:\n") fd.write("range:\n") fd.write("kernel:\n") fd.close() def perfmon_filter(workload_dir, perfmon_dir, args): workload_perfmon_dir = workload_dir + "/perfmon" soc = args.target # Initialize directories # TODO: Modify this so that data is appended to previous? if not os.path.isdir(workload_dir): os.makedirs(workload_dir) else: shutil.rmtree(workload_dir) os.makedirs(workload_perfmon_dir) ref_pmc_files_list = glob.glob(perfmon_dir + "/" + "pmc_*perf*.txt") ref_pmc_files_list += glob.glob(perfmon_dir + "/" + soc + "/pmc_*_perf*.txt") # Perfmon list filtering if args.ipblocks != None: for i in range(len(args.ipblocks)): args.ipblocks[i] = args.ipblocks[i].lower() mpattern = "pmc_([a-zA-Z0-9_]+)_perf*" pmc_files_list = [] for fname in ref_pmc_files_list: fbase = os.path.splitext(os.path.basename(fname))[0] ip = re.match(mpattern, fbase).group(1) if ip in args.ipblocks: pmc_files_list.append(fname) print("fname: " + fbase + ": Added") else: print("fname: " + fbase + ": Skipped") else: # default: take all perfmons pmc_files_list = ref_pmc_files_list # Coalesce and writeback workload specific perfmon pmc_list = perfmon_coalesce(pmc_files_list, workload_dir, soc) perfmon_emit(pmc_list, workload_dir, soc) def pmc_filter(workload_dir, perfmon_dir, soc): workload_perfmon_dir = workload_dir + "/perfmon" if not os.path.isdir(workload_perfmon_dir): os.makedirs(workload_perfmon_dir) else: shutil.rmtree(workload_perfmon_dir) ref_pmc_files_list = glob.glob(perfmon_dir + "/roofline/" + "pmc_roof_perf.txt") # ref_pmc_files_list += glob.glob(perfmon_dir + "/" + soc + "/pmc_*_perf*.txt") pmc_files_list = ref_pmc_files_list # Coalesce and writeback workload specific perfmon pmc_list = perfmon_coalesce(pmc_files_list, workload_dir, soc) perfmon_emit(pmc_list, workload_dir, soc)