##############################################################################bl
# MIT License
#
# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
##############################################################################el

import sys, os, pathlib, shutil, subprocess, argparse, glob, re
import numpy as np
import math
import warnings
import pandas as pd

prog = "omniperf"

# Maximum number of counters that each GFX IP block can collect
# simultaneously, keyed by SoC name. "TCC_channels" is the per-SoC TCC channel
# count and is the only entry that differs between the SoCs listed here.
perfmon_config = {
    soc_name: {
        "SQ": 8,
        "TA": 2,
        "TD": 2,
        "TCP": 4,
        "TCC": 4,
        "CPC": 2,
        "CPF": 2,
        "SPI": 2,
        "GRBM": 2,
        "GDS": 4,
        "TCC_channels": tcc_channel_count,
    }
    for soc_name, tcc_channel_count in (
        ("vega10", 16),
        ("mi50", 16),
        ("mi100", 32),
        ("mi200", 32),
    )
}


def test_df_column_equality(df):
    """Return whether every column of ``df`` is equal to its first column.

    The comparison is element-wise and row-aligned. A frame without any
    columns has nothing that can differ and is reported as equal.

    Args:
        df: pandas DataFrame whose columns are compared against column 0.

    Returns:
        A truthy value when all columns match the first one, falsy otherwise.
    """
    # df.iloc[:, 0] raises IndexError when there are no columns at all.
    if df.shape[1] == 0:
        return True
    return df.eq(df.iloc[:, 0], axis=0).all(axis=1).all()


# Despite the "test_" prefix this is a helper, not a test case: keep pytest
# from collecting it when the module is star-imported into a test file.
test_df_column_equality.__test__ = False


# joins disparate runs less dumbly than rocprof
def join_prof(workload_dir, join_type, log_file, verbose, out=None):
    """Merge every ``pmc_perf_*.csv`` in ``workload_dir`` into a single CSV.

    Rows of the individual runs are matched on a synthetic key made of the
    kernel name and the row's occurrence index within its group, then
    inner-joined. Bookkeeping columns duplicated by the join are dropped and
    the Begin/End timestamp columns are replaced by their per-row mean.

    Args:
        workload_dir: Directory that holds the per-run ``pmc_perf_*.csv`` files.
        join_type: ``"kernel"`` to number rows per ``KernelName``, or ``"grid"``
            to number them per ``(KernelName, grd)`` pair.
        log_file: Object with a ``write`` method that receives the join
            messages.
        verbose: When true, print a message for every consistent column group
            and keep the per-run CSV files; otherwise the per-run files are
            deleted once the merged file has been written.
        out: Path of the merged CSV; defaults to
            ``<workload_dir>/pmc_perf.csv``.

    Prints an error and exits the process with status 1 when ``join_type`` is
    not recognized or when no input file is found.
    """
    # Validate the join type up front so it is checked even with no inputs.
    if join_type == "kernel":
        group_cols = "KernelName"
    elif join_type == "grid":
        group_cols = ["KernelName", "grd"]
    else:
        print("ERROR: Unrecognized --join-type")
        sys.exit(1)

    # Set default output path if not specified
    if out is None:
        out = workload_dir + "/pmc_perf.csv"

    # glob order is arbitrary; sort so the merged column order is reproducible
    files = sorted(glob.glob(workload_dir + "/" + "pmc_perf_*.csv"))
    if not files:
        print("ERROR: No pmc_perf_*.csv files found in " + workload_dir)
        sys.exit(1)

    df = None
    for i, file in enumerate(files):
        _df = pd.read_csv(file)
        occurrence = _df.groupby(group_cols).cumcount()
        _df["key"] = _df.KernelName + " - " + occurrence.astype(str)
        if df is None:
            df = _df
        else:
            # join by unique index of kernel; columns of later files that
            # clash with existing ones get a "_<i>" suffix
            df = pd.merge(df, _df, how="inner", on="key", suffixes=("", f"_{i}"))

    # TODO: check for any mismatch in joins
    # Columns that should carry identical values in every run, grouped by the
    # substring that identifies them.
    duplicate_cols = {
        label: [col for col in df.columns if label in col]
        for label in (
            "gpu",
            "grd",
            "wgr",
            "lds",
            "scr",
            "arch_vgpr",
            "accum_vgpr",
            "sgpr",
        )
    }
    for label, cols in duplicate_cols.items():
        if not cols:
            # this group of columns is absent from the inputs
            continue
        if not test_df_column_equality(df[cols]):
            msg = (
                "WARNING: Detected differing {} values while joining pmc_perf.csv".format(
                    label
                )
            )
            warnings.warn(msg)
            log_file.write(msg + "\n")
        elif verbose:
            msg = "Successfully joined {} in pmc_perf.csv".format(label)
            print(msg)
            log_file.write(msg + "\n")

    # now, we can throw away (matching is by substring of the column name):
    unwanted = [
        # merged duplicates, keep the original (unsuffixed) column
        "gpu-id_",
        "grd_",
        "wgr_",
        "lds_",
        "scr_",
        "vgpr_",
        "sgpr_",
        "Index_",
        # un-mergable, remove all
        "queue-id",
        "queue-index",
        "pid",
        "tid",
        "fbar",
        "sig",
        "obj",
        # timestamps other than Begin/End, which are the ones we care about
        "DispatchNs",
        "CompleteNs",
    ]
    df = df[[k for k in df.keys() if not any(check in k for check in unwanted)]]

    # sanity check the name and key: every run must agree on the kernel name
    namekeys = [k for k in df.keys() if "KernelName" in k]
    assert len(namekeys), "no KernelName column found while joining"
    for k in namekeys[1:]:
        assert (df[namekeys[0]] == df[k]).all(), "KernelName mismatch in " + k
    df = df.drop(columns=namekeys[1:])

    # replace the per-run Begin/End timestamps by their mean across runs
    bkeys = [k for k in df.keys() if "Begin" in k]
    ekeys = [k for k in df.keys() if "End" in k]
    endNs = df[ekeys].mean(axis=1)
    beginNs = df[bkeys].mean(axis=1)
    df = df.drop(columns=bkeys)
    df = df.drop(columns=ekeys)
    df["BeginNs"] = beginNs
    df["EndNs"] = endNs

    # finally, drop the join key
    df = df.drop(columns=["key"])
    # and save to file
    df.to_csv(out, index=False)
    # and delete old file(s)
    if not verbose:
        for file in files:
            os.remove(file)


def pmc_perf_split(workload_dir):
    """Split ``<workload_dir>/perfmon/pmc_perf.txt`` into one file per pmc line.

    Every line that, after removing ``#`` comments and surrounding
    whitespace, starts with ``pmc:`` is written to its own
    ``pmc_perf_<i>.txt`` (numbered from 0 in file order) followed by empty
    ``gpu:``, ``range:`` and ``kernel:`` sections. The combined
    ``pmc_perf.txt`` is removed afterwards.

    Args:
        workload_dir: Workload directory containing the ``perfmon`` folder.
    """
    workload_perfmon_dir = workload_dir + "/perfmon"
    combined_path = workload_perfmon_dir + "/pmc_perf.txt"

    with open(combined_path, "r") as fd:
        lines = fd.read().splitlines()

    i = 0
    for line in lines:
        # Strip comments, skip empty lines
        stext = line.split("#")[0].strip()
        if not stext:
            continue

        # all pmc counter lines start with "pmc:"
        if not stext.startswith("pmc:"):
            continue

        # Create separate file for each line
        with open(workload_perfmon_dir + "/pmc_perf_" + str(i) + ".txt", "w") as fd:
            fd.write(stext + "\n\n")
            fd.write("gpu:\n")
            fd.write("range:\n")
            fd.write("kernel:\n")

        i += 1

    # Remove old pmc_perf.txt input from perfmon dir
    os.remove(combined_path)


def perfmon_coalesce(pmc_files_list, workload_dir, soc):
    """Collect the unique pmc counters of several perfmon files per IP block.

    Every ``pmc:`` line of the given files is parsed. Lines that contain
    ``SQ_ACCUM_PREV_HIRES`` are written unchanged to their own file in
    ``<workload_dir>/perfmon`` (named after the counter preceding
    ``SQ_ACCUM_PREV_HIRES``). All other counters are grouped by the prefix in
    front of the first underscore, with ``SQC`` folded into ``SQ``; TCC
    counters carrying a ``[<channel>]`` index are grouped per channel.

    Args:
        pmc_files_list: Paths of the perfmon input files to read.
        workload_dir: Workload directory; its ``perfmon`` folder must exist
            when a level-counter file has to be written.
        soc: Key into ``perfmon_config`` selecting the TCC channel count.

    Returns:
        A dict mapping each IP block name to its list of unique counters in
        first-seen order, plus ``"TCC2"`` mapping a channel id string to the
        sorted list of that channel's counters.

    Raises:
        KeyError: If a counter belongs to an IP block that has no bucket.
    """
    workload_perfmon_dir = workload_dir + "/perfmon"

    # match patterns for pmc lines and for per-channel TCC counters
    pmc_line = re.compile(r"^pmc:(.*)")
    tcc_channel = re.compile(r"[\s\S]+\[(\d+)\]")

    pmc_list = {
        "SQ": [],
        "GRBM": [],
        "TCP": [],
        "TA": [],
        "TD": [],
        "TCC": [],
        "SPI": [],
        "CPC": [],
        "CPF": [],
        "GDS": [],
        # per-channel TCC perfmon, one bucket per configured channel
        "TCC2": {
            str(ch): [] for ch in range(perfmon_config[soc]["TCC_channels"])
        },
    }

    # Extract all PMC counters and store in separate buckets
    for fname in pmc_files_list:
        with open(fname, "r") as fd:
            lines = fd.read().splitlines()

        for line in lines:
            # Strip all comments, skip empty lines
            stext = line.split("#")[0].strip()
            if not stext:
                continue

            # all pmc counters start with "pmc:"
            line_match = pmc_line.match(stext)
            if line_match is None:
                continue

            counters = line_match.group(1).split()
            if "SQ_ACCUM_PREV_HIRES" in counters:
                # Level counters are kept apart: the whole line goes to its
                # own file, named after the counter listed right before
                # SQ_ACCUM_PREV_HIRES.
                # NOTE(review): if SQ_ACCUM_PREV_HIRES is listed first, the
                # index wraps around to the last counter - confirm intended.
                nindex = counters.index("SQ_ACCUM_PREV_HIRES")
                level_counter = counters[nindex - 1]

                with open(
                    workload_perfmon_dir + "/" + level_counter + ".txt", "w"
                ) as fd:
                    fd.write(stext + "\n\n")
                    fd.write("gpu:\n")
                    fd.write("range:\n")
                    fd.write("kernel:\n")

                continue

            # save normal pmc counters in matching buckets
            for counter in counters:
                IP_block = counter.split(sep="_")[0].upper()
                # SQC and SQ belong to the same IP block, coalesce them
                if IP_block == "SQC":
                    IP_block = "SQ"

                if IP_block == "TCC":
                    channel_match = tcc_channel.match(counter)
                    if channel_match is None:
                        # Aggregated TCC counter
                        bucket = pmc_list["TCC"]
                    else:
                        # Per-channel TCC counter, stored under the fake
                        # "TCC2" block keyed by channel id
                        bucket = pmc_list["TCC2"].setdefault(
                            channel_match.group(1), []
                        )
                else:
                    bucket = pmc_list[IP_block]

                # Insert only unique counters into the bucket
                if counter not in bucket:
                    bucket.append(counter)

    # sort the per channel counters, so that the same counter in all channels
    # can be aligned
    for bucket in pmc_list["TCC2"].values():
        bucket.sort()

    return pmc_list


def perfmon_emit(pmc_list, workload_dir, soc):
    """Write the coalesced counters to ``<workload_dir>/perfmon/pmc_perf.txt``.

    The counters are spread over as few ``pmc:`` lines as the per-block
    limits in ``perfmon_config[soc]`` allow. Each line takes the next slice
    of every non-TCC block; its TCC part holds the next slice of aggregated
    TCC counters or, once those are used up, the next slice of every
    channel's per-channel counters. The file ends with empty ``gpu:``,
    ``range:`` and ``kernel:`` sections.

    Args:
        pmc_list: Buckets as returned by ``perfmon_coalesce``.
        workload_dir: Workload directory; its ``perfmon`` folder must exist.
        soc: Key into ``perfmon_config`` selecting the per-block limits.
    """
    workload_perfmon_dir = workload_dir + "/perfmon"
    soc_config = perfmon_config[soc]
    tcc_channels = soc_config["TCC_channels"]
    non_tcc_keys = [key for key in pmc_list if key not in ("TCC", "TCC2")]

    # Calculate the minimum number of iterations to save the pmc counters
    # non-TCC counters
    pmc_cnt = max(
        (len(pmc_list[key]) / soc_config[key] for key in non_tcc_keys), default=0
    )

    # TCC counters: aggregated ones first, then the per-channel ones; both
    # kinds share the TCC counter limit
    tcc_cnt = len(pmc_list["TCC"]) / soc_config["TCC"]
    tcc2_cnt = (
        max(
            (len(pmc_list["TCC2"][str(ch)]) for ch in range(tcc_channels)),
            default=0,
        )
        / soc_config["TCC"]
    )

    # Total number of "pmc:" lines to write
    niter = max(math.ceil(pmc_cnt), math.ceil(tcc_cnt) + math.ceil(tcc2_cnt))

    # Emit PMC counters into pmc config file
    with open(workload_perfmon_dir + "/pmc_perf.txt", "w") as fd:
        tcc2_index = 0
        for pass_idx in range(niter):
            # Prefix
            line = "pmc: "

            # Add all non-TCC counters
            for key in non_tcc_keys:
                N = soc_config[key]
                ip_counters = pmc_list[key][pass_idx * N : pass_idx * N + N]
                if ip_counters:
                    line = line + " " + " ".join(ip_counters)

            # Add TCC counters: aggregated ones while any remain
            N = soc_config["TCC"]
            tcc_counters = pmc_list["TCC"][pass_idx * N : pass_idx * N + N]

            if not tcc_counters:
                # aggregated counters exhausted: take the next slice of every
                # channel's per-channel counters instead
                for ch in range(tcc_channels):
                    tcc_counters += pmc_list["TCC2"][str(ch)][
                        tcc2_index * N : tcc2_index * N + N
                    ]

                tcc2_index += 1

            line = line + " " + " ".join(tcc_counters)
            fd.write(line + "\n")

        fd.write("\ngpu:\n")
        fd.write("range:\n")
        fd.write("kernel:\n")


def perfmon_filter(workload_dir, perfmon_dir, args):
    """Build the workload's ``perfmon/pmc_perf.txt`` from the reference files.

    The workload directory is recreated from scratch (any existing content is
    removed). Reference files are taken from ``perfmon_dir`` and from its
    ``<args.target>`` subfolder; when ``args.ipblocks`` is set, only files
    whose name carries one of the requested IP blocks are used.

    Args:
        workload_dir: Workload directory to (re)create.
        perfmon_dir: Directory holding the reference ``pmc_*perf*.txt`` files.
        args: Object providing ``target`` (SoC name) and ``ipblocks`` (list of
            IP block names, or ``None`` to use every reference file). The
            entries of ``args.ipblocks`` are lower-cased in place.
    """
    workload_perfmon_dir = workload_dir + "/perfmon"
    soc = args.target

    # Initialize directories: always start from an empty workload directory
    # TODO: Modify this so that data is appended to previous?
    if os.path.isdir(workload_dir):
        shutil.rmtree(workload_dir)
    os.makedirs(workload_perfmon_dir)

    ref_pmc_files_list = glob.glob(perfmon_dir + "/" + "pmc_*perf*.txt")
    ref_pmc_files_list += glob.glob(perfmon_dir + "/" + soc + "/pmc_*_perf*.txt")

    # Perfmon list filtering
    if args.ipblocks is not None:
        for i, ipblock in enumerate(args.ipblocks):
            args.ipblocks[i] = ipblock.lower()
        mpattern = r"pmc_([a-zA-Z0-9_]+)_perf*"

        pmc_files_list = []
        for fname in ref_pmc_files_list:
            fbase = os.path.splitext(os.path.basename(fname))[0]
            m = re.match(mpattern, fbase)
            # The glob above also accepts names without an IP block part
            # (e.g. "pmc_perf"); those cannot match any requested block.
            if m is not None and m.group(1) in args.ipblocks:
                pmc_files_list.append(fname)
                print("fname: " + fbase + ": Added")
            else:
                print("fname: " + fbase + ": Skipped")

    else:
        # default: take all perfmons
        pmc_files_list = ref_pmc_files_list

    # Coalesce and writeback workload specific perfmon
    pmc_list = perfmon_coalesce(pmc_files_list, workload_dir, soc)
    perfmon_emit(pmc_list, workload_dir, soc)


def pmc_filter(workload_dir, perfmon_dir, soc):
    """Build the workload's ``perfmon/pmc_perf.txt`` from the roofline file.

    The workload's ``perfmon`` folder is recreated empty, then the counters
    of ``<perfmon_dir>/roofline/pmc_roof_perf.txt`` (if that file exists) are
    coalesced and emitted for the given SoC.

    Args:
        workload_dir: Workload directory that holds the ``perfmon`` folder.
        perfmon_dir: Directory holding the reference perfmon files.
        soc: Key into ``perfmon_config`` selecting the per-block limits.
    """
    workload_perfmon_dir = workload_dir + "/perfmon"

    # Start from an empty perfmon folder. It has to be recreated after the
    # removal: the coalesce/emit steps below write files into it.
    if os.path.isdir(workload_perfmon_dir):
        shutil.rmtree(workload_perfmon_dir)
    os.makedirs(workload_perfmon_dir)

    # Only the roofline reference file is used here.
    pmc_files_list = glob.glob(perfmon_dir + "/roofline/" + "pmc_roof_perf.txt")

    # Coalesce and writeback workload specific perfmon
    pmc_list = perfmon_coalesce(pmc_files_list, workload_dir, soc)
    perfmon_emit(pmc_list, workload_dir, soc)