# amd-strix-halo-vllm-toolboxes/docs/parse_results.py


import os
import json
import re
from pathlib import Path

# Config
# Both locations are relative paths, so they are resolved against the
# directory the script is launched from.
BENCHMARK_DIR = Path("..") / "benchmarks" / "benchmark_results"
OUTPUT_FILE = Path("results.json")

# Regex to parse model name for quantization and parameters
# Examples:
# "meta-llama/Meta-Llama-3.1-8B-Instruct"
# "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit"
# "RedHatAI/Llama-3.1-8B-Instruct-FP8-block"
# Parameter count in billions, e.g. "8B" or "1.5B". The pattern is applied
# case-insensitively, so the lookahead rejects a "B" that merely starts a
# longer word; otherwise bit-width tags such as "4bit" would be read as a
# 4B parameter count.
PARAMS_REGEX = r"(\d+(?:\.\d+)?)B(?![A-Za-z])"
# Recognized quantization tags. re.search reports the leftmost tag found in
# the name, not the first alternative listed here.
QUANT_REGEX = r"(FP8|AWQ|GPTQ|BF16|4bit|Int4)"

def extract_meta(model_name):
    """Derive parameter count and quantization label from a model name.

    Args:
        model_name: Model identifier, e.g.
            "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit".

    Returns:
        A ``(params_b, quant)`` tuple. ``params_b`` is the first "<number>B"
        token as a float, or ``None`` when the name has none. ``quant`` is the
        upper-cased leftmost quantization tag, "BF16" when no tag is present
        (a default assumption, not something read from the name), or one of
        "GPTQ-4bit" / "AWQ-4bit" / "4-bit" when the leftmost tag is a bare
        4-bit marker.
    """
    # Params
    params_match = re.search(PARAMS_REGEX, model_name, re.IGNORECASE)
    params_b = float(params_match.group(1)) if params_match else None

    # Quant
    quant_match = re.search(QUANT_REGEX, model_name, re.IGNORECASE)
    if quant_match is None:
        return params_b, "BF16"

    quant = quant_match.group(1).upper()
    if quant in ("4BIT", "INT4"):
        # The tag itself was matched case-insensitively, so the method lookup
        # must be case-insensitive too ("...-4bit-gptq" is still GPTQ).
        upper_name = model_name.upper()
        if "GPTQ" in upper_name:
            quant = "GPTQ-4bit"
        elif "AWQ" in upper_name:
            quant = "AWQ-4bit"
        else:
            quant = "4-bit"

    return params_b, quant

def _latency_value(raw, metric):
    """Return the first number reported for *metric* in *raw*, or 0.0.

    A "Mean <metric>" line is preferred; any other mention of the metric is
    only used as a fallback. The number must sit on the same line as the
    label because "." does not match newlines here.
    """
    # Requires at least one digit so a stray "." can never reach float().
    number = r"(\d+(?:\.\d*)?|\.\d+)"
    for label in (f"Mean {metric}", metric):
        found = re.search(re.escape(label) + r".*?" + number, raw)
        if found:
            return float(found.group(1))
    return 0.0

def _runs_from_file(fname, data):
    """Build the run records for one result file (empty list if unrecognized).

    *fname* is expected to look like ``{model_safe}_tp{tp}_throughput.json``
    or ``{model_safe}_tp{tp}_qps{q}_latency.json``; *data* is the decoded JSON
    object from that file.
    """
    # Greedy model group: the *last* "_tp<digits>" is the delimiter, so a
    # model name that itself contains "_tp" is not cut short.
    name_match = re.match(r"(?P<model>.*)_tp(?P<tp>\d+)(?P<suffix>.*)", fname)
    if not name_match:
        return []

    tp = int(name_match.group("tp"))
    suffix = name_match.group("suffix")

    # The filename has "/" replaced by "_"; assume the first "_" was the
    # org/model slash (best-effort heuristic, other underscores are kept).
    model_display = name_match.group("model").replace("_", "/", 1)
    params_b, quant = extract_meta(model_display)

    base_run = {
        "model": model_display,
        "model_clean": model_display,
        "env": f"TP{tp}",
        "gpu_config": "dual" if tp > 1 else "single",
        "quant": quant,
        "params_b": params_b,
        "name_params_b": params_b,
        # Defaults
        "backend": "vLLM",
        "error": False,
    }

    # Classify on the part after "_tp<N>" so words inside the model name
    # cannot be mistaken for the test type.
    if "throughput" in suffix:
        tps = data.get("tokens_per_second", 0)
        run = dict(base_run, test="Throughput", tps_mean=tps)
        # A zero rate alongside an error mention anywhere in the payload is
        # treated as a failed run.
        if tps == 0 and "error" in str(data).lower():
            run["error"] = True
        return [run]

    if "latency" in suffix:
        # raw_output has strings like "Mean TTFT: 12.3 ms", "Mean TPOT: 45.6 ms"
        raw = data.get("raw_output", "")
        if not isinstance(raw, str):
            raw = ""
        qps_match = re.search(r"_qps([\d\.]+)_", suffix)
        qps = qps_match.group(1) if qps_match else "?"

        # One record per metric; the numeric value is carried in "tps_mean"
        # so both share the record layout of a throughput run.
        return [
            dict(base_run, test=f"{metric} @ QPS {qps}",
                 tps_mean=_latency_value(raw, metric))
            for metric in ("TTFT", "TPOT")
        ]

    return []

def parse_logs():
    """Scan BENCHMARK_DIR for ``*.json`` results and return a list of run dicts.

    Each throughput file yields one "Throughput" record and each latency file
    yields a "TTFT @ QPS q" and a "TPOT @ QPS q" record. Files that cannot be
    read or decoded, or whose top-level JSON value is not an object, are
    reported and skipped. Files are processed in sorted name order so the
    output is reproducible. Returns an empty list when the directory does not
    exist.
    """
    if not BENCHMARK_DIR.exists():
        print(f"Error: {BENCHMARK_DIR} does not exist!")
        return []

    print(f"Scanning {BENCHMARK_DIR}...")

    runs = []
    for path in sorted(BENCHMARK_DIR.glob("*.json")):
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Skipping bad JSON: {path.name}")
            continue
        if not isinstance(data, dict):
            print(f"Skipping bad JSON: {path.name}")
            continue

        runs.extend(_runs_from_file(path.name, data))

    return runs

if __name__ == "__main__":
    # Wrap every parsed run under the single top-level "runs" key.
    payload = {"runs": parse_logs()}

    total = len(payload["runs"])
    print(f"Parsed {total} runs.")

    # Serialize with a two-space indent into OUTPUT_FILE.
    with OUTPUT_FILE.open("w") as sink:
        sink.write(json.dumps(payload, indent=2))
    print(f"Written to {OUTPUT_FILE}")