From 711de530f6ff2fdc3d0bfeee70d538d83d1c7cf2 Mon Sep 17 00:00:00 2001 From: Donato Capitella Date: Sat, 20 Dec 2025 11:49:03 +0000 Subject: [PATCH] added ROCm/Triton attention comparison --- .../Qwen_Qwen3-14B-AWQ_tp1_throughput.json | 7 + .../google_gemma-3-12b-it_tp1_throughput.json | 7 + ...-Llama-3.1-8B-Instruct_tp1_throughput.json | 7 + .../openai_gpt-oss-120b_tp1_throughput.json | 7 + .../openai_gpt-oss-20b_tp1_throughput.json | 7 + docs/index.html | 24 +- docs/parse_results.py | 217 +++++++----------- docs/results.json | 79 ++++++- results.json | 3 + 9 files changed, 212 insertions(+), 146 deletions(-) create mode 100644 benchmarks/benchmark_results_rocm_attn/benchmark_results/Qwen_Qwen3-14B-AWQ_tp1_throughput.json create mode 100644 benchmarks/benchmark_results_rocm_attn/benchmark_results/google_gemma-3-12b-it_tp1_throughput.json create mode 100644 benchmarks/benchmark_results_rocm_attn/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json create mode 100644 benchmarks/benchmark_results_rocm_attn/benchmark_results/openai_gpt-oss-120b_tp1_throughput.json create mode 100644 benchmarks/benchmark_results_rocm_attn/benchmark_results/openai_gpt-oss-20b_tp1_throughput.json create mode 100644 results.json diff --git a/benchmarks/benchmark_results_rocm_attn/benchmark_results/Qwen_Qwen3-14B-AWQ_tp1_throughput.json b/benchmarks/benchmark_results_rocm_attn/benchmark_results/Qwen_Qwen3-14B-AWQ_tp1_throughput.json new file mode 100644 index 0000000..b622487 --- /dev/null +++ b/benchmarks/benchmark_results_rocm_attn/benchmark_results/Qwen_Qwen3-14B-AWQ_tp1_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 1237.550695703001, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.16160954108339642, + "tokens_per_second": 118.62544339374007 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm_attn/benchmark_results/google_gemma-3-12b-it_tp1_throughput.json b/benchmarks/benchmark_results_rocm_attn/benchmark_results/google_gemma-3-12b-it_tp1_throughput.json new file mode 100644 index 0000000..0b96338 --- /dev/null +++ b/benchmarks/benchmark_results_rocm_attn/benchmark_results/google_gemma-3-12b-it_tp1_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 540.6128817510034, + "num_requests": 200, + "total_num_tokens": 148857, + "requests_per_second": 0.36995048906754757, + "tokens_per_second": 275.34859975563967 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm_attn/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json b/benchmarks/benchmark_results_rocm_attn/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json new file mode 100644 index 0000000..969237d --- /dev/null +++ b/benchmarks/benchmark_results_rocm_attn/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 455.23138687500614, + "num_requests": 200, + "total_num_tokens": 145877, + "requests_per_second": 0.43933701797875907, + "tokens_per_second": 320.4458308584372 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm_attn/benchmark_results/openai_gpt-oss-120b_tp1_throughput.json b/benchmarks/benchmark_results_rocm_attn/benchmark_results/openai_gpt-oss-120b_tp1_throughput.json new file mode 100644 index 0000000..6918030 --- /dev/null +++ b/benchmarks/benchmark_results_rocm_attn/benchmark_results/openai_gpt-oss-120b_tp1_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 1279.5375675789983, + "num_requests": 200, + "total_num_tokens": 147036, + "requests_per_second": 0.15630646967124087, + "tokens_per_second": 114.91339037290285 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results_rocm_attn/benchmark_results/openai_gpt-oss-20b_tp1_throughput.json b/benchmarks/benchmark_results_rocm_attn/benchmark_results/openai_gpt-oss-20b_tp1_throughput.json new file mode 100644 index 0000000..f16a219 --- /dev/null +++ b/benchmarks/benchmark_results_rocm_attn/benchmark_results/openai_gpt-oss-20b_tp1_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 460.97370730798866, + "num_requests": 200, + "total_num_tokens": 147036, + "requests_per_second": 0.43386422442175154, + "tokens_per_second": 318.9683005103833 +} \ No newline at end of file diff --git a/docs/index.html b/docs/index.html index 250da66..0d9a343 100644 --- a/docs/index.html +++ b/docs/index.html @@ -529,16 +529,16 @@ name: modelName, quant: run.quant, params: run.params_b || run.name_params_b, - tp1: null, - tp2: null + triton: null, + rocm: null }; } const m = testGroups[run.test].models[modelName]; - // Assign TP value - if (run.env === "TP1") m.tp1 = run.tps_mean; - if (run.env === "TP2") m.tp2 = run.tps_mean; + // Assign Backend value + if (run.backend === "Triton") m.triton = run.tps_mean; + if (run.backend === "ROCm") m.rocm = run.tps_mean; }); // Convert map to array for sorting @@ -681,7 +681,8 @@ thead.innerHTML = ` Model - TP1 + Triton Attention + ROCm Attention `; table.appendChild(thead); @@ -698,7 +699,15 @@ // Values // Pass unit from meta const unit = meta ? meta.unit : ""; - const val1 = formatVal(m.tp1, unit); + const val1 = formatVal(m.triton, unit); + + // Special handling for ROCm column where we want 'X' for crashes/missing if Triton has data + let val2; + if ((m.rocm === null || m.rocm === 0) && m.triton > 0) { + val2 = 'X'; + } else { + val2 = formatVal(m.rocm, unit); + } tr.innerHTML = ` @@ -708,6 +717,7 @@ ${val1} + ${val2} `; tbody.appendChild(tr); }); diff --git a/docs/parse_results.py b/docs/parse_results.py index 5a7dc81..83e4cb9 100644 --- a/docs/parse_results.py +++ b/docs/parse_results.py @@ -4,16 +4,17 @@ import json import re from pathlib import Path + + # Config -BENCHMARK_DIR = Path("../benchmarks/benchmark_results") -OUTPUT_FILE = Path("results.json") +SCRIPT_DIR = Path(__file__).parent.resolve() +BENCHMARK_SOURCES = { + "Triton": SCRIPT_DIR.parent / "benchmarks" / "benchmark_results", + "ROCm": SCRIPT_DIR.parent / "benchmarks" / "benchmark_results_rocm_attn" / "benchmark_results" +} +OUTPUT_FILE = SCRIPT_DIR / "results.json" # Regex to parse model name for quantization and parameters -# Examples: -# "meta-llama/Meta-Llama-3.1-8B-In -# struct" -# "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit" -# "RedHatAI/Llama-3.1-8B-Instruct-FP8-block" PARAMS_REGEX = r"(\d+(?:\.\d+)?)B" QUANT_REGEX = r"(FP8|AWQ|GPTQ|BF16|4bit|Int4)" @@ -24,7 +25,8 @@ def extract_meta(model_name): # Quant quant_match = re.search(QUANT_REGEX, model_name, re.IGNORECASE) - quant = quant_match.group(1).upper() if quant_match else "BF16" # Default assumption if no tag? Or unknown. + quant = quant_match.group(1).upper() if quant_match else "BF16" + # Refine quant if 4bit if quant == "4BIT" or quant == "INT4": if "GPTQ" in model_name: quant = "GPTQ-4bit" @@ -36,137 +38,88 @@ def extract_meta(model_name): def parse_logs(): runs = [] - if not BENCHMARK_DIR.exists(): - print(f"Error: {BENCHMARK_DIR} does not exist!") - return [] - - print(f"Scanning {BENCHMARK_DIR}...") - - # Files are flat in the dir: {model_safe}_tp{tp}_{type}.json - # or latency: {model_safe}_tp{tp}_qps{q}_latency.json - - # We need to group by (model, tp) to form cohesive records if we want, - # BUT the webapp expects a list of "runs". - # Looking at the example JSON, each "run" is a single test point (e.g. "pp2048 @ d16384" OR "tg32 @ d16384") - # Actually, looking at the provided valid example: - # "test": "pp512", "tps_mean": 2708.86 ... - - # Our data: - # throughput.json -> tokens_per_second. This is usually "decoding" or a mix? - # vLLM bench throughput usually streams tokens. - # Let's look at what run_vllm_bench.py produces. - # Throughput: --input-len 1024 --output-len 512. - # This is effectively a mixed batch. - # We'll label it "Throughput (1024/512)" or just "Throughput" - - # Latency: qps-based. - - files = list(BENCHMARK_DIR.glob("*.json")) - - for f in files: - fname = f.name - try: - data = json.loads(f.read_text()) - except: - print(f"Skipping bad JSON: {fname}") + for backend_name, bench_dir in BENCHMARK_SOURCES.items(): + if not bench_dir.exists(): + print(f"Warning: {bench_dir} does not exist, skipping.") continue - # Infer metadata from filename - # Format: {model_safe}_tp{tp}_{suffix} - # Suffix can be: "throughput.json" or "qps{q}_latency.json" + print(f"Scanning {bench_dir} for {backend_name} results...") + files = list(bench_dir.glob("*.json")) - # We need model name. The script replaces / with _ in filenames. - # But we verify against the known models list? Or just parse string. - # We can reconstruct roughly. - - # Split by "_tp" which is a strong delimiter - parts = fname.split("_tp") - if len(parts) < 2: continue - - model_part = parts[0] - rest = parts[1] # "1_throughput.json" or "2_qps1.0_latency.json" - - # TP - tp_match = re.match(r"^(\d+)", rest) - if not tp_match: continue - tp = int(tp_match.group(1)) - - # Env mapping - env = f"TP{tp}" - - # Model Name Restoration (best effort or matching) - # In the script: model.replace("/", "_") - # We can reverse this if we have the list, but for now let's just use the clean string? - # The webapp uses "model_clean" and "model". - # Let's assume standard "org_model" format -> "org/model" - if "_" in model_part: - # Heuristic: First _ is likely the slash - model_display = model_part.replace("_", "/", 1) - else: - model_display = model_part - - params_b, quant = extract_meta(model_display) - - base_run = { - "model": model_display, - "model_clean": model_display, - "env": env, - "gpu_config": "dual" if tp > 1 else "single", - "quant": quant, - "params_b": params_b, - "name_params_b": params_b, - # Defaults - "backend": "vLLM", - "error": False - } + for f in files: + fname = f.name + try: + data = json.loads(f.read_text()) + except: + print(f"Skipping bad JSON: {fname}") + continue - if "throughput" in fname: - # Throughput run - # data has "tokens_per_second" - tps = data.get("tokens_per_second", 0) + # Filename parsing + parts = fname.split("_tp") + if len(parts) < 2: continue - run = base_run.copy() - run["test"] = "Throughput" - run["tps_mean"] = tps - # If tps is 0 or missing, it might be an error? - if tps == 0 and "error" in str(data).lower(): - run["error"] = True + model_part = parts[0] + rest = parts[1] # "1_throughput.json" - runs.append(run) + # TP + tp_match = re.match(r"^(\d+)", rest) + if not tp_match: continue + tp = int(tp_match.group(1)) + + # Model Name + if "_" in model_part: + model_display = model_part.replace("_", "/", 1) + else: + model_display = model_part + + params_b, quant = extract_meta(model_display) + + base_run = { + "model": model_display, + "model_clean": model_display, + "env": f"TP{tp}", + "gpu_config": "dual" if tp > 1 else "single", + "quant": quant, + "params_b": params_b, + "name_params_b": params_b, + "backend": backend_name, # "Triton" or "ROCm" + "error": False + } - elif "latency" in fname: - # Latency run - # raw_output has strings like "Mean TTFT: 12.3 ms", "Mean TPOT: 45.6 ms" - raw = data.get("raw_output", "") - qps_match = re.search(r"_qps([\d\.]+)_", fname) - qps = qps_match.group(1) if qps_match else "?" - - # Extract metrics - ttft = 0.0 - tpot = 0.0 - - ttft_m = re.search(r"(?:Mean TTFT|TTFT).*?([\d\.]+)", raw) - if ttft_m: ttft = float(ttft_m.group(1)) - - tpot_m = re.search(r"(?:Mean TPOT|TPOT).*?([\d\.]+)", raw) - if tpot_m: tpot = float(tpot_m.group(1)) - - # We create TWO entries? Or how does the webapp handle multiple metrics? - # Example webapp table columns are "Backends" showing ONE value. - # But grouping is by "Test". - # So we can have a test called "TTFT (QPS 1.0)" and "TPOT (QPS 1.0)" - - # Entry 1: TTFT - r1 = base_run.copy() - r1["test"] = f"TTFT @ QPS {qps}" - r1["tps_mean"] = ttft # Using tps_mean field for the numeric value - runs.append(r1) - - # Entry 2: TPOT - r2 = base_run.copy() - r2["test"] = f"TPOT @ QPS {qps}" - r2["tps_mean"] = tpot - runs.append(r2) + if "throughput" in fname: + tps = data.get("tokens_per_second", 0) + run = base_run.copy() + run["test"] = "Throughput" + run["tps_mean"] = tps + if tps == 0 or (isinstance(data, dict) and "error" in str(data).lower()): # checking if error string is in json dump + run["error"] = True + runs.append(run) + + elif "latency" in fname: + raw = data.get("raw_output", "") + qps_match = re.search(r"_qps([\d\.]+)_", fname) + qps = qps_match.group(1) if qps_match else "?" + + ttft = 0.0 + tpot = 0.0 + + ttft_m = re.search(r"(?:Mean TTFT|TTFT).*?([\d\.]+)", raw) + if ttft_m: ttft = float(ttft_m.group(1)) + + tpot_m = re.search(r"(?:Mean TPOT|TPOT).*?([\d\.]+)", raw) + if tpot_m: tpot = float(tpot_m.group(1)) + + # TTFT + r1 = base_run.copy() + r1["test"] = f"TTFT @ QPS {qps}" + r1["tps_mean"] = ttft + runs.append(r1) + + # TPOT + r2 = base_run.copy() + r2["test"] = f"TPOT @ QPS {qps}" + r2["tps_mean"] = tpot + runs.append(r2) return runs diff --git a/docs/results.json b/docs/results.json index 8d17292..f106554 100644 --- a/docs/results.json +++ b/docs/results.json @@ -8,7 +8,7 @@ "quant": "AWQ", "params_b": 14.0, "name_params_b": 14.0, - "backend": "vLLM", + "backend": "Triton", "error": false, "test": "Throughput", "tps_mean": 112.69232830266365 @@ -21,7 +21,7 @@ "quant": "BF16", "params_b": 8.0, "name_params_b": 8.0, - "backend": "vLLM", + "backend": "Triton", "error": false, "test": "Throughput", "tps_mean": 278.99494393048457 @@ -34,7 +34,7 @@ "quant": "BF16", "params_b": 12.0, "name_params_b": 12.0, - "backend": "vLLM", + "backend": "Triton", "error": false, "test": "Throughput", "tps_mean": 162.71078485804028 @@ -47,7 +47,7 @@ "quant": "GPTQ", "params_b": 80.0, "name_params_b": 80.0, - "backend": "vLLM", + "backend": "Triton", "error": false, "test": "Throughput", "tps_mean": 112.62418795067208 @@ -60,7 +60,7 @@ "quant": "BF16", "params_b": 20.0, "name_params_b": 20.0, - "backend": "vLLM", + "backend": "Triton", "error": false, "test": "Throughput", "tps_mean": 313.85817605876395 @@ -73,7 +73,7 @@ "quant": "GPTQ", "params_b": 30.0, "name_params_b": 30.0, - "backend": "vLLM", + "backend": "Triton", "error": false, "test": "Throughput", "tps_mean": 271.7264154071495 @@ -86,10 +86,75 @@ "quant": "BF16", "params_b": 120.0, "name_params_b": 120.0, - "backend": "vLLM", + "backend": "Triton", "error": false, "test": "Throughput", "tps_mean": 109.73523843987172 + }, + { + "model": "Qwen/Qwen3-14B-AWQ", + "model_clean": "Qwen/Qwen3-14B-AWQ", + "env": "TP1", + "gpu_config": "single", + "quant": "AWQ", + "params_b": 14.0, + "name_params_b": 14.0, + "backend": "ROCm", + "error": false, + "test": "Throughput", + "tps_mean": 118.62544339374007 + }, + { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_clean": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "env": "TP1", + "gpu_config": "single", + "quant": "BF16", + "params_b": 8.0, + "name_params_b": 8.0, + "backend": "ROCm", + "error": false, + "test": "Throughput", + "tps_mean": 320.4458308584372 + }, + { + "model": "google/gemma-3-12b-it", + "model_clean": "google/gemma-3-12b-it", + "env": "TP1", + "gpu_config": "single", + "quant": "BF16", + "params_b": 12.0, + "name_params_b": 12.0, + "backend": "ROCm", + "error": false, + "test": "Throughput", + "tps_mean": 275.34859975563967 + }, + { + "model": "openai/gpt-oss-20b", + "model_clean": "openai/gpt-oss-20b", + "env": "TP1", + "gpu_config": "single", + "quant": "BF16", + "params_b": 20.0, + "name_params_b": 20.0, + "backend": "ROCm", + "error": false, + "test": "Throughput", + "tps_mean": 318.9683005103833 + }, + { + "model": "openai/gpt-oss-120b", + "model_clean": "openai/gpt-oss-120b", + "env": "TP1", + "gpu_config": "single", + "quant": "BF16", + "params_b": 120.0, + "name_params_b": 120.0, + "backend": "ROCm", + "error": false, + "test": "Throughput", + "tps_mean": 114.91339037290285 } ] } \ No newline at end of file diff --git a/results.json b/results.json new file mode 100644 index 0000000..23e07c0 --- /dev/null +++ b/results.json @@ -0,0 +1,3 @@ +{ + "runs": [] +} \ No newline at end of file