diff --git a/benchmarks/benchmark_results_rocm_attn/benchmark_results/Qwen_Qwen3-14B-AWQ_tp1_throughput.json b/benchmarks/benchmark_results_rocm_attn/benchmark_results/Qwen_Qwen3-14B-AWQ_tp1_throughput.json
new file mode 100644
index 0000000..b622487
--- /dev/null
+++ b/benchmarks/benchmark_results_rocm_attn/benchmark_results/Qwen_Qwen3-14B-AWQ_tp1_throughput.json
@@ -0,0 +1,7 @@
+{
+ "elapsed_time": 1237.550695703001,
+ "num_requests": 200,
+ "total_num_tokens": 146805,
+ "requests_per_second": 0.16160954108339642,
+ "tokens_per_second": 118.62544339374007
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm_attn/benchmark_results/google_gemma-3-12b-it_tp1_throughput.json b/benchmarks/benchmark_results_rocm_attn/benchmark_results/google_gemma-3-12b-it_tp1_throughput.json
new file mode 100644
index 0000000..0b96338
--- /dev/null
+++ b/benchmarks/benchmark_results_rocm_attn/benchmark_results/google_gemma-3-12b-it_tp1_throughput.json
@@ -0,0 +1,7 @@
+{
+ "elapsed_time": 540.6128817510034,
+ "num_requests": 200,
+ "total_num_tokens": 148857,
+ "requests_per_second": 0.36995048906754757,
+ "tokens_per_second": 275.34859975563967
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm_attn/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json b/benchmarks/benchmark_results_rocm_attn/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json
new file mode 100644
index 0000000..969237d
--- /dev/null
+++ b/benchmarks/benchmark_results_rocm_attn/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json
@@ -0,0 +1,7 @@
+{
+ "elapsed_time": 455.23138687500614,
+ "num_requests": 200,
+ "total_num_tokens": 145877,
+ "requests_per_second": 0.43933701797875907,
+ "tokens_per_second": 320.4458308584372
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm_attn/benchmark_results/openai_gpt-oss-120b_tp1_throughput.json b/benchmarks/benchmark_results_rocm_attn/benchmark_results/openai_gpt-oss-120b_tp1_throughput.json
new file mode 100644
index 0000000..6918030
--- /dev/null
+++ b/benchmarks/benchmark_results_rocm_attn/benchmark_results/openai_gpt-oss-120b_tp1_throughput.json
@@ -0,0 +1,7 @@
+{
+ "elapsed_time": 1279.5375675789983,
+ "num_requests": 200,
+ "total_num_tokens": 147036,
+ "requests_per_second": 0.15630646967124087,
+ "tokens_per_second": 114.91339037290285
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark_results_rocm_attn/benchmark_results/openai_gpt-oss-20b_tp1_throughput.json b/benchmarks/benchmark_results_rocm_attn/benchmark_results/openai_gpt-oss-20b_tp1_throughput.json
new file mode 100644
index 0000000..f16a219
--- /dev/null
+++ b/benchmarks/benchmark_results_rocm_attn/benchmark_results/openai_gpt-oss-20b_tp1_throughput.json
@@ -0,0 +1,7 @@
+{
+ "elapsed_time": 460.97370730798866,
+ "num_requests": 200,
+ "total_num_tokens": 147036,
+ "requests_per_second": 0.43386422442175154,
+ "tokens_per_second": 318.9683005103833
+}
\ No newline at end of file
diff --git a/docs/index.html b/docs/index.html
index 250da66..0d9a343 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -529,16 +529,16 @@
name: modelName,
quant: run.quant,
params: run.params_b || run.name_params_b,
- tp1: null,
- tp2: null
+ triton: null,
+ rocm: null
};
}
const m = testGroups[run.test].models[modelName];
- // Assign TP value
- if (run.env === "TP1") m.tp1 = run.tps_mean;
- if (run.env === "TP2") m.tp2 = run.tps_mean;
+ // Store this run's mean value under the column for its attention backend
+ if (run.backend === "Triton") m.triton = run.tps_mean;
+ if (run.backend === "ROCm") m.rocm = run.tps_mean;
});
// Convert map to array for sorting
@@ -681,7 +681,8 @@
thead.innerHTML = `
| Model |
- TP1 |
+ Triton Attention |
+ ROCm Attention |
`;
table.appendChild(thead);
@@ -698,7 +699,15 @@
// Values
// Pass unit from meta
const unit = meta ? meta.unit : "";
- const val1 = formatVal(m.tp1, unit);
+ const val1 = formatVal(m.triton, unit);
+
+ // ROCm column: show 'X' when the ROCm value is missing (null) or 0 while Triton has a positive value
+ let val2;
+ if ((m.rocm === null || m.rocm === 0) && m.triton > 0) {
+ val2 = 'X';
+ } else {
+ val2 = formatVal(m.rocm, unit);
+ }
tr.innerHTML = `
@@ -708,6 +717,7 @@
|
${val1} |
+ ${val2} |
`;
tbody.appendChild(tr);
});
diff --git a/docs/parse_results.py b/docs/parse_results.py
index 5a7dc81..83e4cb9 100644
--- a/docs/parse_results.py
+++ b/docs/parse_results.py
@@ -4,16 +4,17 @@ import json
import re
from pathlib import Path
+
+
# Config
-BENCHMARK_DIR = Path("../benchmarks/benchmark_results")
-OUTPUT_FILE = Path("results.json")
+SCRIPT_DIR = Path(__file__).parent.resolve()
+BENCHMARK_SOURCES = {
+ "Triton": SCRIPT_DIR.parent / "benchmarks" / "benchmark_results",
+ "ROCm": SCRIPT_DIR.parent / "benchmarks" / "benchmark_results_rocm_attn" / "benchmark_results"
+}
+OUTPUT_FILE = SCRIPT_DIR / "results.json"
# Regex to parse model name for quantization and parameters
-# Examples:
-# "meta-llama/Meta-Llama-3.1-8B-In
-# struct"
-# "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit"
-# "RedHatAI/Llama-3.1-8B-Instruct-FP8-block"
PARAMS_REGEX = r"(\d+(?:\.\d+)?)B"
QUANT_REGEX = r"(FP8|AWQ|GPTQ|BF16|4bit|Int4)"
@@ -24,7 +25,8 @@ def extract_meta(model_name):
# Quant
quant_match = re.search(QUANT_REGEX, model_name, re.IGNORECASE)
- quant = quant_match.group(1).upper() if quant_match else "BF16" # Default assumption if no tag? Or unknown.
+ quant = quant_match.group(1).upper() if quant_match else "BF16"
+
# Refine quant if 4bit
if quant == "4BIT" or quant == "INT4":
if "GPTQ" in model_name: quant = "GPTQ-4bit"
@@ -36,137 +38,88 @@ def extract_meta(model_name):
def parse_logs():
runs = []
- if not BENCHMARK_DIR.exists():
- print(f"Error: {BENCHMARK_DIR} does not exist!")
- return []
-
- print(f"Scanning {BENCHMARK_DIR}...")
-
- # Files are flat in the dir: {model_safe}_tp{tp}_{type}.json
- # or latency: {model_safe}_tp{tp}_qps{q}_latency.json
-
- # We need to group by (model, tp) to form cohesive records if we want,
- # BUT the webapp expects a list of "runs".
- # Looking at the example JSON, each "run" is a single test point (e.g. "pp2048 @ d16384" OR "tg32 @ d16384")
- # Actually, looking at the provided valid example:
- # "test": "pp512", "tps_mean": 2708.86 ...
-
- # Our data:
- # throughput.json -> tokens_per_second. This is usually "decoding" or a mix?
- # vLLM bench throughput usually streams tokens.
- # Let's look at what run_vllm_bench.py produces.
- # Throughput: --input-len 1024 --output-len 512.
- # This is effectively a mixed batch.
- # We'll label it "Throughput (1024/512)" or just "Throughput"
-
- # Latency: qps-based.
-
- files = list(BENCHMARK_DIR.glob("*.json"))
-
- for f in files:
- fname = f.name
- try:
- data = json.loads(f.read_text())
- except:
- print(f"Skipping bad JSON: {fname}")
+ for backend_name, bench_dir in BENCHMARK_SOURCES.items():
+ if not bench_dir.exists():
+ print(f"Warning: {bench_dir} does not exist, skipping.")
continue
- # Infer metadata from filename
- # Format: {model_safe}_tp{tp}_{suffix}
- # Suffix can be: "throughput.json" or "qps{q}_latency.json"
+ print(f"Scanning {bench_dir} for {backend_name} results...")
+ files = list(bench_dir.glob("*.json"))
- # We need model name. The script replaces / with _ in filenames.
- # But we verify against the known models list? Or just parse string.
- # We can reconstruct roughly.
-
- # Split by "_tp" which is a strong delimiter
- parts = fname.split("_tp")
- if len(parts) < 2: continue
-
- model_part = parts[0]
- rest = parts[1] # "1_throughput.json" or "2_qps1.0_latency.json"
-
- # TP
- tp_match = re.match(r"^(\d+)", rest)
- if not tp_match: continue
- tp = int(tp_match.group(1))
-
- # Env mapping
- env = f"TP{tp}"
-
- # Model Name Restoration (best effort or matching)
- # In the script: model.replace("/", "_")
- # We can reverse this if we have the list, but for now let's just use the clean string?
- # The webapp uses "model_clean" and "model".
- # Let's assume standard "org_model" format -> "org/model"
- if "_" in model_part:
- # Heuristic: First _ is likely the slash
- model_display = model_part.replace("_", "/", 1)
- else:
- model_display = model_part
-
- params_b, quant = extract_meta(model_display)
-
- base_run = {
- "model": model_display,
- "model_clean": model_display,
- "env": env,
- "gpu_config": "dual" if tp > 1 else "single",
- "quant": quant,
- "params_b": params_b,
- "name_params_b": params_b,
- # Defaults
- "backend": "vLLM",
- "error": False
- }
+ for f in files:
+ fname = f.name
+ try:
+ data = json.loads(f.read_text())
+ except:
+ print(f"Skipping bad JSON: {fname}")
+ continue
- if "throughput" in fname:
- # Throughput run
- # data has "tokens_per_second"
- tps = data.get("tokens_per_second", 0)
+ # Filename format: {model_safe}_tp{tp}_{suffix}; split on the "_tp" delimiter
+ parts = fname.split("_tp")
+ if len(parts) < 2: continue
- run = base_run.copy()
- run["test"] = "Throughput"
- run["tps_mean"] = tps
- # If tps is 0 or missing, it might be an error?
- if tps == 0 and "error" in str(data).lower():
- run["error"] = True
+ model_part = parts[0]
+ rest = parts[1] # "1_throughput.json"
- runs.append(run)
+ # TP
+ tp_match = re.match(r"^(\d+)", rest)
+ if not tp_match: continue
+ tp = int(tp_match.group(1))
+
+ # Model name: turn the first "_" back into "/" (heuristic for "org_model" -> "org/model")
+ if "_" in model_part:
+ model_display = model_part.replace("_", "/", 1)
+ else:
+ model_display = model_part
+
+ params_b, quant = extract_meta(model_display)
+
+ base_run = {
+ "model": model_display,
+ "model_clean": model_display,
+ "env": f"TP{tp}",
+ "gpu_config": "dual" if tp > 1 else "single",
+ "quant": quant,
+ "params_b": params_b,
+ "name_params_b": params_b,
+ "backend": backend_name, # "Triton" or "ROCm"
+ "error": False
+ }
- elif "latency" in fname:
- # Latency run
- # raw_output has strings like "Mean TTFT: 12.3 ms", "Mean TPOT: 45.6 ms"
- raw = data.get("raw_output", "")
- qps_match = re.search(r"_qps([\d\.]+)_", fname)
- qps = qps_match.group(1) if qps_match else "?"
-
- # Extract metrics
- ttft = 0.0
- tpot = 0.0
-
- ttft_m = re.search(r"(?:Mean TTFT|TTFT).*?([\d\.]+)", raw)
- if ttft_m: ttft = float(ttft_m.group(1))
-
- tpot_m = re.search(r"(?:Mean TPOT|TPOT).*?([\d\.]+)", raw)
- if tpot_m: tpot = float(tpot_m.group(1))
-
- # We create TWO entries? Or how does the webapp handle multiple metrics?
- # Example webapp table columns are "Backends" showing ONE value.
- # But grouping is by "Test".
- # So we can have a test called "TTFT (QPS 1.0)" and "TPOT (QPS 1.0)"
-
- # Entry 1: TTFT
- r1 = base_run.copy()
- r1["test"] = f"TTFT @ QPS {qps}"
- r1["tps_mean"] = ttft # Using tps_mean field for the numeric value
- runs.append(r1)
-
- # Entry 2: TPOT
- r2 = base_run.copy()
- r2["test"] = f"TPOT @ QPS {qps}"
- r2["tps_mean"] = tpot
- runs.append(r2)
+ if "throughput" in fname:
+ tps = data.get("tokens_per_second", 0)
+ run = base_run.copy()
+ run["test"] = "Throughput"
+ run["tps_mean"] = tps
+ if tps == 0 or (isinstance(data, dict) and "error" in str(data).lower()): # checking if error string is in json dump
+ run["error"] = True
+ runs.append(run)
+
+ elif "latency" in fname:
+ raw = data.get("raw_output", "")
+ qps_match = re.search(r"_qps([\d\.]+)_", fname)
+ qps = qps_match.group(1) if qps_match else "?"
+
+ ttft = 0.0
+ tpot = 0.0
+
+ ttft_m = re.search(r"(?:Mean TTFT|TTFT).*?([\d\.]+)", raw)
+ if ttft_m: ttft = float(ttft_m.group(1))
+
+ tpot_m = re.search(r"(?:Mean TPOT|TPOT).*?([\d\.]+)", raw)
+ if tpot_m: tpot = float(tpot_m.group(1))
+
+ # TTFT
+ r1 = base_run.copy()
+ r1["test"] = f"TTFT @ QPS {qps}"
+ r1["tps_mean"] = ttft
+ runs.append(r1)
+
+ # TPOT
+ r2 = base_run.copy()
+ r2["test"] = f"TPOT @ QPS {qps}"
+ r2["tps_mean"] = tpot
+ runs.append(r2)
return runs
diff --git a/docs/results.json b/docs/results.json
index 8d17292..f106554 100644
--- a/docs/results.json
+++ b/docs/results.json
@@ -8,7 +8,7 @@
"quant": "AWQ",
"params_b": 14.0,
"name_params_b": 14.0,
- "backend": "vLLM",
+ "backend": "Triton",
"error": false,
"test": "Throughput",
"tps_mean": 112.69232830266365
@@ -21,7 +21,7 @@
"quant": "BF16",
"params_b": 8.0,
"name_params_b": 8.0,
- "backend": "vLLM",
+ "backend": "Triton",
"error": false,
"test": "Throughput",
"tps_mean": 278.99494393048457
@@ -34,7 +34,7 @@
"quant": "BF16",
"params_b": 12.0,
"name_params_b": 12.0,
- "backend": "vLLM",
+ "backend": "Triton",
"error": false,
"test": "Throughput",
"tps_mean": 162.71078485804028
@@ -47,7 +47,7 @@
"quant": "GPTQ",
"params_b": 80.0,
"name_params_b": 80.0,
- "backend": "vLLM",
+ "backend": "Triton",
"error": false,
"test": "Throughput",
"tps_mean": 112.62418795067208
@@ -60,7 +60,7 @@
"quant": "BF16",
"params_b": 20.0,
"name_params_b": 20.0,
- "backend": "vLLM",
+ "backend": "Triton",
"error": false,
"test": "Throughput",
"tps_mean": 313.85817605876395
@@ -73,7 +73,7 @@
"quant": "GPTQ",
"params_b": 30.0,
"name_params_b": 30.0,
- "backend": "vLLM",
+ "backend": "Triton",
"error": false,
"test": "Throughput",
"tps_mean": 271.7264154071495
@@ -86,10 +86,75 @@
"quant": "BF16",
"params_b": 120.0,
"name_params_b": 120.0,
- "backend": "vLLM",
+ "backend": "Triton",
"error": false,
"test": "Throughput",
"tps_mean": 109.73523843987172
+ },
+ {
+ "model": "Qwen/Qwen3-14B-AWQ",
+ "model_clean": "Qwen/Qwen3-14B-AWQ",
+ "env": "TP1",
+ "gpu_config": "single",
+ "quant": "AWQ",
+ "params_b": 14.0,
+ "name_params_b": 14.0,
+ "backend": "ROCm",
+ "error": false,
+ "test": "Throughput",
+ "tps_mean": 118.62544339374007
+ },
+ {
+ "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model_clean": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "env": "TP1",
+ "gpu_config": "single",
+ "quant": "BF16",
+ "params_b": 8.0,
+ "name_params_b": 8.0,
+ "backend": "ROCm",
+ "error": false,
+ "test": "Throughput",
+ "tps_mean": 320.4458308584372
+ },
+ {
+ "model": "google/gemma-3-12b-it",
+ "model_clean": "google/gemma-3-12b-it",
+ "env": "TP1",
+ "gpu_config": "single",
+ "quant": "BF16",
+ "params_b": 12.0,
+ "name_params_b": 12.0,
+ "backend": "ROCm",
+ "error": false,
+ "test": "Throughput",
+ "tps_mean": 275.34859975563967
+ },
+ {
+ "model": "openai/gpt-oss-20b",
+ "model_clean": "openai/gpt-oss-20b",
+ "env": "TP1",
+ "gpu_config": "single",
+ "quant": "BF16",
+ "params_b": 20.0,
+ "name_params_b": 20.0,
+ "backend": "ROCm",
+ "error": false,
+ "test": "Throughput",
+ "tps_mean": 318.9683005103833
+ },
+ {
+ "model": "openai/gpt-oss-120b",
+ "model_clean": "openai/gpt-oss-120b",
+ "env": "TP1",
+ "gpu_config": "single",
+ "quant": "BF16",
+ "params_b": 120.0,
+ "name_params_b": 120.0,
+ "backend": "ROCm",
+ "error": false,
+ "test": "Throughput",
+ "tps_mean": 114.91339037290285
}
]
}
\ No newline at end of file
diff --git a/results.json b/results.json
new file mode 100644
index 0000000..23e07c0
--- /dev/null
+++ b/results.json
@@ -0,0 +1,3 @@
+{
+ "runs": []
+}
\ No newline at end of file