Add ROCm/Triton attention backend comparison

2025-12-20 11:49:03 +00:00
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 1237.550695703001,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.16160954108339642,
+    "tokens_per_second": 118.62544339374007
+}
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 540.6128817510034,
+    "num_requests": 200,
+    "total_num_tokens": 148857,
+    "requests_per_second": 0.36995048906754757,
+    "tokens_per_second": 275.34859975563967
+}
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 455.23138687500614,
+    "num_requests": 200,
+    "total_num_tokens": 145877,
+    "requests_per_second": 0.43933701797875907,
+    "tokens_per_second": 320.4458308584372
+}
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 1279.5375675789983,
+    "num_requests": 200,
+    "total_num_tokens": 147036,
+    "requests_per_second": 0.15630646967124087,
+    "tokens_per_second": 114.91339037290285
+}
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 460.97370730798866,
+    "num_requests": 200,
+    "total_num_tokens": 147036,
+    "requests_per_second": 0.43386422442175154,
+    "tokens_per_second": 318.9683005103833
+}
@@ -529,16 +529,16 @@
                        name: modelName,
                        quant: run.quant,
                        params: run.params_b || run.name_params_b,
-                        tp1: null,
-                        tp2: null
+                        triton: null,
+                        rocm: null
                    };
                }

                const m = testGroups[run.test].models[modelName];

-                // Assign TP value
-                if (run.env === "TP1") m.tp1 = run.tps_mean;
-                if (run.env === "TP2") m.tp2 = run.tps_mean;
+                // Assign Backend value
+                if (run.backend === "Triton") m.triton = run.tps_mean;
+                if (run.backend === "ROCm") m.rocm = run.tps_mean;
            });

            // Convert map to array for sorting
@@ -681,7 +681,8 @@
            thead.innerHTML = `
            <tr>
                <th class="col-model">Model</th>
-                <th class="col-data">TP1</th>
+                <th class="col-data">Triton Attention</th>
+                <th class="col-data">ROCm Attention</th>
            </tr>
        `;
            table.appendChild(thead);
@@ -698,7 +699,15 @@
                // Values
                // Pass unit from meta
                const unit = meta ? meta.unit : "";
-                const val1 = formatVal(m.tp1, unit);
+                const val1 = formatVal(m.triton, unit);
+
+                // ROCm column: when ROCm has no result (null or 0) but Triton does, show a red 'X' to mark a crashed/missing run
+                let val2;
+                if ((m.rocm === null || m.rocm === 0) && m.triton > 0) {
+                    val2 = '<span class="val-na" style="color: #ef4444; font-weight:bold;">X</span>';
+                } else {
+                    val2 = formatVal(m.rocm, unit);
+                }

                tr.innerHTML = `
                <td>
@@ -708,6 +717,7 @@
                    </div>
                </td>
                <td class="col-data">${val1}</td>
+                <td class="col-data">${val2}</td>
            `;
                tbody.appendChild(tr);
            });
@@ -4,16 +4,17 @@ import json
 import re
 from pathlib import Path

+
+
 # Config
-BENCHMARK_DIR = Path("../benchmarks/benchmark_results")
-OUTPUT_FILE = Path("results.json")
+SCRIPT_DIR = Path(__file__).parent.resolve()
+BENCHMARK_SOURCES = {
+    "Triton": SCRIPT_DIR.parent / "benchmarks" / "benchmark_results",
+    "ROCm": SCRIPT_DIR.parent / "benchmarks" / "benchmark_results_rocm_attn" / "benchmark_results"
+}
+OUTPUT_FILE = SCRIPT_DIR / "results.json"

 # Regex to parse model name for quantization and parameters
-# Examples: 
-# "meta-llama/Meta-Llama-3.1-8B-In
-# struct"
-# "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit"
-# "RedHatAI/Llama-3.1-8B-Instruct-FP8-block"
 PARAMS_REGEX = r"(\d+(?:\.\d+)?)B"
 QUANT_REGEX = r"(FP8|AWQ|GPTQ|BF16|4bit|Int4)"

@@ -24,7 +25,8 @@ def extract_meta(model_name):
    
    # Quant
    quant_match = re.search(QUANT_REGEX, model_name, re.IGNORECASE)
-    quant = quant_match.group(1).upper() if quant_match else "BF16" # Default assumption if no tag? Or unknown.
+    quant = quant_match.group(1).upper() if quant_match else "BF16"
+    
    # Refine quant if 4bit
    if quant == "4BIT" or quant == "INT4":
        if "GPTQ" in model_name: quant = "GPTQ-4bit"
@@ -36,137 +38,88 @@ def extract_meta(model_name):
 def parse_logs():
    runs = []
    
-    if not BENCHMARK_DIR.exists():
-        print(f"Error: {BENCHMARK_DIR} does not exist!")
-        return []
-
-    print(f"Scanning {BENCHMARK_DIR}...")
-    
-    # Files are flat in the dir: {model_safe}_tp{tp}_{type}.json
-    # or latency: {model_safe}_tp{tp}_qps{q}_latency.json
-    
-    # We need to group by (model, tp) to form cohesive records if we want, 
-    # BUT the webapp expects a list of "runs".
-    # Looking at the example JSON, each "run" is a single test point (e.g. "pp2048 @ d16384" OR "tg32 @ d16384")
-    # Actually, looking at the provided valid example:
-    # "test": "pp512", "tps_mean": 2708.86 ...
-    
-    # Our data:
-    # throughput.json -> tokens_per_second. This is usually "decoding" or a mix?
-    # vLLM bench throughput usually streams tokens. 
-    # Let's look at what run_vllm_bench.py produces.
-    # Throughput: --input-len 1024 --output-len 512.
-    # This is effectively a mixed batch. 
-    # We'll label it "Throughput (1024/512)" or just "Throughput"
-    
-    # Latency: qps-based.
-    
-    files = list(BENCHMARK_DIR.glob("*.json"))
-    
-    for f in files:
-        fname = f.name
-        try:
-            data = json.loads(f.read_text())
-        except:
-            print(f"Skipping bad JSON: {fname}")
+    for backend_name, bench_dir in BENCHMARK_SOURCES.items():
+        if not bench_dir.exists():
+            print(f"Warning: {bench_dir} does not exist, skipping.")
            continue

-        # Infer metadata from filename
-        # Format: {model_safe}_tp{tp}_{suffix}
-        # Suffix can be: "throughput.json" or "qps{q}_latency.json"
+        print(f"Scanning {bench_dir} for {backend_name} results...")
+        files = list(bench_dir.glob("*.json"))
        
-        # We need model name. The script replaces / with _ in filenames.
-        # But we verify against the known models list? Or just parse string.
-        # We can reconstruct roughly.
-        
-        # Split by "_tp" which is a strong delimiter
-        parts = fname.split("_tp")
-        if len(parts) < 2: continue
-        
-        model_part = parts[0]
-        rest = parts[1] # "1_throughput.json" or "2_qps1.0_latency.json"
-        
-        # TP
-        tp_match = re.match(r"^(\d+)", rest)
-        if not tp_match: continue
-        tp = int(tp_match.group(1))
-        
-        # Env mapping
-        env = f"TP{tp}"
-        
-        # Model Name Restoration (best effort or matching)
-        # In the script: model.replace("/", "_")
-        # We can reverse this if we have the list, but for now let's just use the clean string?
-        # The webapp uses "model_clean" and "model".
-        # Let's assume standard "org_model" format -> "org/model"
-        if "_" in model_part:
-            # Heuristic: First _ is likely the slash
-            model_display = model_part.replace("_", "/", 1)
-        else:
-            model_display = model_part
-            
-        params_b, quant = extract_meta(model_display)
-        
-        base_run = {
-            "model": model_display,
-            "model_clean": model_display,
-            "env": env,
-            "gpu_config": "dual" if tp > 1 else "single",
-            "quant": quant,
-            "params_b": params_b,
-            "name_params_b": params_b,
-            # Defaults
-            "backend": "vLLM", 
-            "error": False
-        }
+        for f in files:
+            fname = f.name
+            try:
+                data = json.loads(f.read_text())
+            except:
+                print(f"Skipping bad JSON: {fname}")
+                continue

-        if "throughput" in fname:
-            # Throughput run
-            # data has "tokens_per_second"
-            tps = data.get("tokens_per_second", 0)
+            # Filename parsing
+            parts = fname.split("_tp")
+            if len(parts) < 2: continue
            
-            run = base_run.copy()
-            run["test"] = "Throughput"
-            run["tps_mean"] = tps
-            # If tps is 0 or missing, it might be an error?
-            if tps == 0 and "error" in str(data).lower():
-                run["error"] = True
+            model_part = parts[0]
+            rest = parts[1] # "1_throughput.json"
            
-            runs.append(run)
+            # TP
+            tp_match = re.match(r"^(\d+)", rest)
+            if not tp_match: continue
+            tp = int(tp_match.group(1))
+            
+            # Model Name
+            if "_" in model_part:
+                model_display = model_part.replace("_", "/", 1)
+            else:
+                model_display = model_part
+                
+            params_b, quant = extract_meta(model_display)
+            
+            base_run = {
+                "model": model_display,
+                "model_clean": model_display,
+                "env": f"TP{tp}",
+                "gpu_config": "dual" if tp > 1 else "single",
+                "quant": quant,
+                "params_b": params_b,
+                "name_params_b": params_b,
+                "backend": backend_name, # "Triton" or "ROCm"
+                "error": False
+            }

-        elif "latency" in fname:
-            # Latency run
-            # raw_output has strings like "Mean TTFT: 12.3 ms", "Mean TPOT: 45.6 ms"
-            raw = data.get("raw_output", "")
-            qps_match = re.search(r"_qps([\d\.]+)_", fname)
-            qps = qps_match.group(1) if qps_match else "?"
-            
-            # Extract metrics
-            ttft = 0.0
-            tpot = 0.0
-            
-            ttft_m = re.search(r"(?:Mean TTFT|TTFT).*?([\d\.]+)", raw)
-            if ttft_m: ttft = float(ttft_m.group(1))
-            
-            tpot_m = re.search(r"(?:Mean TPOT|TPOT).*?([\d\.]+)", raw)
-            if tpot_m: tpot = float(tpot_m.group(1))
-            
-            # We create TWO entries? Or how does the webapp handle multiple metrics?
-            # Example webapp table columns are "Backends" showing ONE value.
-            # But grouping is by "Test". 
-            # So we can have a test called "TTFT (QPS 1.0)" and "TPOT (QPS 1.0)"
-            
-            # Entry 1: TTFT
-            r1 = base_run.copy()
-            r1["test"] = f"TTFT @ QPS {qps}"
-            r1["tps_mean"] = ttft # Using tps_mean field for the numeric value
-            runs.append(r1)
-            
-            # Entry 2: TPOT
-            r2 = base_run.copy()
-            r2["test"] = f"TPOT @ QPS {qps}"
-            r2["tps_mean"] = tpot
-            runs.append(r2)
+            if "throughput" in fname:
+                tps = data.get("tokens_per_second", 0)
+                run = base_run.copy()
+                run["test"] = "Throughput"
+                run["tps_mean"] = tps
+                if tps == 0 or (isinstance(data, dict) and "error" in str(data).lower()): # checking if error string is in json dump
+                     run["error"] = True
+                runs.append(run)
+
+            elif "latency" in fname:
+                raw = data.get("raw_output", "")
+                qps_match = re.search(r"_qps([\d\.]+)_", fname)
+                qps = qps_match.group(1) if qps_match else "?"
+                
+                ttft = 0.0
+                tpot = 0.0
+                
+                ttft_m = re.search(r"(?:Mean TTFT|TTFT).*?([\d\.]+)", raw)
+                if ttft_m: ttft = float(ttft_m.group(1))
+                
+                tpot_m = re.search(r"(?:Mean TPOT|TPOT).*?([\d\.]+)", raw)
+                if tpot_m: tpot = float(tpot_m.group(1))
+                
+                # TTFT
+                r1 = base_run.copy()
+                r1["test"] = f"TTFT @ QPS {qps}"
+                r1["tps_mean"] = ttft
+                runs.append(r1)
+                
+                # TPOT
+                r2 = base_run.copy()
+                r2["test"] = f"TPOT @ QPS {qps}"
+                r2["tps_mean"] = tpot
+                runs.append(r2)

    return runs

@@ -8,7 +8,7 @@
      "quant": "AWQ",
      "params_b": 14.0,
      "name_params_b": 14.0,
-      "backend": "vLLM",
+      "backend": "Triton",
      "error": false,
      "test": "Throughput",
      "tps_mean": 112.69232830266365
@@ -21,7 +21,7 @@
      "quant": "BF16",
      "params_b": 8.0,
      "name_params_b": 8.0,
-      "backend": "vLLM",
+      "backend": "Triton",
      "error": false,
      "test": "Throughput",
      "tps_mean": 278.99494393048457
@@ -34,7 +34,7 @@
      "quant": "BF16",
      "params_b": 12.0,
      "name_params_b": 12.0,
-      "backend": "vLLM",
+      "backend": "Triton",
      "error": false,
      "test": "Throughput",
      "tps_mean": 162.71078485804028
@@ -47,7 +47,7 @@
      "quant": "GPTQ",
      "params_b": 80.0,
      "name_params_b": 80.0,
-      "backend": "vLLM",
+      "backend": "Triton",
      "error": false,
      "test": "Throughput",
      "tps_mean": 112.62418795067208
@@ -60,7 +60,7 @@
      "quant": "BF16",
      "params_b": 20.0,
      "name_params_b": 20.0,
-      "backend": "vLLM",
+      "backend": "Triton",
      "error": false,
      "test": "Throughput",
      "tps_mean": 313.85817605876395
@@ -73,7 +73,7 @@
      "quant": "GPTQ",
      "params_b": 30.0,
      "name_params_b": 30.0,
-      "backend": "vLLM",
+      "backend": "Triton",
      "error": false,
      "test": "Throughput",
      "tps_mean": 271.7264154071495
@@ -86,10 +86,75 @@
      "quant": "BF16",
      "params_b": 120.0,
      "name_params_b": 120.0,
-      "backend": "vLLM",
+      "backend": "Triton",
      "error": false,
      "test": "Throughput",
      "tps_mean": 109.73523843987172
+    },
+    {
+      "model": "Qwen/Qwen3-14B-AWQ",
+      "model_clean": "Qwen/Qwen3-14B-AWQ",
+      "env": "TP1",
+      "gpu_config": "single",
+      "quant": "AWQ",
+      "params_b": 14.0,
+      "name_params_b": 14.0,
+      "backend": "ROCm",
+      "error": false,
+      "test": "Throughput",
+      "tps_mean": 118.62544339374007
+    },
+    {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "model_clean": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "env": "TP1",
+      "gpu_config": "single",
+      "quant": "BF16",
+      "params_b": 8.0,
+      "name_params_b": 8.0,
+      "backend": "ROCm",
+      "error": false,
+      "test": "Throughput",
+      "tps_mean": 320.4458308584372
+    },
+    {
+      "model": "google/gemma-3-12b-it",
+      "model_clean": "google/gemma-3-12b-it",
+      "env": "TP1",
+      "gpu_config": "single",
+      "quant": "BF16",
+      "params_b": 12.0,
+      "name_params_b": 12.0,
+      "backend": "ROCm",
+      "error": false,
+      "test": "Throughput",
+      "tps_mean": 275.34859975563967
+    },
+    {
+      "model": "openai/gpt-oss-20b",
+      "model_clean": "openai/gpt-oss-20b",
+      "env": "TP1",
+      "gpu_config": "single",
+      "quant": "BF16",
+      "params_b": 20.0,
+      "name_params_b": 20.0,
+      "backend": "ROCm",
+      "error": false,
+      "test": "Throughput",
+      "tps_mean": 318.9683005103833
+    },
+    {
+      "model": "openai/gpt-oss-120b",
+      "model_clean": "openai/gpt-oss-120b",
+      "env": "TP1",
+      "gpu_config": "single",
+      "quant": "BF16",
+      "params_b": 120.0,
+      "name_params_b": 120.0,
+      "backend": "ROCm",
+      "error": false,
+      "test": "Throughput",
+      "tps_mean": 114.91339037290285
    }
  ]
 }
@@ -0,0 +1,3 @@
+{
+  "runs": []
+}