added ROCm/Triton attention comparison
Αυτή η υποβολή περιλαμβάνεται σε:
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 1237.550695703001,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.16160954108339642,
|
||||
"tokens_per_second": 118.62544339374007
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 540.6128817510034,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 148857,
|
||||
"requests_per_second": 0.36995048906754757,
|
||||
"tokens_per_second": 275.34859975563967
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 455.23138687500614,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 145877,
|
||||
"requests_per_second": 0.43933701797875907,
|
||||
"tokens_per_second": 320.4458308584372
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 1279.5375675789983,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 147036,
|
||||
"requests_per_second": 0.15630646967124087,
|
||||
"tokens_per_second": 114.91339037290285
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 460.97370730798866,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 147036,
|
||||
"requests_per_second": 0.43386422442175154,
|
||||
"tokens_per_second": 318.9683005103833
|
||||
}
|
||||
+17
-7
@@ -529,16 +529,16 @@
|
||||
name: modelName,
|
||||
quant: run.quant,
|
||||
params: run.params_b || run.name_params_b,
|
||||
tp1: null,
|
||||
tp2: null
|
||||
triton: null,
|
||||
rocm: null
|
||||
};
|
||||
}
|
||||
|
||||
const m = testGroups[run.test].models[modelName];
|
||||
|
||||
// Assign TP value
|
||||
if (run.env === "TP1") m.tp1 = run.tps_mean;
|
||||
if (run.env === "TP2") m.tp2 = run.tps_mean;
|
||||
// Assign Backend value
|
||||
if (run.backend === "Triton") m.triton = run.tps_mean;
|
||||
if (run.backend === "ROCm") m.rocm = run.tps_mean;
|
||||
});
|
||||
|
||||
// Convert map to array for sorting
|
||||
@@ -681,7 +681,8 @@
|
||||
thead.innerHTML = `
|
||||
<tr>
|
||||
<th class="col-model">Model</th>
|
||||
<th class="col-data">TP1</th>
|
||||
<th class="col-data">Triton Attention</th>
|
||||
<th class="col-data">ROCm Attention</th>
|
||||
</tr>
|
||||
`;
|
||||
table.appendChild(thead);
|
||||
@@ -698,7 +699,15 @@
|
||||
// Values
|
||||
// Pass unit from meta
|
||||
const unit = meta ? meta.unit : "";
|
||||
const val1 = formatVal(m.tp1, unit);
|
||||
const val1 = formatVal(m.triton, unit);
|
||||
|
||||
// Special handling for ROCm column where we want 'X' for crashes/missing if Triton has data
|
||||
let val2;
|
||||
if ((m.rocm === null || m.rocm === 0) && m.triton > 0) {
|
||||
val2 = '<span class="val-na" style="color: #ef4444; font-weight:bold;">X</span>';
|
||||
} else {
|
||||
val2 = formatVal(m.rocm, unit);
|
||||
}
|
||||
|
||||
tr.innerHTML = `
|
||||
<td>
|
||||
@@ -708,6 +717,7 @@
|
||||
</div>
|
||||
</td>
|
||||
<td class="col-data">${val1}</td>
|
||||
<td class="col-data">${val2}</td>
|
||||
`;
|
||||
tbody.appendChild(tr);
|
||||
});
|
||||
|
||||
+85
-132
@@ -4,16 +4,17 @@ import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
|
||||
# Config
|
||||
BENCHMARK_DIR = Path("../benchmarks/benchmark_results")
|
||||
OUTPUT_FILE = Path("results.json")
|
||||
SCRIPT_DIR = Path(__file__).parent.resolve()
|
||||
BENCHMARK_SOURCES = {
|
||||
"Triton": SCRIPT_DIR.parent / "benchmarks" / "benchmark_results",
|
||||
"ROCm": SCRIPT_DIR.parent / "benchmarks" / "benchmark_results_rocm_attn" / "benchmark_results"
|
||||
}
|
||||
OUTPUT_FILE = SCRIPT_DIR / "results.json"
|
||||
|
||||
# Regex to parse model name for quantization and parameters
|
||||
# Examples:
|
||||
# "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||
# "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit"
|
||||
# "RedHatAI/Llama-3.1-8B-Instruct-FP8-block"
|
||||
PARAMS_REGEX = r"(\d+(?:\.\d+)?)B"
|
||||
QUANT_REGEX = r"(FP8|AWQ|GPTQ|BF16|4bit|Int4)"
|
||||
|
||||
@@ -24,7 +25,8 @@ def extract_meta(model_name):
|
||||
|
||||
# Quant
|
||||
quant_match = re.search(QUANT_REGEX, model_name, re.IGNORECASE)
|
||||
quant = quant_match.group(1).upper() if quant_match else "BF16" # Default assumption if no tag? Or unknown.
|
||||
quant = quant_match.group(1).upper() if quant_match else "BF16"
|
||||
|
||||
# Refine quant if 4bit
|
||||
if quant == "4BIT" or quant == "INT4":
|
||||
if "GPTQ" in model_name: quant = "GPTQ-4bit"
|
||||
@@ -36,137 +38,88 @@ def extract_meta(model_name):
|
||||
def parse_logs():
|
||||
runs = []
|
||||
|
||||
if not BENCHMARK_DIR.exists():
|
||||
print(f"Error: {BENCHMARK_DIR} does not exist!")
|
||||
return []
|
||||
|
||||
print(f"Scanning {BENCHMARK_DIR}...")
|
||||
|
||||
# Files are flat in the dir: {model_safe}_tp{tp}_{type}.json
|
||||
# or latency: {model_safe}_tp{tp}_qps{q}_latency.json
|
||||
|
||||
# We need to group by (model, tp) to form cohesive records if we want,
|
||||
# BUT the webapp expects a list of "runs".
|
||||
# Looking at the example JSON, each "run" is a single test point (e.g. "pp2048 @ d16384" OR "tg32 @ d16384")
|
||||
# Actually, looking at the provided valid example:
|
||||
# "test": "pp512", "tps_mean": 2708.86 ...
|
||||
|
||||
# Our data:
|
||||
# throughput.json -> tokens_per_second. This is usually "decoding" or a mix?
|
||||
# vLLM bench throughput usually streams tokens.
|
||||
# Let's look at what run_vllm_bench.py produces.
|
||||
# Throughput: --input-len 1024 --output-len 512.
|
||||
# This is effectively a mixed batch.
|
||||
# We'll label it "Throughput (1024/512)" or just "Throughput"
|
||||
|
||||
# Latency: qps-based.
|
||||
|
||||
files = list(BENCHMARK_DIR.glob("*.json"))
|
||||
|
||||
for f in files:
|
||||
fname = f.name
|
||||
try:
|
||||
data = json.loads(f.read_text())
|
||||
except:
|
||||
print(f"Skipping bad JSON: {fname}")
|
||||
for backend_name, bench_dir in BENCHMARK_SOURCES.items():
|
||||
if not bench_dir.exists():
|
||||
print(f"Warning: {bench_dir} does not exist, skipping.")
|
||||
continue
|
||||
|
||||
# Infer metadata from filename
|
||||
# Format: {model_safe}_tp{tp}_{suffix}
|
||||
# Suffix can be: "throughput.json" or "qps{q}_latency.json"
|
||||
print(f"Scanning {bench_dir} for {backend_name} results...")
|
||||
files = list(bench_dir.glob("*.json"))
|
||||
|
||||
# We need model name. The script replaces / with _ in filenames.
|
||||
# But we verify against the known models list? Or just parse string.
|
||||
# We can reconstruct roughly.
|
||||
|
||||
# Split by "_tp" which is a strong delimiter
|
||||
parts = fname.split("_tp")
|
||||
if len(parts) < 2: continue
|
||||
|
||||
model_part = parts[0]
|
||||
rest = parts[1] # "1_throughput.json" or "2_qps1.0_latency.json"
|
||||
|
||||
# TP
|
||||
tp_match = re.match(r"^(\d+)", rest)
|
||||
if not tp_match: continue
|
||||
tp = int(tp_match.group(1))
|
||||
|
||||
# Env mapping
|
||||
env = f"TP{tp}"
|
||||
|
||||
# Model Name Restoration (best effort or matching)
|
||||
# In the script: model.replace("/", "_")
|
||||
# We can reverse this if we have the list, but for now let's just use the clean string?
|
||||
# The webapp uses "model_clean" and "model".
|
||||
# Let's assume standard "org_model" format -> "org/model"
|
||||
if "_" in model_part:
|
||||
# Heuristic: First _ is likely the slash
|
||||
model_display = model_part.replace("_", "/", 1)
|
||||
else:
|
||||
model_display = model_part
|
||||
|
||||
params_b, quant = extract_meta(model_display)
|
||||
|
||||
base_run = {
|
||||
"model": model_display,
|
||||
"model_clean": model_display,
|
||||
"env": env,
|
||||
"gpu_config": "dual" if tp > 1 else "single",
|
||||
"quant": quant,
|
||||
"params_b": params_b,
|
||||
"name_params_b": params_b,
|
||||
# Defaults
|
||||
"backend": "vLLM",
|
||||
"error": False
|
||||
}
|
||||
for f in files:
|
||||
fname = f.name
|
||||
try:
|
||||
data = json.loads(f.read_text())
|
||||
except:
|
||||
print(f"Skipping bad JSON: {fname}")
|
||||
continue
|
||||
|
||||
if "throughput" in fname:
|
||||
# Throughput run
|
||||
# data has "tokens_per_second"
|
||||
tps = data.get("tokens_per_second", 0)
|
||||
# Filename parsing
|
||||
parts = fname.split("_tp")
|
||||
if len(parts) < 2: continue
|
||||
|
||||
run = base_run.copy()
|
||||
run["test"] = "Throughput"
|
||||
run["tps_mean"] = tps
|
||||
# If tps is 0 or missing, it might be an error?
|
||||
if tps == 0 and "error" in str(data).lower():
|
||||
run["error"] = True
|
||||
model_part = parts[0]
|
||||
rest = parts[1] # "1_throughput.json"
|
||||
|
||||
runs.append(run)
|
||||
# TP
|
||||
tp_match = re.match(r"^(\d+)", rest)
|
||||
if not tp_match: continue
|
||||
tp = int(tp_match.group(1))
|
||||
|
||||
# Model Name
|
||||
if "_" in model_part:
|
||||
model_display = model_part.replace("_", "/", 1)
|
||||
else:
|
||||
model_display = model_part
|
||||
|
||||
params_b, quant = extract_meta(model_display)
|
||||
|
||||
base_run = {
|
||||
"model": model_display,
|
||||
"model_clean": model_display,
|
||||
"env": f"TP{tp}",
|
||||
"gpu_config": "dual" if tp > 1 else "single",
|
||||
"quant": quant,
|
||||
"params_b": params_b,
|
||||
"name_params_b": params_b,
|
||||
"backend": backend_name, # "Triton" or "ROCm"
|
||||
"error": False
|
||||
}
|
||||
|
||||
elif "latency" in fname:
|
||||
# Latency run
|
||||
# raw_output has strings like "Mean TTFT: 12.3 ms", "Mean TPOT: 45.6 ms"
|
||||
raw = data.get("raw_output", "")
|
||||
qps_match = re.search(r"_qps([\d\.]+)_", fname)
|
||||
qps = qps_match.group(1) if qps_match else "?"
|
||||
|
||||
# Extract metrics
|
||||
ttft = 0.0
|
||||
tpot = 0.0
|
||||
|
||||
ttft_m = re.search(r"(?:Mean TTFT|TTFT).*?([\d\.]+)", raw)
|
||||
if ttft_m: ttft = float(ttft_m.group(1))
|
||||
|
||||
tpot_m = re.search(r"(?:Mean TPOT|TPOT).*?([\d\.]+)", raw)
|
||||
if tpot_m: tpot = float(tpot_m.group(1))
|
||||
|
||||
# We create TWO entries? Or how does the webapp handle multiple metrics?
|
||||
# Example webapp table columns are "Backends" showing ONE value.
|
||||
# But grouping is by "Test".
|
||||
# So we can have a test called "TTFT (QPS 1.0)" and "TPOT (QPS 1.0)"
|
||||
|
||||
# Entry 1: TTFT
|
||||
r1 = base_run.copy()
|
||||
r1["test"] = f"TTFT @ QPS {qps}"
|
||||
r1["tps_mean"] = ttft # Using tps_mean field for the numeric value
|
||||
runs.append(r1)
|
||||
|
||||
# Entry 2: TPOT
|
||||
r2 = base_run.copy()
|
||||
r2["test"] = f"TPOT @ QPS {qps}"
|
||||
r2["tps_mean"] = tpot
|
||||
runs.append(r2)
|
||||
if "throughput" in fname:
|
||||
tps = data.get("tokens_per_second", 0)
|
||||
run = base_run.copy()
|
||||
run["test"] = "Throughput"
|
||||
run["tps_mean"] = tps
|
||||
if tps == 0 or (isinstance(data, dict) and "error" in str(data).lower()): # checking if error string is in json dump
|
||||
run["error"] = True
|
||||
runs.append(run)
|
||||
|
||||
elif "latency" in fname:
|
||||
raw = data.get("raw_output", "")
|
||||
qps_match = re.search(r"_qps([\d\.]+)_", fname)
|
||||
qps = qps_match.group(1) if qps_match else "?"
|
||||
|
||||
ttft = 0.0
|
||||
tpot = 0.0
|
||||
|
||||
ttft_m = re.search(r"(?:Mean TTFT|TTFT).*?([\d\.]+)", raw)
|
||||
if ttft_m: ttft = float(ttft_m.group(1))
|
||||
|
||||
tpot_m = re.search(r"(?:Mean TPOT|TPOT).*?([\d\.]+)", raw)
|
||||
if tpot_m: tpot = float(tpot_m.group(1))
|
||||
|
||||
# TTFT
|
||||
r1 = base_run.copy()
|
||||
r1["test"] = f"TTFT @ QPS {qps}"
|
||||
r1["tps_mean"] = ttft
|
||||
runs.append(r1)
|
||||
|
||||
# TPOT
|
||||
r2 = base_run.copy()
|
||||
r2["test"] = f"TPOT @ QPS {qps}"
|
||||
r2["tps_mean"] = tpot
|
||||
runs.append(r2)
|
||||
|
||||
return runs
|
||||
|
||||
|
||||
+72
-7
@@ -8,7 +8,7 @@
|
||||
"quant": "AWQ",
|
||||
"params_b": 14.0,
|
||||
"name_params_b": 14.0,
|
||||
"backend": "vLLM",
|
||||
"backend": "Triton",
|
||||
"error": false,
|
||||
"test": "Throughput",
|
||||
"tps_mean": 112.69232830266365
|
||||
@@ -21,7 +21,7 @@
|
||||
"quant": "BF16",
|
||||
"params_b": 8.0,
|
||||
"name_params_b": 8.0,
|
||||
"backend": "vLLM",
|
||||
"backend": "Triton",
|
||||
"error": false,
|
||||
"test": "Throughput",
|
||||
"tps_mean": 278.99494393048457
|
||||
@@ -34,7 +34,7 @@
|
||||
"quant": "BF16",
|
||||
"params_b": 12.0,
|
||||
"name_params_b": 12.0,
|
||||
"backend": "vLLM",
|
||||
"backend": "Triton",
|
||||
"error": false,
|
||||
"test": "Throughput",
|
||||
"tps_mean": 162.71078485804028
|
||||
@@ -47,7 +47,7 @@
|
||||
"quant": "GPTQ",
|
||||
"params_b": 80.0,
|
||||
"name_params_b": 80.0,
|
||||
"backend": "vLLM",
|
||||
"backend": "Triton",
|
||||
"error": false,
|
||||
"test": "Throughput",
|
||||
"tps_mean": 112.62418795067208
|
||||
@@ -60,7 +60,7 @@
|
||||
"quant": "BF16",
|
||||
"params_b": 20.0,
|
||||
"name_params_b": 20.0,
|
||||
"backend": "vLLM",
|
||||
"backend": "Triton",
|
||||
"error": false,
|
||||
"test": "Throughput",
|
||||
"tps_mean": 313.85817605876395
|
||||
@@ -73,7 +73,7 @@
|
||||
"quant": "GPTQ",
|
||||
"params_b": 30.0,
|
||||
"name_params_b": 30.0,
|
||||
"backend": "vLLM",
|
||||
"backend": "Triton",
|
||||
"error": false,
|
||||
"test": "Throughput",
|
||||
"tps_mean": 271.7264154071495
|
||||
@@ -86,10 +86,75 @@
|
||||
"quant": "BF16",
|
||||
"params_b": 120.0,
|
||||
"name_params_b": 120.0,
|
||||
"backend": "vLLM",
|
||||
"backend": "Triton",
|
||||
"error": false,
|
||||
"test": "Throughput",
|
||||
"tps_mean": 109.73523843987172
|
||||
},
|
||||
{
|
||||
"model": "Qwen/Qwen3-14B-AWQ",
|
||||
"model_clean": "Qwen/Qwen3-14B-AWQ",
|
||||
"env": "TP1",
|
||||
"gpu_config": "single",
|
||||
"quant": "AWQ",
|
||||
"params_b": 14.0,
|
||||
"name_params_b": 14.0,
|
||||
"backend": "ROCm",
|
||||
"error": false,
|
||||
"test": "Throughput",
|
||||
"tps_mean": 118.62544339374007
|
||||
},
|
||||
{
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model_clean": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"env": "TP1",
|
||||
"gpu_config": "single",
|
||||
"quant": "BF16",
|
||||
"params_b": 8.0,
|
||||
"name_params_b": 8.0,
|
||||
"backend": "ROCm",
|
||||
"error": false,
|
||||
"test": "Throughput",
|
||||
"tps_mean": 320.4458308584372
|
||||
},
|
||||
{
|
||||
"model": "google/gemma-3-12b-it",
|
||||
"model_clean": "google/gemma-3-12b-it",
|
||||
"env": "TP1",
|
||||
"gpu_config": "single",
|
||||
"quant": "BF16",
|
||||
"params_b": 12.0,
|
||||
"name_params_b": 12.0,
|
||||
"backend": "ROCm",
|
||||
"error": false,
|
||||
"test": "Throughput",
|
||||
"tps_mean": 275.34859975563967
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-oss-20b",
|
||||
"model_clean": "openai/gpt-oss-20b",
|
||||
"env": "TP1",
|
||||
"gpu_config": "single",
|
||||
"quant": "BF16",
|
||||
"params_b": 20.0,
|
||||
"name_params_b": 20.0,
|
||||
"backend": "ROCm",
|
||||
"error": false,
|
||||
"test": "Throughput",
|
||||
"tps_mean": 318.9683005103833
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-oss-120b",
|
||||
"model_clean": "openai/gpt-oss-120b",
|
||||
"env": "TP1",
|
||||
"gpu_config": "single",
|
||||
"quant": "BF16",
|
||||
"params_b": 120.0,
|
||||
"name_params_b": 120.0,
|
||||
"backend": "ROCm",
|
||||
"error": false,
|
||||
"test": "Throughput",
|
||||
"tps_mean": 114.91339037290285
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,3 @@
|
||||
{
|
||||
"runs": []
|
||||
}
|
||||
Αναφορά σε νέο ζήτημα
Block a user