feat: Add new benchmark results for various models and configurations, and update documentation UI with filtering for attention and tensor parallelism.

Esse commit está contido em:
Donato Capitella
2026-02-02 21:30:17 +00:00
commit 4d3b046870
43 arquivos alterados com 859 adições e 361 exclusões
@@ -0,0 +1,7 @@
{
"elapsed_time": 229.17851571500069,
"num_requests": 100,
"total_num_tokens": 75285,
"requests_per_second": 0.4363410753753066,
"tokens_per_second": 328.49937859629955
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 1302.7062463890015,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.15352655332265747,
"tokens_per_second": 112.69232830266365
"elapsed_time": 899.6009820629988,
"num_requests": 100,
"total_num_tokens": 75285,
"requests_per_second": 0.11116039443473733,
"tokens_per_second": 83.68710295019198
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 442.1101265470061,
"num_requests": 100,
"total_num_tokens": 75285,
"requests_per_second": 0.2261879880043141,
"tokens_per_second": 170.28562676904787
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 639.3201232059982,
"num_requests": 100,
"total_num_tokens": 75285,
"requests_per_second": 0.15641616206061223,
"tokens_per_second": 117.75790760733192
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 577.3050836349939,
"num_requests": 100,
"total_num_tokens": 75285,
"requests_per_second": 0.1732186374842766,
"tokens_per_second": 130.40765123003763
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 865.5675225800005,
"num_requests": 100,
"total_num_tokens": 75285,
"requests_per_second": 0.115531136960788,
"tokens_per_second": 86.97761646092924
}
@@ -1,7 +0,0 @@
{
"elapsed_time": 540.2676798280002,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.37018686748700586,
"tokens_per_second": 271.7264154071495
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 764.7424081899953,
"num_requests": 100,
"total_num_tokens": 75285,
"requests_per_second": 0.13076298493329488,
"tokens_per_second": 98.44491320703105
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 1303.4944151099999,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.15343372221746138,
"tokens_per_second": 112.62418795067208
"elapsed_time": 1052.5878375879984,
"num_requests": 100,
"total_num_tokens": 75285,
"requests_per_second": 0.09500394782173208,
"tokens_per_second": 71.52372211759099
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 310.4935437940003,
"num_requests": 100,
"total_num_tokens": 76062,
"requests_per_second": 0.3220678883627477,
"tokens_per_second": 244.97127724647316
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 914.8563823220001,
"num_requests": 200,
"total_num_tokens": 148857,
"requests_per_second": 0.21861354838273012,
"tokens_per_second": 162.71078485804028
"elapsed_time": 550.0459713920009,
"num_requests": 100,
"total_num_tokens": 76062,
"requests_per_second": 0.18180298593393945,
"tokens_per_second": 138.28298716107304
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 193.03236384499905,
"num_requests": 100,
"total_num_tokens": 74504,
"requests_per_second": 0.5180478444552329,
"tokens_per_second": 385.96636603292677
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 522.8661062630126,
"num_requests": 200,
"total_num_tokens": 145877,
"requests_per_second": 0.38250710383471637,
"tokens_per_second": 278.99494393048457
"elapsed_time": 311.826995067001,
"num_requests": 100,
"total_num_tokens": 74504,
"requests_per_second": 0.3206906444341466,
"tokens_per_second": 238.92735772921657
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 639.9174838529943,
"num_requests": 100,
"total_num_tokens": 74946,
"requests_per_second": 0.15627014814143225,
"tokens_per_second": 117.11822522607781
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 1339.915984058,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.14926308990977954,
"tokens_per_second": 109.73523843987172
"elapsed_time": 1055.754198749999,
"num_requests": 100,
"total_num_tokens": 74946,
"requests_per_second": 0.09471901709545542,
"tokens_per_second": 70.98811455236003
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 271.0714236530039,
"num_requests": 100,
"total_num_tokens": 74946,
"requests_per_second": 0.36890646255655896,
"tokens_per_second": 276.48063742763867
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 468.4791132300161,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.42691337639593563,
"tokens_per_second": 313.85817605876395
"elapsed_time": 404.31172934999995,
"num_requests": 100,
"total_num_tokens": 74946,
"requests_per_second": 0.24733390782594175,
"tokens_per_second": 185.3668705592303
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 374.65702054300345,
"num_requests": 100,
"total_num_tokens": 75027,
"requests_per_second": 0.2669107864442698,
"tokens_per_second": 200.2551557455423
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 560.7857336160014,
"num_requests": 100,
"total_num_tokens": 75027,
"requests_per_second": 0.17832122681721982,
"tokens_per_second": 133.7890668441555
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 234.33056626700272,
"num_requests": 100,
"total_num_tokens": 75285,
"requests_per_second": 0.42674757114723666,
"tokens_per_second": 321.2769089381971
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 874.8529941339984,
"num_requests": 100,
"total_num_tokens": 75285,
"requests_per_second": 0.11430491827828541,
"tokens_per_second": 86.05445772580717
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 438.29837328500435,
"num_requests": 100,
"total_num_tokens": 75285,
"requests_per_second": 0.2281550790401287,
"tokens_per_second": 171.7665512553609
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 621.1952276929987,
"num_requests": 100,
"total_num_tokens": 75285,
"requests_per_second": 0.16097998751758127,
"tokens_per_second": 121.19378360261106
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 573.4174093670008,
"num_requests": 100,
"total_num_tokens": 75285,
"requests_per_second": 0.17439303091685104,
"tokens_per_second": 131.2917933257513
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 838.6815635629973,
"num_requests": 100,
"total_num_tokens": 75285,
"requests_per_second": 0.11923476602390883,
"tokens_per_second": 89.76589360109976
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 746.2110970310023,
"num_requests": 100,
"total_num_tokens": 75285,
"requests_per_second": 0.13401033621434522,
"tokens_per_second": 100.8896816189698
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 1064.0833694909998,
"num_requests": 100,
"total_num_tokens": 75285,
"requests_per_second": 0.09397759881148657,
"tokens_per_second": 70.75103526522766
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 288.17777036500047,
"num_requests": 100,
"total_num_tokens": 76062,
"requests_per_second": 0.3470080286669645,
"tokens_per_second": 263.9412467646666
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 422.3444380089968,
"num_requests": 100,
"total_num_tokens": 76062,
"requests_per_second": 0.23677356915463818,
"tokens_per_second": 180.0947121704009
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 195.17220506099693,
"num_requests": 100,
"total_num_tokens": 74504,
"requests_per_second": 0.5123680391311207,
"tokens_per_second": 381.7346838742502
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 308.955628978998,
"num_requests": 100,
"total_num_tokens": 74504,
"requests_per_second": 0.32367107319089417,
"tokens_per_second": 241.1478963701438
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 619.8506736600029,
"num_requests": 100,
"total_num_tokens": 74946,
"requests_per_second": 0.16132917854155862,
"tokens_per_second": 120.90976614975652
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 1103.3186353329984,
"num_requests": 100,
"total_num_tokens": 74946,
"requests_per_second": 0.09063564848591402,
"tokens_per_second": 67.92779311425312
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 260.26568330699956,
"num_requests": 100,
"total_num_tokens": 74946,
"requests_per_second": 0.38422276317559617,
"tokens_per_second": 287.95959208958226
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 405.08143226500033,
"num_requests": 100,
"total_num_tokens": 74946,
"requests_per_second": 0.24686394397504988,
"tokens_per_second": 185.01465145154089
}
@@ -1,7 +0,0 @@
{
"elapsed_time": 1237.550695703001,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.16160954108339642,
"tokens_per_second": 118.62544339374007
}
@@ -1,7 +0,0 @@
{
"elapsed_time": 540.6128817510034,
"num_requests": 200,
"total_num_tokens": 148857,
"requests_per_second": 0.36995048906754757,
"tokens_per_second": 275.34859975563967
}
@@ -1,7 +0,0 @@
{
"elapsed_time": 455.23138687500614,
"num_requests": 200,
"total_num_tokens": 145877,
"requests_per_second": 0.43933701797875907,
"tokens_per_second": 320.4458308584372
}
@@ -1,7 +0,0 @@
{
"elapsed_time": 1279.5375675789983,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.15630646967124087,
"tokens_per_second": 114.91339037290285
}
@@ -1,7 +0,0 @@
{
"elapsed_time": 460.97370730798866,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.43386422442175154,
"tokens_per_second": 318.9683005103833
}
+255 -238
Ver Arquivo
@@ -202,6 +202,26 @@
color: var(--primary);
}
/* Diff Styling */
.val-pos {
color: #16a34a;
font-weight: 600;
}
.val-neg {
color: #dc2626;
font-weight: 600;
}
.val-neu {
color: #9ca3af;
}
.col-diff {
background: #f9fafb;
font-size: 0.9rem;
}
/* Modal/Overlay */
#loading {
text-align: center;
@@ -433,11 +453,40 @@
</header>
<div class="controls">
<input type="text" id="searchInput" class="search" placeholder="Search models (e.g. 'llama', 'fp8')..."
autocomplete="off">
<select id="quantFilter">
<option value="">All Quantizations</option>
<input type="text" id="searchInput" class="search" placeholder="Search models..." autocomplete="off">
<select id="quantFilter" style="max-width: 150px;">
<option value="">All Quants</option>
</select>
<!-- Toggles -->
<div
style="display: flex; gap: 12px; align-items: center; border-left: 1px solid #e5e7eb; padding-left: 12px;">
<label
style="font-size: 0.9rem; font-weight: 500; display: flex; align-items: center; gap: 4px; cursor: pointer;">
<input type="checkbox" id="toggleTP1" checked> TP1
</label>
<label
style="font-size: 0.9rem; font-weight: 500; display: flex; align-items: center; gap: 4px; cursor: pointer;">
<input type="checkbox" id="toggleTP2" checked> TP2
</label>
</div>
<!-- Attention Group -->
<div
style="display: flex; align-items: center; gap: 8px; border-left: 1px solid #e5e7eb; padding-left: 12px;">
<span
style="font-size: 0.8rem; font-weight: 600; text-transform: uppercase; color: #9ca3af; letter-spacing: 0.05em;">Attention</span>
<div style="display: flex; gap: 12px;">
<label
style="font-size: 0.9rem; font-weight: 500; display: flex; align-items: center; gap: 4px; cursor: pointer;">
<input type="checkbox" id="toggleTriton" checked> Triton
</label>
<label
style="font-size: 0.9rem; font-weight: 500; display: flex; align-items: center; gap: 4px; cursor: pointer;">
<input type="checkbox" id="toggleRocm"> ROCm
</label>
</div>
</div>
</div>
<nav id="tabNav" class="tab-nav">
@@ -469,6 +518,7 @@
<!-- Modal Overlay -->
<div id="modalOverlay" class="modal-overlay">
<!-- ... modal content ... -->
<div class="modal">
<div class="modal-header">
<h3 id="modalTitle">Benchmark Info</h3>
@@ -480,116 +530,74 @@
</div>
</div>
<!-- Script Logic Updates Below -->
<script>
// Helper - Defined at top to avoid ReferenceError
const $ = id => document.getElementById(id);
// State
let rawRuns = [];
let tests = [];
let state = {
search: "",
quant: "",
activeTab: "Throughput"
activeTab: "Throughput",
showTP1: true,
showTP2: true,
showTriton: true,
showRocm: false
};
// Benchmark Metadata
// Metadata
const BENCHMARK_INFO = {
"Throughput": {
short: "Maximum raw compute capacity (Tokens/Sec).",
desc: "Measures the absolute maximum number of tokens the system can generate per second by fully saturating the GPU compute capability.",
usecase: "Demonstrates the raw horsepower and architectural efficiency of the hardware/model combo under Heavy Load. This is the theoretical speed limit of the system.",
details: "Command: `vllm bench throughput`\nParams: --num-prompts 100 --output-len 512\nMetric: Tokens per Second (higher is better).",
usecase: "Demonstrates the raw horsepower and architectural efficiency.",
details: `
**Test Configuration:**
• <b>Dataset:</b> ShareGPT (Random Sample, 100 Prompts)
• <b>Output Length:</b> 512 Tokens (Fixed)
• <b>Batch Budget:</b> 8192 - 32768 Tokens (Dynamic per model)
• <b>GPU Alloc:</b> 90% VRAM per GPU
• <b>Pipeline:</b> <code>vllm bench throughput</code> (Offline)
• <b>Cluster Config:</b> Ray Distributed (RoCE v2 RDMA, TP=2)
<b>Metric:</b> Tokens per Second (higher is better).`,
unit: " tok/s"
},
"TTFT": {
short: "Time To First Token (Response Latency).",
desc: "The 'Time To First Token' is the delay between sending a request and seeing the first character of the response.",
usecase: "<b>Responsiveness</b>. Low TTFT makes the AI feel 'snappy' and instant. High TTFT feels like the AI is ignoring you or lagging. We measure at different QPS loads to ensure the server doesn't 'choke' when busy.",
context: "<b>QPS = Queries Per Second (Traffic Load)</b>.<br>• QPS 1.0 = 1 user sending a request every second.<br>• QPS 4.0 = 4 users sending requests every second (Simulates High Load).",
details: "Command: `vllm bench serve`\nParams: --random-input-len 1024 --random-output-len 512\nMetric: Milliseconds (lower is better).",
desc: "Delay between sending a request and seeing the first character.",
usecase: "Responsiveness. Low TTFT makes the AI feel 'snappy'.",
details: "Command: `vllm bench serve`\nMetric: Milliseconds (lower is better).",
unit: " ms"
},
"TPOT": {
short: "Time Per Output Token (Streaming Speed).",
desc: "The 'Time Per Output Token' measures how fast the text generates *after* the first token appears.",
usecase: "<b>1. Fluidity</b>: Industry standard is <50ms (>20 tok/s) for a 'fluid' feeling. Slower feels laggy.<br><b>2. Bottlenecks</b>: We test at <b>QPS 4.0</b> to find memory bandwidth bottlenecks where the GPU can't keep up with multiple users.",
context: "<b>QPS = Queries Per Second (Traffic Load)</b>.<br>• QPS 1.0 = Light Load (Ideal conditions)<br>• QPS 4.0 = Heavy Load (Stress Test)",
details: "Command: `vllm bench serve`\nParams: --random-input-len 1024 --random-output-len 512\nMetric: Milliseconds (lower is better).",
desc: "Measures how fast the text generates *after* the first token.",
usecase: "Fluidity. Industry standard is <50ms (>20 tok/s).",
details: "Command: `vllm bench serve`\nMetric: Milliseconds (lower is better).",
unit: " ms"
}
};
const $ = id => document.getElementById(id);
async function init() {
try {
const res = await fetch('results.json');
const data = await res.json();
rawRuns = data.runs || [];
processData();
setupControls();
render();
populateFilters();
} catch (e) {
$('loading').textContent = "Error loading results.json: " + e.message;
console.error(e);
}
}
function processData() {
const testGroups = {};
rawRuns.forEach(run => {
if (!run.test) return;
if (!testGroups[run.test]) {
testGroups[run.test] = {
name: run.test,
models: {}
};
}
// Normalize model name
const modelName = run.model_clean || run.model;
if (!testGroups[run.test].models[modelName]) {
testGroups[run.test].models[modelName] = {
name: modelName,
quant: run.quant,
params: run.params_b || run.name_params_b,
triton: null,
rocm: null
};
}
const m = testGroups[run.test].models[modelName];
// Assign Backend value
if (run.backend === "Triton") m.triton = run.tps_mean;
if (run.backend === "ROCm") m.rocm = run.tps_mean;
});
// Convert map to array for sorting
tests = Object.values(testGroups).map(group => {
return {
name: group.name,
models: Object.values(group.models)
};
});
// Sort tests: Throughput first, then others alphabetically
tests.sort((a, b) => {
const aTp = a.name.startsWith("Throughput");
const bTp = b.name.startsWith("Throughput");
if (aTp && !bTp) return -1;
if (!aTp && bTp) return 1;
return a.name.localeCompare(b.name);
});
// Set default tab if not set
if (!state.activeTab && tests.length > 0) {
state.activeTab = tests[0].name;
}
}
function populateFilters() {
function setupControls() {
// Filters
const quants = new Set(rawRuns.map(r => r.quant).filter(Boolean));
const sel = $('quantFilter');
[...quants].sort().forEach(q => {
@@ -599,22 +607,74 @@
sel.appendChild(opt);
});
$('searchInput').addEventListener('input', e => {
state.search = e.target.value.toLowerCase();
render();
});
$('searchInput').addEventListener('input', e => { state.search = e.target.value.toLowerCase(); render(); });
sel.addEventListener('change', e => { state.quant = e.target.value; render(); });
sel.addEventListener('change', e => {
state.quant = e.target.value;
render();
});
// Toggles
$('toggleTP1').addEventListener('change', e => { state.showTP1 = e.target.checked; render(); });
$('toggleTP2').addEventListener('change', e => { state.showTP2 = e.target.checked; render(); });
$('toggleTriton').addEventListener('change', e => { state.showTriton = e.target.checked; render(); });
$('toggleRocm').addEventListener('change', e => { state.showRocm = e.target.checked; render(); });
}
function getBenchmarkMeta(testName) {
if (testName.includes("Throughput")) return BENCHMARK_INFO["Throughput"];
if (testName.includes("TTFT")) return BENCHMARK_INFO["TTFT"];
if (testName.includes("TPOT")) return BENCHMARK_INFO["TPOT"];
return null;
/**
 * Group the flat `rawRuns` list into the `tests` array used for rendering.
 *
 * Produces one entry per distinct `run.test` value (one tab each); every entry
 * holds one row per model, and every row keeps its measurements in
 * `results[tp][backend]` so TP1/TP2 and Triton/ROCm can be shown side by side.
 *
 * Reads the file-level `rawRuns`; writes the file-level `tests` and
 * `state.activeTab`.
 */
function processData() {
    const testGroups = {};

    rawRuns.forEach(run => {
        // Skip malformed entries: a run without a test name would create a
        // group whose name is undefined and make the sort below throw.
        if (!run || !run.test) return;

        const testName = run.test;
        if (!testGroups[testName]) {
            testGroups[testName] = { name: testName, models: {} };
        }

        // Prefer the normalized model name when the run provides one.
        const modelName = run.model_clean || run.model;
        if (!testGroups[testName].models[modelName]) {
            testGroups[testName].models[modelName] = {
                name: modelName,
                quant: run.quant,
                params: run.params_b || run.name_params_b,
                results: {
                    1: { triton: null, rocm: null },
                    2: { triton: null, rocm: null }
                }
            };
        }

        const m = testGroups[testName].models[modelName];

        // Runs that carry no `tp` field are treated as TP1.
        const tp = run.tp || 1;
        if (!m.results[tp]) m.results[tp] = { triton: null, rocm: null };

        // `tps_mean` is the value field for every run, whatever the test.
        if (run.backend === "Triton") m.results[tp].triton = run.tps_mean;
        if (run.backend === "ROCm") m.results[tp].rocm = run.tps_mean;
    });

    tests = Object.values(testGroups).map(g => ({
        name: g.name,
        models: Object.values(g.models)
    }));

    // Throughput tabs come first; everything else is ordered alphabetically.
    tests.sort((a, b) => {
        const aTp = a.name.includes("Throughput");
        const bTp = b.name.includes("Throughput");
        if (aTp && !bTp) return -1;
        if (!aTp && bTp) return 1;
        return a.name.localeCompare(b.name);
    });

    // Select the first tab so the active tab always refers to an existing test.
    if (tests.length > 0) state.activeTab = tests[0].name;
}
/**
 * Build the HTML snippet shown in a data cell for one measurement.
 *
 * @param {?number} v     The measurement; null/undefined means "no value".
 * @param {string}  unit  Unit suffix appended after the number (e.g. " ms").
 * @returns {string} HTML markup: a dash for a missing value, a red "X" for
 *                   exactly 0, otherwise the number with two decimals and the unit.
 */
function formatVal(v, unit) {
    const isMissing = v === null || v === undefined;
    if (isMissing) {
        return '<span class="val-na">-</span>';
    }

    if (v === 0) {
        return '<span class="val-na" style="color:#ef4444;font-weight:bold;">X</span>';
    }

    const number = v.toFixed(2);
    const unitHtml = `<span style="font-size:0.75em; color:#9ca3af; margin-left:2px;">${unit}</span>`;
    return `<span class="val">${number}${unitHtml}</span>`;
}
/**
 * Look up the BENCHMARK_INFO entry that describes a test.
 *
 * The match is by substring, so a name such as "TTFT (QPS 1.0)" resolves to
 * the "TTFT" entry. Keys are checked in the order Throughput, TTFT, TPOT.
 *
 * @param {string} name  Test (tab) name.
 * @returns {object} The matching BENCHMARK_INFO entry, or a blank placeholder
 *                   with every text field set to "" so callers can interpolate
 *                   any of them without printing "undefined".
 */
function getMeta(name) {
    // Tolerate a missing or non-string name instead of throwing on .includes().
    const label = String(name ?? "");

    for (const key of ["Throughput", "TTFT", "TPOT"]) {
        if (label.includes(key)) return BENCHMARK_INFO[key];
    }

    // Includes usecase/details as well: the info modal renders both fields.
    return { short: "", desc: "", usecase: "", details: "", unit: "" };
}
function render() {
@@ -627,208 +687,165 @@
const btn = document.createElement('button');
btn.className = `tab-btn ${test.name === state.activeTab ? 'active' : ''}`;
btn.textContent = test.name;
btn.onclick = () => {
state.activeTab = test.name;
render();
};
btn.onclick = () => { state.activeTab = test.name; render(); };
tabNav.appendChild(btn);
});
// Ensure active tab exists (if search filtered it out logic?)
// Actually tabs are based on 'tests' which is processed from raw data, so they exist regardless of filters unless we want to hide tabs with no results.
// For now, let's keep tabs static based on available data types.
container.innerHTML = "";
// Find active test
const activeTest = tests.find(t => t.name === state.activeTab);
if (!activeTest) {
// If invalid tab (e.g. on first load if default doesn't exist), switch to first
if (tests.length > 0) {
state.activeTab = tests[0].name;
// Re-render immediately
setTimeout(render, 0);
}
container.innerHTML = '<div id="loading">No data available.</div>';
container.innerHTML = '<div id="loading">No Data</div>';
return;
}
// Render Active Tab Content
const test = activeTest;
// Cluster Info Box Logic
// If test name implies Tensor Parallelism > 1 (e.g. "Cluster", "TP=2", etc.)
// We default to checking if it's the "Throughput (Cluster)" tab or similar
if (test.name.toLowerCase().includes("tp=2") || test.name.toLowerCase().includes("cluster")) {
// Simplified Info Box logic
if (state.showTP2) {
const infoBox = document.createElement('div');
infoBox.className = 'info-box';
infoBox.style.cssText = "background:#f8fafc; border:1px solid #e2e8f0; border-radius:6px; padding:10px 16px; margin-bottom:20px; font-size:0.9rem; color:#64748b; display:flex; justify-content:space-between; align-items:center;";
infoBox.innerHTML = `
<div style="font-size:1.2rem;">️</div>
<div>
<div style="font-weight:600; margin-bottom:4px;">Distributed Cluster (Tensor Parallelism = 2)</div>
This benchmark runs on <b>2x Strix Halo nodes</b> connected via <b>Low-Latency RDMA (RoCE v2)</b>.
The model is split across both APUs, effectively using 256GB of Unified Memory.
<br><br>
<a href="https://github.com/kyuz0/amd-strix-halo-vllm-toolboxes/blob/main/rdma_cluster/setup_guide.md" target="_blank">View Cluster Setup Guide &rarr;</a>
</div>
<span><b>TP2</b> = Distributed Cluster (2x Strix Halo, RDMA RoCE v2).</span>
<a href="https://github.com/kyuz0/amd-strix-halo-vllm-toolboxes/blob/main/rdma_cluster/setup_guide.md" target="_blank" style="color:#3b82f6; text-decoration:none; font-weight:500;">Cluster Setup Guide &rarr;</a>
`;
container.appendChild(infoBox);
}
// Filter models within this test
const models = test.models.filter(m => {
const s = state.search;
const matchSearch = !s || m.name.toLowerCase().includes(s);
const q = state.quant;
const matchQuant = !q || m.quant === q;
return matchSearch && matchQuant;
});
if (models.length === 0) {
container.innerHTML += '<div id="loading">No models match current filters in this category.</div>';
return;
}
// Sorting models by size (small to large), then name
models.sort((a, b) => {
const pA = parseFloat(a.params) || 0;
const pB = parseFloat(b.params) || 0;
if (pA !== pB) return pA - pB;
return a.name.localeCompare(b.name);
});
// Models Filter & Sort
const models = activeTest.models.filter(m => {
const matchS = !state.search || m.name.toLowerCase().includes(state.search);
const matchQ = !state.quant || m.quant === state.quant;
return matchS && matchQ;
}).sort((a, b) => (parseFloat(a.params) || 0) - (parseFloat(b.params) || 0) || a.name.localeCompare(b.name));
// Create Table
const card = document.createElement('div');
card.className = "section-card";
// Metadata resolution
const meta = getBenchmarkMeta(test.name);
const shortDesc = meta ? `<span class="section-desc">${meta.short}</span>` : "";
const helpBtn = meta ? `<button class="btn-help" onclick="openModal('${test.name}')">?</button>` : "";
const meta = getMeta(activeTest.name);
const unit = meta.unit || "";
// Header
const header = document.createElement('div');
header.className = "section-header";
header.innerHTML = `
<div class="section-title-row">
<h2>${test.name}</h2>
${helpBtn}
</div>
${shortDesc}
`;
<div class="section-title-row">
<h2>${activeTest.name}</h2>
<button class="btn-help" onclick="openModal('${activeTest.name}')">?</button>
</div>
<span class="section-desc">${meta.short}</span>
`;
card.appendChild(header);
// Table
const tableResp = document.createElement('div');
tableResp.className = "table-responsive";
const table = document.createElement('table');
const thead = document.createElement('thead');
thead.innerHTML = `
<tr>
<th class="col-model">Model</th>
<th class="col-data">Triton Attention</th>
<th class="col-data">ROCm Attention</th>
</tr>
`;
table.appendChild(thead);
// Build Dynamic Columns
let cols = [];
if (state.showTP1) {
if (state.showTriton) cols.push({ id: "tp1_triton", label: "TP1 Triton" });
if (state.showRocm) cols.push({ id: "tp1_rocm", label: "TP1 ROCm" });
}
if (state.showTP2) {
if (state.showTriton) cols.push({ id: "tp2_triton", label: "TP2 Triton" });
if (state.showRocm) cols.push({ id: "tp2_rocm", label: "TP2 ROCm" });
}
// Thead
let theadHtml = `<thead><tr><th class="col-model">Model</th>`;
cols.forEach(c => {
// Style differentiation for TP2
const style = c.id.startsWith("tp2") ? "background:#eff6ff; color:#1e40af;" : "";
theadHtml += `<th class="col-data" style="${style}">${c.label}</th>`;
});
// Diff Column Header
if (cols.length === 2) {
theadHtml += `<th class="col-data col-diff">Diff</th>`;
}
theadHtml += `</tr></thead>`;
table.innerHTML = theadHtml;
// Tbody
const tbody = document.createElement('tbody');
models.forEach(m => {
const tr = document.createElement('tr');
// Meta tags
// Model Name Cell
let metaHtml = "";
if (m.quant) metaHtml += `<span class="tag">${m.quant}</span>`;
if (m.params) metaHtml += `<span class="tag">${m.params}B</span>`;
// Values
// Pass unit from meta
const unit = meta ? meta.unit : "";
const val1 = formatVal(m.triton, unit);
let rowHtml = `
<td>
<div class="model-cell">
<a href="https://huggingface.co/${m.name}" target="_blank" class="model-name" style="text-decoration:none;color:inherit;">${m.name}</a>
<div class="model-meta">${metaHtml}</div>
</div>
</td>
`;
// Special handling for ROCm column where we want 'X' for crashes/missing if Triton has data
let val2;
if ((m.rocm === null || m.rocm === 0) && m.triton > 0) {
val2 = '<span class="val-na" style="color: #ef4444; font-weight:bold;">X</span>';
} else {
val2 = formatVal(m.rocm, unit);
// Data Cells
cols.forEach(c => {
let val = null;
if (c.id === "tp1_triton") val = m.results[1]?.triton;
if (c.id === "tp1_rocm") val = m.results[1]?.rocm;
if (c.id === "tp2_triton") val = m.results[2]?.triton;
if (c.id === "tp2_rocm") val = m.results[2]?.rocm;
const bg = c.id.startsWith("tp2") ? 'style="background:#fbfdff;"' : "";
rowHtml += `<td class="col-data" ${bg}>${formatVal(val, unit)}</td>`;
});
// Diff Column Data
if (cols.length === 2) {
const v1 = getVal(m, cols[0].id);
const v2 = getVal(m, cols[1].id);
rowHtml += `<td class="col-data col-diff">${formatDiff(v1, v2)}</td>`;
}
tr.innerHTML = `
<td>
<div class="model-cell">
<a href="https://huggingface.co/${m.name}" target="_blank" class="model-name" style="text-decoration: none; color: inherit; border-bottom: 1px dotted #ccc;">${m.name}</a>
<div class="model-meta">${metaHtml}</div>
</div>
</td>
<td class="col-data">${val1}</td>
<td class="col-data">${val2}</td>
`;
tr.innerHTML = rowHtml;
tbody.appendChild(tr);
});
table.appendChild(tbody);
tableResp.appendChild(table);
card.appendChild(tableResp);
container.appendChild(card);
}
function formatVal(v, unit) {
if (v === null || v === undefined) return '<span class="val-na">N/A</span>';
if (v === 0) return '<span class="val-na">FAIL</span>';
return `<span class="val">${v.toFixed(2)}<span style="font-size:0.8em; color:#888;">${unit}</span></span>`;
/**
 * Fetch the measurement behind a table column for one model row.
 *
 * @param {object} m      Model row; measurements are read from `m.results[tp]`.
 * @param {string} colId  Column id: "tp1_triton", "tp1_rocm", "tp2_triton" or "tp2_rocm".
 * @returns {*} The stored value, undefined when that TP bucket is absent,
 *              or null for an unrecognised column id.
 */
function getVal(m, colId) {
    switch (colId) {
        case "tp1_triton":
            return m.results[1]?.triton;
        case "tp1_rocm":
            return m.results[1]?.rocm;
        case "tp2_triton":
            return m.results[2]?.triton;
        case "tp2_rocm":
            return m.results[2]?.rocm;
        default:
            return null;
    }
}
// Modal Logic
function openModal(testName) {
const meta = getBenchmarkMeta(testName);
if (!meta) return;
/**
 * Format the relative change from v1 to v2 as a coloured percentage.
 *
 * @param {?number} v1  Baseline value (first visible column).
 * @param {?number} v2  Compared value (second visible column).
 * @returns {string} HTML markup: a dash when either value is missing or the
 *                   baseline is 0; otherwise the change with one decimal,
 *                   classed "val-pos" above +0.5%, "val-neg" below -0.5%,
 *                   and "val-neu" in between.
 */
function formatDiff(v1, v2) {
    // `== null` matches both null and undefined; a zero baseline cannot be divided by.
    if (v1 == null || v2 == null || v1 === 0) return '<span class="val-na">-</span>';

    const diff = ((v2 - v1) / v1) * 100;
    const cls = diff > 0.5 ? "val-pos" : (diff < -0.5 ? "val-neg" : "val-neu");

    // Take the sign from the rounded value: toFixed() on a tiny difference
    // yields "0.0" or "-0.0", which previously showed as "+0.0%" / "-0.0%".
    const rounded = Number(diff.toFixed(1));
    let text;
    if (rounded === 0) {
        text = "0.0";
    } else {
        text = (rounded > 0 ? "+" : "") + rounded.toFixed(1);
    }

    return `<span class="${cls}">${text}%</span>`;
}
$('modalTitle').textContent = testName;
let content = `
<div class="modal-section">
<h4>What is this?</h4>
<p>${meta.desc}</p>
</div>
<div class="modal-section">
<h4>Why it matters?</h4>
<p>${meta.usecase}</p>
</div>`;
if (meta.context) {
content += `
<div class="modal-section">
<h4>Terminology</h4>
<p>${meta.context}</p>
</div>`;
}
content += `
<div class="modal-section">
<h4>Technical Details</h4>
<div class="code-block">${meta.details}</div>
</div>
`;
$('modalContent').innerHTML = content;
// --- Basic Modal Implementation ---
function openModal(name) {
const m = getMeta(name);
$('modalTitle').textContent = name;
$('modalContent').innerHTML = `
<div class="modal-section"><h4>About</h4><p>${m.desc}</p></div>
<div class="modal-section"><h4>Usage</h4><p>${m.usecase}</p></div>
<div class="modal-section"><h4>Details</h4><div class="code-block">${m.details}</div></div>
`;
$('modalOverlay').classList.add('active');
}
function closeModal() {
$('modalOverlay').classList.remove('active');
}
// Close on click outside
$('modalOverlay').addEventListener('click', e => {
if (e.target === $('modalOverlay')) closeModal();
});
// Close on Escape
document.addEventListener('keydown', e => {
if (e.key === "Escape") closeModal();
});
function closeModal() { $('modalOverlay').classList.remove('active'); }
$('modalOverlay').addEventListener('click', e => { if (e.target === $('modalOverlay')) closeModal(); });
document.addEventListener('keydown', e => { if (e.key === "Escape") closeModal(); });
init();
</script>
+10 -3
Ver Arquivo
@@ -71,6 +71,10 @@ def parse_logs():
model_display = model_part.replace("_", "/", 1)
else:
model_display = model_part
# Normalize: Remove _cluster suffix if present so grouping works
if model_display.endswith("_cluster"):
model_display = model_display[:-8]
params_b, quant = extract_meta(model_display)
@@ -89,7 +93,8 @@ def parse_logs():
if "throughput" in fname:
tps = data.get("tokens_per_second", 0)
run = base_run.copy()
run["test"] = f"Throughput (TP{tp})"
run["test"] = "Throughput"
run["tp"] = tp
run["tps_mean"] = tps
if tps == 0 or (isinstance(data, dict) and "error" in str(data).lower()): # checking if error string is in json dump
run["error"] = True
@@ -111,13 +116,15 @@ def parse_logs():
# TTFT
r1 = base_run.copy()
r1["test"] = f"TTFT (TP{tp}) @ QPS {qps}"
r1["test"] = f"TTFT (QPS {qps})"
r1["tp"] = tp
r1["tps_mean"] = ttft
runs.append(r1)
# TPOT
r2 = base_run.copy()
r2["test"] = f"TPOT (TP{tp}) @ QPS {qps}"
r2["test"] = f"TPOT (QPS {qps})"
r2["tp"] = tp
r2["tps_mean"] = tpot
runs.append(r2)
+368 -48
Ver Arquivo
@@ -1,18 +1,5 @@
{
"runs": [
{
"model": "Qwen/Qwen3-14B-AWQ",
"model_clean": "Qwen/Qwen3-14B-AWQ",
"env": "TP1",
"gpu_config": "single",
"quant": "AWQ",
"params_b": 14.0,
"name_params_b": 14.0,
"backend": "Triton",
"error": false,
"test": "Throughput",
"tps_mean": 112.69232830266365
},
{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model_clean": "meta-llama/Meta-Llama-3.1-8B-Instruct",
@@ -24,7 +11,8 @@
"backend": "Triton",
"error": false,
"test": "Throughput",
"tps_mean": 278.99494393048457
"tp": 1,
"tps_mean": 238.92735772921657
},
{
"model": "google/gemma-3-12b-it",
@@ -37,7 +25,92 @@
"backend": "Triton",
"error": false,
"test": "Throughput",
"tps_mean": 162.71078485804028
"tp": 1,
"tps_mean": 138.28298716107304
},
{
"model": "Qwen/Qwen3-14B-AWQ",
"model_clean": "Qwen/Qwen3-14B-AWQ",
"env": "TP1",
"gpu_config": "single",
"quant": "AWQ",
"params_b": 14.0,
"name_params_b": 14.0,
"backend": "Triton",
"error": false,
"test": "Throughput",
"tp": 1,
"tps_mean": 83.68710295019198
},
{
"model": "openai/gpt-oss-20b",
"model_clean": "openai/gpt-oss-20b",
"env": "TP1",
"gpu_config": "single",
"quant": "BF16",
"params_b": 20.0,
"name_params_b": 20.0,
"backend": "Triton",
"error": false,
"test": "Throughput",
"tp": 1,
"tps_mean": 185.3668705592303
},
{
"model": "openai/gpt-oss-120b",
"model_clean": "openai/gpt-oss-120b",
"env": "TP1",
"gpu_config": "single",
"quant": "BF16",
"params_b": 120.0,
"name_params_b": 120.0,
"backend": "Triton",
"error": false,
"test": "Throughput",
"tp": 1,
"tps_mean": 70.98811455236003
},
{
"model": "zai-org/GLM-4.7-Flash",
"model_clean": "zai-org/GLM-4.7-Flash",
"env": "TP1",
"gpu_config": "single",
"quant": "BF16",
"params_b": null,
"name_params_b": null,
"backend": "Triton",
"error": false,
"test": "Throughput",
"tp": 1,
"tps_mean": 133.7890668441555
},
{
"model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
"model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
"env": "TP1",
"gpu_config": "single",
"quant": "GPTQ",
"params_b": 30.0,
"name_params_b": 30.0,
"backend": "Triton",
"error": false,
"test": "Throughput",
"tp": 1,
"tps_mean": 117.75790760733192
},
{
"model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
"model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
"env": "TP1",
"gpu_config": "single",
"quant": "GPTQ",
"params_b": 30.0,
"name_params_b": 30.0,
"backend": "Triton",
"error": false,
"test": "Throughput",
"tp": 1,
"tps_mean": 86.97761646092924
},
{
"model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
@@ -50,59 +123,134 @@
"backend": "Triton",
"error": false,
"test": "Throughput",
"tps_mean": 112.62418795067208
"tp": 1,
"tps_mean": 71.52372211759099
},
{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model_clean": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"env": "TP2",
"gpu_config": "dual",
"quant": "BF16",
"params_b": 8.0,
"name_params_b": 8.0,
"backend": "Triton",
"error": false,
"test": "Throughput",
"tp": 2,
"tps_mean": 385.96636603292677
},
{
"model": "google/gemma-3-12b-it",
"model_clean": "google/gemma-3-12b-it",
"env": "TP2",
"gpu_config": "dual",
"quant": "BF16",
"params_b": 12.0,
"name_params_b": 12.0,
"backend": "Triton",
"error": false,
"test": "Throughput",
"tp": 2,
"tps_mean": 244.97127724647316
},
{
"model": "Qwen/Qwen3-14B-AWQ",
"model_clean": "Qwen/Qwen3-14B-AWQ",
"env": "TP2",
"gpu_config": "dual",
"quant": "AWQ",
"params_b": 14.0,
"name_params_b": 14.0,
"backend": "Triton",
"error": false,
"test": "Throughput",
"tp": 2,
"tps_mean": 328.49937859629955
},
{
"model": "openai/gpt-oss-20b",
"model_clean": "openai/gpt-oss-20b",
"env": "TP1",
"gpu_config": "single",
"env": "TP2",
"gpu_config": "dual",
"quant": "BF16",
"params_b": 20.0,
"name_params_b": 20.0,
"backend": "Triton",
"error": false,
"test": "Throughput",
"tps_mean": 313.85817605876395
},
{
"model": "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit",
"model_clean": "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit",
"env": "TP1",
"gpu_config": "single",
"quant": "GPTQ",
"params_b": 30.0,
"name_params_b": 30.0,
"backend": "Triton",
"error": false,
"test": "Throughput",
"tps_mean": 271.7264154071495
"tp": 2,
"tps_mean": 276.48063742763867
},
{
"model": "openai/gpt-oss-120b",
"model_clean": "openai/gpt-oss-120b",
"env": "TP1",
"gpu_config": "single",
"env": "TP2",
"gpu_config": "dual",
"quant": "BF16",
"params_b": 120.0,
"name_params_b": 120.0,
"backend": "Triton",
"error": false,
"test": "Throughput",
"tps_mean": 109.73523843987172
"tp": 2,
"tps_mean": 117.11822522607781
},
{
"model": "Qwen/Qwen3-14B-AWQ",
"model_clean": "Qwen/Qwen3-14B-AWQ",
"env": "TP1",
"gpu_config": "single",
"quant": "AWQ",
"params_b": 14.0,
"name_params_b": 14.0,
"backend": "ROCm",
"model": "zai-org/GLM-4.7-Flash",
"model_clean": "zai-org/GLM-4.7-Flash",
"env": "TP2",
"gpu_config": "dual",
"quant": "BF16",
"params_b": null,
"name_params_b": null,
"backend": "Triton",
"error": false,
"test": "Throughput",
"tps_mean": 118.62544339374007
"tp": 2,
"tps_mean": 200.2551557455423
},
{
"model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
"model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
"env": "TP2",
"gpu_config": "dual",
"quant": "GPTQ",
"params_b": 30.0,
"name_params_b": 30.0,
"backend": "Triton",
"error": false,
"test": "Throughput",
"tp": 2,
"tps_mean": 170.28562676904787
},
{
"model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
"model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
"env": "TP2",
"gpu_config": "dual",
"quant": "GPTQ",
"params_b": 30.0,
"name_params_b": 30.0,
"backend": "Triton",
"error": false,
"test": "Throughput",
"tp": 2,
"tps_mean": 130.40765123003763
},
{
"model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
"model_clean": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
"env": "TP2",
"gpu_config": "dual",
"quant": "GPTQ",
"params_b": 80.0,
"name_params_b": 80.0,
"backend": "Triton",
"error": false,
"test": "Throughput",
"tp": 2,
"tps_mean": 98.44491320703105
},
{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
@@ -115,7 +263,8 @@
"backend": "ROCm",
"error": false,
"test": "Throughput",
"tps_mean": 320.4458308584372
"tp": 1,
"tps_mean": 241.1478963701438
},
{
"model": "google/gemma-3-12b-it",
@@ -128,7 +277,22 @@
"backend": "ROCm",
"error": false,
"test": "Throughput",
"tps_mean": 275.34859975563967
"tp": 1,
"tps_mean": 180.0947121704009
},
{
"model": "Qwen/Qwen3-14B-AWQ",
"model_clean": "Qwen/Qwen3-14B-AWQ",
"env": "TP1",
"gpu_config": "single",
"quant": "AWQ",
"params_b": 14.0,
"name_params_b": 14.0,
"backend": "ROCm",
"error": false,
"test": "Throughput",
"tp": 1,
"tps_mean": 86.05445772580717
},
{
"model": "openai/gpt-oss-20b",
@@ -141,7 +305,8 @@
"backend": "ROCm",
"error": false,
"test": "Throughput",
"tps_mean": 318.9683005103833
"tp": 1,
"tps_mean": 185.01465145154089
},
{
"model": "openai/gpt-oss-120b",
@@ -154,7 +319,162 @@
"backend": "ROCm",
"error": false,
"test": "Throughput",
"tps_mean": 114.91339037290285
"tp": 1,
"tps_mean": 67.92779311425312
},
{
"model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
"model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
"env": "TP1",
"gpu_config": "single",
"quant": "GPTQ",
"params_b": 30.0,
"name_params_b": 30.0,
"backend": "ROCm",
"error": false,
"test": "Throughput",
"tp": 1,
"tps_mean": 121.19378360261106
},
{
"model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
"model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
"env": "TP1",
"gpu_config": "single",
"quant": "GPTQ",
"params_b": 30.0,
"name_params_b": 30.0,
"backend": "ROCm",
"error": false,
"test": "Throughput",
"tp": 1,
"tps_mean": 89.76589360109976
},
{
"model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
"model_clean": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
"env": "TP1",
"gpu_config": "single",
"quant": "GPTQ",
"params_b": 80.0,
"name_params_b": 80.0,
"backend": "ROCm",
"error": false,
"test": "Throughput",
"tp": 1,
"tps_mean": 70.75103526522766
},
{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model_clean": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"env": "TP2",
"gpu_config": "dual",
"quant": "BF16",
"params_b": 8.0,
"name_params_b": 8.0,
"backend": "ROCm",
"error": false,
"test": "Throughput",
"tp": 2,
"tps_mean": 381.7346838742502
},
{
"model": "google/gemma-3-12b-it",
"model_clean": "google/gemma-3-12b-it",
"env": "TP2",
"gpu_config": "dual",
"quant": "BF16",
"params_b": 12.0,
"name_params_b": 12.0,
"backend": "ROCm",
"error": false,
"test": "Throughput",
"tp": 2,
"tps_mean": 263.9412467646666
},
{
"model": "Qwen/Qwen3-14B-AWQ",
"model_clean": "Qwen/Qwen3-14B-AWQ",
"env": "TP2",
"gpu_config": "dual",
"quant": "AWQ",
"params_b": 14.0,
"name_params_b": 14.0,
"backend": "ROCm",
"error": false,
"test": "Throughput",
"tp": 2,
"tps_mean": 321.2769089381971
},
{
"model": "openai/gpt-oss-20b",
"model_clean": "openai/gpt-oss-20b",
"env": "TP2",
"gpu_config": "dual",
"quant": "BF16",
"params_b": 20.0,
"name_params_b": 20.0,
"backend": "ROCm",
"error": false,
"test": "Throughput",
"tp": 2,
"tps_mean": 287.95959208958226
},
{
"model": "openai/gpt-oss-120b",
"model_clean": "openai/gpt-oss-120b",
"env": "TP2",
"gpu_config": "dual",
"quant": "BF16",
"params_b": 120.0,
"name_params_b": 120.0,
"backend": "ROCm",
"error": false,
"test": "Throughput",
"tp": 2,
"tps_mean": 120.90976614975652
},
{
"model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
"model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
"env": "TP2",
"gpu_config": "dual",
"quant": "GPTQ",
"params_b": 30.0,
"name_params_b": 30.0,
"backend": "ROCm",
"error": false,
"test": "Throughput",
"tp": 2,
"tps_mean": 171.7665512553609
},
{
"model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
"model_clean": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
"env": "TP2",
"gpu_config": "dual",
"quant": "GPTQ",
"params_b": 30.0,
"name_params_b": 30.0,
"backend": "ROCm",
"error": false,
"test": "Throughput",
"tp": 2,
"tps_mean": 131.2917933257513
},
{
"model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
"model_clean": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
"env": "TP2",
"gpu_config": "dual",
"quant": "GPTQ",
"params_b": 80.0,
"name_params_b": 80.0,
"backend": "ROCm",
"error": false,
"test": "Throughput",
"tp": 2,
"tps_mean": 100.8896816189698
}
]
}