feat: Add script to automate README benchmark table generation and update max context benchmarks with new models and a kernel parameter change.
이 커밋은 다음에 포함됨:
@@ -36,10 +36,12 @@ View full benchmarks at: [https://kyuz0.github.io/amd-strix-halo-vllm-toolboxes/
|
|||||||
| **`meta-llama/Meta-Llama-3.1-8B-Instruct`** | 1 | 128k (0.95) | 128k (0.95) | 128k (0.95) | 128k (0.95) |
|
| **`meta-llama/Meta-Llama-3.1-8B-Instruct`** | 1 | 128k (0.95) | 128k (0.95) | 128k (0.95) | 128k (0.95) |
|
||||||
| **`google/gemma-3-12b-it`** | 1 | 128k (0.95) | 128k (0.95) | 128k (0.95) | 128k (0.95) |
|
| **`google/gemma-3-12b-it`** | 1 | 128k (0.95) | 128k (0.95) | 128k (0.95) | 128k (0.95) |
|
||||||
| **`openai/gpt-oss-20b`** | 1 | 128k (0.95) | 128k (0.95) | 128k (0.95) | 128k (0.95) |
|
| **`openai/gpt-oss-20b`** | 1 | 128k (0.95) | 128k (0.95) | 128k (0.95) | 128k (0.95) |
|
||||||
| **`Qwen/Qwen3-14B-AWQ`** | 1 | 40k (0.90) | 40k (0.90) | 40k (0.90) | 40k (0.90) |
|
| **`Qwen/Qwen3-14B-AWQ`** | 1 | 40k (0.95) | 40k (0.95) | 40k (0.95) | 40k (0.95) |
|
||||||
| **`cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit`** | 1 | 256k (0.95) | 204k (0.90) | - | - |
|
| **`btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit`** | 1 | 256k (0.95) | 256k (0.95) | 256k (0.95) | 256k (0.95) |
|
||||||
| **`dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16`** | 1 | 256k (0.90) | - | - | - |
|
| **`btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit`** | 1 | 256k (0.95) | 256k (0.95) | 256k (0.95) | 256k (0.95) |
|
||||||
|
| **`dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16`** | 1 | 256k (0.95) | 256k (0.95) | 256k (0.95) | 256k (0.95) |
|
||||||
| **`openai/gpt-oss-120b`** | 1 | 128k (0.95) | 128k (0.95) | 128k (0.95) | 128k (0.95) |
|
| **`openai/gpt-oss-120b`** | 1 | 128k (0.95) | 128k (0.95) | 128k (0.95) | 128k (0.95) |
|
||||||
|
| **`zai-org/GLM-4.7-Flash`** | 1 | 198k (0.95) | 198k (0.95) | 198k (0.95) | 198k (0.95) |
|
||||||
|
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -184,7 +186,7 @@ amd_iommu=pt amdgpu.gttsize=126976 ttm.pages_limit=32505856
|
|||||||
|
|
||||||
| Parameter | Purpose |
|
| Parameter | Purpose |
|
||||||
|-----------------------------|--------------------------------------------------------------------------------------------|
|
|-----------------------------|--------------------------------------------------------------------------------------------|
|
||||||
| `amd_iommu=pt` | Sets IOMMU to pass-through mode; reduces DMA overhead for better performance |
|
| `amd_iommu=off` | Disables AMD IOMMU to reduce overhead for better performance |
|
||||||
| `amdgpu.gttsize=126976` | Caps GPU unified memory to 124 GiB; 126976 MiB ÷ 1024 = 124 GiB |
|
| `amdgpu.gttsize=126976` | Caps GPU unified memory to 124 GiB; 126976 MiB ÷ 1024 = 124 GiB |
|
||||||
| `ttm.pages_limit=32505856` | Caps pinned memory to 124 GiB; 32505856 × 4 KiB = 126976 MiB = 124 GiB |
|
| `ttm.pages_limit=32505856` | Caps pinned memory to 124 GiB; 32505856 × 4 KiB = 126976 MiB = 124 GiB |
|
||||||
|
|
||||||
|
|||||||
+83
@@ -0,0 +1,83 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Config
# Benchmark results JSON, resolved relative to the parent of this script's
# directory.
RESULTS_FILE = Path(__file__).parent.parent / "benchmarks/max_context_results.json"

# Models to render, in the order their rows appear in the table. main() skips
# any model in the results file that is not listed here.
ORDER = [
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "google/gemma-3-12b-it",
    "openai/gpt-oss-20b",
    "Qwen/Qwen3-14B-AWQ",
    "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
    "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
    "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
    "openai/gpt-oss-120b",
    "zai-org/GLM-4.7-Flash",
]
|
||||||
|
|
||||||
|
def format_tokens(n):
    """Return a compact label for a token count.

    Counts of 1024 or more are shown as whole multiples of 1024 with a
    ``k`` suffix, truncated rather than rounded (131072 -> ``"128k"``,
    1500 -> ``"1k"``). Smaller counts are returned as ``str(n)``.
    """
    if n >= 1024:
        # Floor division keeps integer inputs exact; int() drops the ".0"
        # that floor division would leave on a float input.
        return f"{int(n // 1024)}k"
    return str(n)
|
||||||
|
|
||||||
|
def main():
    """Print the max-context benchmark results as a Markdown table.

    Loads the JSON list stored at ``RESULTS_FILE``, keeps the entries whose
    ``status`` is ``"success"``, and prints one row per (model, ``tp``) pair
    for every model named in ``ORDER``, in that order. The four value
    columns are looked up by the entry's ``max_seqs`` (1, 4, 8, 16); each
    cell shows the formatted ``max_context_1_user`` followed by ``util`` in
    parentheses, or ``-`` when no successful entry exists.

    If ``RESULTS_FILE`` does not exist, an error message is written to
    stderr and the function returns without printing a table.
    """
    import sys  # local import: only needed for the error path below

    if not RESULTS_FILE.exists():
        # Keep diagnostics off stdout so redirected output only ever
        # contains the table itself.
        print(f"Error: {RESULTS_FILE} not found.", file=sys.stderr)
        return

    with open(RESULTS_FILE, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Organize data: model -> tp -> max_seqs -> (ctx, util)
    models = {}

    for entry in data:
        if entry["status"] != "success":
            continue

        model = entry["model"]
        tp = entry["tp"]
        seqs = entry["max_seqs"]
        util = float(entry["util"])
        ctx = entry["max_context_1_user"]

        # If several successful entries share the same (model, tp, max_seqs)
        # key, the one that appears last in the file wins.
        # NOTE(review): this assumes the results file holds one best entry
        # per key - confirm against the script that produces it.
        models.setdefault(model, {}).setdefault(tp, {})[seqs] = (ctx, util)

    # Generate Table
    print("| Model | TP | 1 Req | 4 Reqs | 8 Reqs | 16 Reqs |")
    print("| :--- | :--- | :--- | :--- | :--- | :--- |")

    for model_name in ORDER:
        if model_name not in models:
            # No successful result recorded under this exact name.
            continue

        for tp in sorted(models[model_name]):
            row = [f"**`{model_name}`**", str(tp)]

            for req in (1, 4, 8, 16):
                val = models[model_name][tp].get(req)
                if val is None:
                    row.append("-")
                else:
                    ctx, util = val
                    row.append(f"{format_tokens(ctx)} ({util:.2f})")

            print("| " + " | ".join(row) + " |")
|
||||||
|
|
||||||
|
# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||||
새 이슈에서 참조
사용자 차단