diff --git a/docs/index.html b/docs/index.html index ec5195a..43d15c2 100644 --- a/docs/index.html +++ b/docs/index.html @@ -556,13 +556,15 @@ usecase: "Demonstrates the raw horsepower and architectural efficiency.", details: ` **Test Configuration:** -• Dataset: ShareGPT (Random Sample, 100 Prompts) +• Dataset: ShareGPT (Random Sample, 200 Prompts) • Output Length: 512 Tokens (Fixed) -• Batch Budget: 8192 - 32768 Tokens (Dynamic per model) +• Concurrency: 64 Sequences (Saturates Memory Bandwidth) • GPU Alloc: 90% VRAM per GPU • Pipeline: vllm bench throughput (Offline) • Cluster Config: Ray Distributed (RoCE v2 RDMA, TP=2) +Rationale: Batch size is raised to 64 concurrent sequences to saturate the memory bandwidth of Strix Halo, and the prompt count is raised to 200 so the measurement reflects sustained steady-state throughput. + Metric: Tokens per Second (higher is better).`, unit: " tok/s" }, diff --git a/scripts/models.py b/scripts/models.py index 3ca4281..6e0d169 100644 --- a/scripts/models.py +++ b/scripts/models.py @@ -4,14 +4,14 @@ MODEL_TABLE = { "meta-llama/Meta-Llama-3.1-8B-Instruct": { "trust_remote": False, "valid_tp": [1, 2], - "max_num_seqs": "24", # Strix Halo Optimized (Bandwidth Limit) + "max_num_seqs": "64", # Strix Halo Optimized (Saturates Memory Bandwidth) "max_tokens": "32768" }, "google/gemma-3-12b-it": { "trust_remote": False, "valid_tp": [1, 2], - "max_num_seqs": "24", + "max_num_seqs": "64", "max_tokens": "32768" }, # 2. 
GPT-OSS 20B (MXFP4) @@ -19,7 +19,7 @@ MODEL_TABLE = { "openai/gpt-oss-20b": { "trust_remote": True, "valid_tp": [1, 2], - "max_num_seqs": "24", + "max_num_seqs": "64", "max_tokens": "8192" }, @@ -34,7 +34,7 @@ MODEL_TABLE = { "Qwen/Qwen3-14B-AWQ": { "trust_remote": True, "valid_tp": [1], # Too big for single GPU - "max_num_seqs": "24", # Strix Halo Optimized + "max_num_seqs": "64", # Strix Halo Optimized "max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive "enforce_eager": False, "env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error @@ -45,7 +45,7 @@ MODEL_TABLE = { "trust_remote": True, "enforce_eager": False, "valid_tp": [1, 2], - "max_num_seqs": "24", + "max_num_seqs": "64", "max_tokens": "32768" }, @@ -53,7 +53,7 @@ MODEL_TABLE = { "trust_remote": True, "enforce_eager": False, "valid_tp": [1, 2], - "max_num_seqs": "24", + "max_num_seqs": "64", "max_tokens": "32768" }, @@ -61,7 +61,7 @@ MODEL_TABLE = { "trust_remote": True, "enforce_eager": False, "valid_tp": [1, 2], - "max_num_seqs": "24", + "max_num_seqs": "64", "max_tokens": "32768", }, @@ -71,7 +71,7 @@ MODEL_TABLE = { "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": { "trust_remote": True, "valid_tp": [1], # Too big for single GPU - "max_num_seqs": "16", # Large Model / Bandwidth Constrained + "max_num_seqs": "64", # Strix Halo Optimized (same concurrency as the smaller models) "max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive "enforce_eager": True, "env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error @@ -93,6 +93,6 @@ MODELS_TO_RUN = [ # Hardware / Global Defaults GPU_UTIL = "0.90" -OFF_NUM_PROMPTS = 100 # Reduced for Strix Halo (Bandwidth Limited) +OFF_NUM_PROMPTS = 200 # Increased for Strix Halo (Steady State Saturation) OFF_FORCED_OUTPUT = "512" DEFAULT_BATCH_TOKENS = "8192"