feat: Enhance vLLM benchmarking to compare Triton and ROCm attention, introduce a new script for cluster configuration, and update Dockerfile for new tools and dependencies.

2026-02-01 19:36:07 +00:00
@@ -329,5 +329,149 @@
    "real_capacity": 0,
    "status": "fail",
    "error": "Verification Failed"
+  },
+  {
+    "model": "zai-org/GLM-4.7-Flash",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 1,
+    "model_limit": 202752,
+    "configured_len": 162201,
+    "real_capacity": 791088,
+    "status": "success",
+    "error": "Timeout",
+    "max_context_1_user": 162201
+  },
+  {
+    "model": "zai-org/GLM-4.7-Flash",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 4,
+    "model_limit": 202752,
+    "configured_len": 162201,
+    "real_capacity": 791056,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 162201
+  },
+  {
+    "model": "zai-org/GLM-4.7-Flash",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 8,
+    "model_limit": 202752,
+    "configured_len": 162201,
+    "real_capacity": 791008,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 162201
+  },
+  {
+    "model": "zai-org/GLM-4.7-Flash",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 16,
+    "model_limit": 202752,
+    "configured_len": 162201,
+    "real_capacity": 790992,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 162201
+  },
+  {
+    "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 1,
+    "model_limit": 262144,
+    "configured_len": 262144,
+    "real_capacity": 1089152,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 262144
+  },
+  {
+    "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 4,
+    "model_limit": 262144,
+    "configured_len": 262144,
+    "real_capacity": 1089072,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 262144
+  },
+  {
+    "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 8,
+    "model_limit": 262144,
+    "configured_len": 262144,
+    "real_capacity": 1089072,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 262144
+  },
+  {
+    "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 16,
+    "model_limit": 262144,
+    "configured_len": 262144,
+    "real_capacity": 1089056,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 262144
+  },
+  {
+    "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 1,
+    "model_limit": 262144,
+    "configured_len": 209715,
+    "real_capacity": 936544,
+    "status": "success",
+    "error": "Timeout",
+    "max_context_1_user": 209715
+  },
+  {
+    "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 4,
+    "model_limit": 262144,
+    "configured_len": 209715,
+    "real_capacity": 937312,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 209715
+  },
+  {
+    "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 8,
+    "model_limit": 262144,
+    "configured_len": 209715,
+    "real_capacity": 937296,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 209715
+  },
+  {
+    "model": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 16,
+    "model_limit": 262144,
+    "configured_len": 209715,
+    "real_capacity": 937280,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 209715
  }
 ]
@@ -76,7 +76,7 @@ MODEL_TABLE = {
        "max_tokens": "32768"
    },

-    # 5. Qwen 80B AWQ (The Big One) [NEW]
+    # 5. Qwen3-Next 80B GPTQ (Int4A16)
    # Size: ~48GB. Fits on 2x32GB (64GB). Leftover for Cache: ~16GB.
    # Config: 20k ctx fits in that cache. Eager mode required for stability.
     "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": {
@@ -176,14 +176,17 @@ def get_model_args(model, tp_size):
    
    return cmd

-def run_throughput(model, tp_size):
+def run_throughput(model, tp_size, backend_name="Default", output_dir=RESULTS_DIR, extra_env=None):
    if tp_size not in MODEL_TABLE[model]["valid_tp"]: return
    
    model_safe = model.replace("/", "_")
-    output_file = RESULTS_DIR / f"{model_safe}_tp{tp_size}_throughput.json"
+    output_dir_path = Path(output_dir)
+    output_dir_path.mkdir(parents=True, exist_ok=True)
+    
+    output_file = output_dir_path / f"{model_safe}_tp{tp_size}_throughput.json"
    
    if output_file.exists():
-        log(f"SKIP Throughput {model} (TP={tp_size})")
+        log(f"SKIP {model} (TP={tp_size} | {backend_name})")
        return

    dataset_path = get_dataset()
@@ -192,7 +195,7 @@ def run_throughput(model, tp_size):
    # Retrieve Model-Specific Batch Tokens
    batch_tokens = MODEL_TABLE[model].get("max_tokens", DEFAULT_BATCH_TOKENS)

-    log(f"START Throughput {model} (TP={tp_size}) [Batch: {batch_tokens}]...")
+    log(f"START {model} (TP={tp_size} | {backend_name}) [Batch: {batch_tokens}]...")
    kill_vllm()
    nuke_vllm_cache()

@@ -212,31 +215,43 @@ def run_throughput(model, tp_size):
    # Inject model specific env vars (e.g. for AWQ)
    model_env = MODEL_TABLE[model].get("env", {})
    env.update(model_env)
+    
+    # Per-run env overrides (e.g. attention backend); applied last so they win over model env
+    if extra_env:
+        env.update(extra_env)

    try: 
        subprocess.run(cmd, check=True, env=env)
    except: 
-        log(f"ERROR: Throughput failed {model}")
-
+        log(f"ERROR: Failed {model} [{backend_name}]")


 def print_summary(tps):
-    print(f"\n{'MODEL':<40} | {'TP':<2} | {'TOK/S':<8}")
-    print("-" * 60)
+    print(f"\n{'MODEL':<40} | {'TP':<2} | {'Triton':<8} | {'ROCm':<8}")
+    print("-" * 75)
    
    for m in MODELS_TO_RUN:
        msafe = m.replace("/", "_")
        for tp in tps:
            if tp not in MODEL_TABLE[m]["valid_tp"]: continue
            
+            # Default backend results (shown in the "Triton" column)
            try: 
-                tdata = json.loads((RESULTS_DIR / f"{msafe}_tp{tp}_throughput.json").read_text())
-                tok_s = f"{tdata.get('tokens_per_second', 0):.1f}"
-            except: tok_s = "N/A"
+                p1 = RESULTS_DIR / f"{msafe}_tp{tp}_throughput.json"
+                d1 = json.loads(p1.read_text())
+                val1 = f"{d1.get('tokens_per_second', 0):.1f}"
+            except: val1 = "N/A"
+            
+            # ROCm attention results (read from the separate ROCm output directory)
+            try:
+                p2 = Path("benchmark_results_rocm_attn/benchmark_results") / f"{msafe}_tp{tp}_throughput.json"
+                d2 = json.loads(p2.read_text())
+                val2 = f"{d2.get('tokens_per_second', 0):.1f}"
+            except: val2 = "N/A"

            name_cell = m.split('/')[-1]
-            print(f"{name_cell:<40} | {tp:<2} | {tok_s:<8}")
-    print("-" * 60)
+            print(f"{name_cell:<40} | {tp:<2} | {val1:<8} | {val2:<8}")
+    print("-" * 75)

 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
@@ -254,5 +269,13 @@ if __name__ == "__main__":
    kill_vllm()
    for tp in valid_tp_args:
        for m in MODELS_TO_RUN:
-            run_throughput(m, tp)
+            # 1. Default (Triton)
+            run_throughput(m, tp, "Default", RESULTS_DIR)
+            
+            # 2. ROCm Attention
+            run_throughput(m, tp, "ROCm-Attn", "benchmark_results_rocm_attn/benchmark_results", {
+                "VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1",
+                "VLLM_USE_TRITON_FLASH_ATTN": "0"
+            })
+            
    print_summary(valid_tp_args)
@@ -152,22 +152,21 @@ def get_model_args(model):
        
    if config.get("trust_remote"): cmd.append("--trust-remote-code")
    
-    # FORCED EAGER as per request for cluster stability
-    cmd.append("--enforce-eager")
+    # Respect the per-model config for eager mode (apples-to-apples with TP=1)
+    if config.get("enforce_eager"): 
+        cmd.append("--enforce-eager")
    
    return cmd

-def run_cluster_throughput(model):
-    # Skip if TP=2 is not valid for this model
-    if CLUSTER_TP not in MODEL_TABLE[model]["valid_tp"]:
-        log(f"SKIP {model} (Support TP={MODEL_TABLE[model]['valid_tp']}, Cluster is TP={CLUSTER_TP})")
-        return
-
+def run_bench_set(model, backend_name, output_dir, extra_env=None):
    model_safe = model.replace("/", "_")
-    output_file = RESULTS_DIR / f"{model_safe}_cluster_tp{CLUSTER_TP}_throughput.json"
+    output_dir_path = Path(output_dir)
+    output_dir_path.mkdir(parents=True, exist_ok=True)
+    
+    output_file = output_dir_path / f"{model_safe}_cluster_tp{CLUSTER_TP}_throughput.json"
    
    if output_file.exists():
-        log(f"SKIP {model} (Result exists)")
+        log(f"SKIP {model} [{backend_name}] (Result exists)")
        return

    dataset_path = get_dataset()
@@ -175,14 +174,10 @@ def run_cluster_throughput(model):
    
    batch_tokens = MODEL_TABLE[model].get("max_tokens", DEFAULT_BATCH_TOKENS)

-    log(f"START Cluster Bench {model} [TP={CLUSTER_TP} | Eager=True]...")
+    log(f"START {model} [TP={CLUSTER_TP} | {backend_name}]...")
    
-    # Nuke cache between runs to be safe
    nuke_vllm_cache()

-    # NOTE: We use 'vllm bench throughput' directly. 
-    # It supports most 'vllm serve' args but we need to ensure it picks up the ray backend.
-    
    cmd = ["vllm", "bench", "throughput"] + get_model_args(model)
    cmd.extend([
        "--num-prompts", str(OFF_NUM_PROMPTS),
@@ -195,43 +190,72 @@ def run_cluster_throughput(model):

    env = get_cluster_env()
    
-    # Add model specific envs
+    # Model specific envs
    model_env = MODEL_TABLE[model].get("env", {})
    env.update(model_env)
+    
+    # Run-specific envs (e.g. ROCm attention); applied last so they override model envs
+    if extra_env:
+        env.update(extra_env)

    try: 
        log(f"Command: {' '.join(cmd)}")
        subprocess.run(cmd, check=True, env=env)
    except subprocess.CalledProcessError as e:
-        log(f"ERROR: Cluster Benchmark failed for {model} (Exit {e.returncode})")
+        log(f"ERROR: Failed {model} [{backend_name}] (Exit {e.returncode})")
    except Exception as e:
        log(f"ERROR: System error: {e}")

+def run_cluster_throughput(model):
+    # 1. Default run (usually Triton, unless globally set env vars override it)
+    run_bench_set(
+        model, 
+        "Default", 
+        RESULTS_DIR
+    )
+    
+    # 2. ROCm Attention Run
+    run_bench_set(
+        model,
+        "ROCm-Attn",
+        "benchmark_results_rocm_attn/benchmark_results",
+        extra_env={
+            "VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1",
+            "VLLM_USE_TRITON_FLASH_ATTN": "0"
+        }
+    )
+
 def print_summary():
-    print(f"\n{'MODEL (Cluster TP=2)':<50} | {'TOK/S':<10}")
-    print("-" * 65)
+    print(f"\n{'MODEL (TP=2)':<50} | {'Triton':<8} | {'ROCm':<8}")
+    print("-" * 75)
    
    for m in MODELS_TO_RUN:
        msafe = m.replace("/", "_")
+        
+        # Default backend results (shown in the "Triton" column)
        try: 
-            tdata = json.loads((RESULTS_DIR / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json").read_text())
-            tok_s = f"{tdata.get('tokens_per_second', 0):.1f}"
-        except: 
-            if CLUSTER_TP not in MODEL_TABLE[m]["valid_tp"]:
-                tok_s = "SKIP"
-            else:
-                tok_s = "N/A"
+            p1 = RESULTS_DIR / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json"
+            d1 = json.loads(p1.read_text())
+            val1 = f"{d1.get('tokens_per_second', 0):.1f}"
+        except: val1 = "N/A"
+        
+        # ROCm attention results (read from the separate ROCm output directory)
+        try:
+            p2 = Path("benchmark_results_rocm_attn/benchmark_results") / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json"
+            d2 = json.loads(p2.read_text())
+            val2 = f"{d2.get('tokens_per_second', 0):.1f}"
+        except: val2 = "N/A"

        name_cell = m.split('/')[-1]
-        print(f"{name_cell:<50} | {tok_s:<10}")
-    print("-" * 65)
+        print(f"{name_cell:<50} | {val1:<8} | {val2:<8}")
+    print("-" * 75)

 if __name__ == "__main__":
    if not check_ray_status():
        log("ERROR: Ray Cluster not ready. Please start it with 'start-vllm-cluster' first.")
        sys.exit(1)
        
-    log("Ray Cluster Detected. Starting Benchmarks...")
+    log("Ray Cluster Detected. Starting Benchmarks (Dual Backend)...")
    
    for m in MODELS_TO_RUN:
        run_cluster_throughput(m)