Summary: move MODEL_TABLE, MODELS_TO_RUN and the benchmark defaults into a shared scripts/models.py;
install the launchers and models.py under /opt, with /usr/local/bin symlinks for the two launchers;
benchmark scripts locate models.py relative to their own file, and the launchers probe /opt/models.py.
diff --git a/Dockerfile b/Dockerfile index d73084c..d891f44 100644 --- a/Dockerfile +++ b/Dockerfile @@ -125,8 +125,9 @@ RUN chmod -R a+rwX /opt && \ COPY scripts/01-rocm-env-for-triton.sh /etc/profile.d/01-rocm-env-for-triton.sh COPY scripts/99-toolbox-banner.sh /etc/profile.d/99-toolbox-banner.sh COPY scripts/zz-venv-last.sh /etc/profile.d/zz-venv-last.sh -COPY scripts/start_vllm.py /usr/local/bin/start-vllm -COPY scripts/start_vllm_cluster.py /usr/local/bin/start-vllm-cluster +COPY scripts/start_vllm.py /opt/start-vllm +COPY scripts/start_vllm_cluster.py /opt/start-vllm-cluster +COPY scripts/models.py /opt/models.py COPY benchmarks/max_context_results.json /opt/max_context_results.json COPY benchmarks/run_vllm_bench.py /opt/run_vllm_bench.py COPY benchmarks/vllm_cluster_bench.py /opt/vllm_cluster_bench.py @@ -134,7 +135,10 @@ COPY benchmarks/find_max_context.py /opt/find_max_context.py COPY rdma_cluster/compare_eth_vs_rdma.sh /opt/compare_eth_vs_rdma.sh COPY scripts/configure_cluster.sh /opt/configure_cluster.sh RUN chmod +x /opt/configure_cluster.sh -RUN chmod 0644 /etc/profile.d/*.sh && chmod +x /usr/local/bin/start-vllm && chmod +x /usr/local/bin/start-vllm-cluster && chmod +x /opt/vllm_cluster_bench.py && chmod +x /opt/compare_eth_vs_rdma.sh && chmod +x /opt/find_max_context.py && chmod 0644 /opt/max_context_results.json +RUN chmod +x /opt/start-vllm /opt/start-vllm-cluster /opt/vllm_cluster_bench.py /opt/compare_eth_vs_rdma.sh /opt/find_max_context.py /opt/run_vllm_bench.py && \ + ln -s /opt/start-vllm /usr/local/bin/start-vllm && \ + ln -s /opt/start-vllm-cluster /usr/local/bin/start-vllm-cluster && \ + chmod 0644 /etc/profile.d/*.sh /opt/max_context_results.json /opt/models.py RUN chmod 0644 /etc/profile.d/*.sh RUN printf 'ulimit -S -c 0\n' > /etc/profile.d/90-nocoredump.sh && chmod 0644 /etc/profile.d/90-nocoredump.sh diff --git a/benchmarks/run_vllm_bench.py b/benchmarks/run_vllm_bench.py index 14ba0be..a2e2f7d 100644 --- a/benchmarks/run_vllm_bench.py 
+++ b/benchmarks/run_vllm_bench.py @@ -2,17 +2,32 @@ import subprocess, time, json, sys, os, requests, argparse from pathlib import Path + # ========================= # ⚙️ GLOBAL SETTINGS # ========================= -# HARDWARE: 1x Strix Halo (128GB, RDNA 3.5) -GPU_UTIL = "0.90" -# 1. THROUGHPUT CONFIG -OFF_NUM_PROMPTS = 200 -OFF_FORCED_OUTPUT = "512" -# Default fallback if not specified in MODEL_TABLE -DEFAULT_BATCH_TOKENS = "8192" +try: + import models +except ImportError: + # If running locally and models.py is in ../scripts? + # Or if running in /opt where models.py is alongside. + # Add this script's own directory (not the CWD, which depends on the caller) + sys.path.append(str(Path(__file__).resolve().parent)) + try: + import models + except ImportError: + # Fallback for local structure: assuming this is in benchmarks/ and models is in scripts/ + sys.path.append(str(Path(__file__).parent.parent / "scripts")) + import models + +# Import from shared config +MODEL_TABLE = models.MODEL_TABLE +MODELS_TO_RUN = models.MODELS_TO_RUN +GPU_UTIL = models.GPU_UTIL +OFF_NUM_PROMPTS = models.OFF_NUM_PROMPTS +OFF_FORCED_OUTPUT = models.OFF_FORCED_OUTPUT +DEFAULT_BATCH_TOKENS = models.DEFAULT_BATCH_TOKENS # Fallbacks FALLBACK_INPUT_LEN = 1024 @@ -21,84 +36,6 @@ FALLBACK_OUTPUT_LEN = 512 RESULTS_DIR = Path("benchmark_results") RESULTS_DIR.mkdir(exist_ok=True) -# ========================= -# 🛠️ MODEL CONFIGURATION 🛠️ -# ========================= - -MODEL_TABLE = { - # 1. Llama 3.1 8B Instruct - # MAD uses 131k tokens. We scale to 32k for 32GB VRAM safety. - "meta-llama/Meta-Llama-3.1-8B-Instruct": { - "trust_remote": False, - "valid_tp": [1, 2], - "max_num_seqs": "64", - "max_tokens": "32768" - }, - - "google/gemma-3-12b-it": { - "trust_remote": False, - "valid_tp": [1, 2], - "max_num_seqs": "64", - "max_tokens": "32768" - }, - # 2. GPT-OSS 20B (MXFP4) - # MAD Row 0 uses 8192. We match this exactly. 
- "openai/gpt-oss-20b": { - "trust_remote": True, - "valid_tp": [1, 2], - "max_num_seqs": "64", - "max_tokens": "8192" - }, - - "openai/gpt-oss-120b": { - "trust_remote": True, - "valid_tp": [1], - "max_num_seqs": "64", - "max_tokens": "8192" - }, - - - "Qwen/Qwen3-14B-AWQ": { - "trust_remote": True, - "valid_tp": [1], # Too big for single GPU - "max_num_seqs": "32", # Lower concurrency for safety - "max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive - "enforce_eager": False, - "env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error - }, - - # 4. Qwen 30B 4-bit - "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit": { - "trust_remote": True, - "enforce_eager": False, - "valid_tp": [1, 2], - "max_num_seqs": "64", - "max_tokens": "32768" - }, - - # 5. Qwen 80B AWQ - # Size: ~48GB. Fits on 2x32GB (64GB). Leftover for Cache: ~16GB. - # Config: 20k ctx fits in that cache. Eager mode required for stability. - "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": { - "trust_remote": True, - "valid_tp": [1], # Too big for single GPU - "max_num_seqs": "32", # Lower concurrency for safety - "max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive - "enforce_eager": True, - "env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error - }, - -} - -MODELS_TO_RUN = [ - "meta-llama/Meta-Llama-3.1-8B-Instruct", - "google/gemma-3-12b-it", - "Qwen/Qwen3-14B-AWQ", - "openai/gpt-oss-20b", - "openai/gpt-oss-120b", - "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit", - "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", -] # ========================= # UTILS diff --git a/benchmarks/vllm_cluster_bench.py b/benchmarks/vllm_cluster_bench.py index 3a15aee..555c070 100755 --- a/benchmarks/vllm_cluster_bench.py +++ b/benchmarks/vllm_cluster_bench.py @@ -25,12 +25,21 @@ RESULTS_DIR.mkdir(exist_ok=True) # Since this is a new file in root/benchmarks? No, likely scripts/ or same dir. 
# Let's assume it's in the same dir as run_vllm_bench.py. + try: - from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN + import models except ImportError: - # Fallback if run directly and path issues - sys.path.append(os.path.dirname(__file__)) - from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN + # In /opt, models.py sits next to this script; add the script's directory: + sys.path.append(str(Path(__file__).resolve().parent)) + try: + import models + # Also try parent/scripts for local dev if above failed? + except ImportError: + sys.path.append(str(Path(__file__).parent.parent / "scripts")) + import models + +MODEL_TABLE = models.MODEL_TABLE +MODELS_TO_RUN = models.MODELS_TO_RUN # ========================= # UTILS (Adapted for Cluster) diff --git a/scripts/models.py b/scripts/models.py new file mode 100644 index 0000000..bcca074 --- /dev/null +++ b/scripts/models.py @@ -0,0 +1,98 @@ +MODEL_TABLE = { + # 1. Llama 3.1 8B Instruct + # MAD uses 131k tokens. We scale to 32k for 32GB VRAM safety. + "meta-llama/Meta-Llama-3.1-8B-Instruct": { + "trust_remote": False, + "valid_tp": [1, 2], + "max_num_seqs": "64", + "max_tokens": "32768" + }, + + "google/gemma-3-12b-it": { + "trust_remote": False, + "valid_tp": [1, 2], + "max_num_seqs": "64", + "max_tokens": "32768" + }, + # 2. GPT-OSS 20B (MXFP4) + # MAD Row 0 uses 8192. We match this exactly. + "openai/gpt-oss-20b": { + "trust_remote": True, + "valid_tp": [1, 2], + "max_num_seqs": "64", + "max_tokens": "8192" + }, + + "openai/gpt-oss-120b": { + "trust_remote": True, + "valid_tp": [1], + "max_num_seqs": "64", + "max_tokens": "8192" + }, + + + "Qwen/Qwen3-14B-AWQ": { + "trust_remote": True, + "valid_tp": [1], # Too big for single GPU + "max_num_seqs": "32", # Lower concurrency for safety + "max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive + "enforce_eager": False, + "env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error + }, + + # 4. 
Qwen 30B 4-bit + "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit": { + "trust_remote": True, + "enforce_eager": False, + "valid_tp": [1, 2], + "max_num_seqs": "64", + "max_tokens": "32768" + }, + + "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit": { + "trust_remote": True, + "enforce_eager": False, + "valid_tp": [1, 2], + "max_num_seqs": "64", + "max_tokens": "32768" + }, + + "zai-org/GLM-4.7-Flash": { + "trust_remote": True, + "enforce_eager": False, + "valid_tp": [1, 2], + "max_num_seqs": "64", + "max_tokens": "32768", + }, + + # 5. Qwen 80B AWQ + # Size: ~48GB. Fits on 2x32GB (64GB). Leftover for Cache: ~16GB. + # Config: 20k ctx fits in that cache. Eager mode required for stability. + "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": { + "trust_remote": True, + "valid_tp": [1], # Too big for single GPU + "max_num_seqs": "32", # Lower concurrency for safety + "max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive + "enforce_eager": True, + "env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error + }, + +} + +MODELS_TO_RUN = [ + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "google/gemma-3-12b-it", + "Qwen/Qwen3-14B-AWQ", + "openai/gpt-oss-20b", + "openai/gpt-oss-120b", + "zai-org/GLM-4.7-Flash", + "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", + "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", + "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", +] + +# Hardware / Global Defaults +GPU_UTIL = "0.90" +OFF_NUM_PROMPTS = 200 +OFF_FORCED_OUTPUT = "512" +DEFAULT_BATCH_TOKENS = "8192" diff --git a/scripts/start_vllm.py b/scripts/start_vllm.py index 1388306..864c95b 100644 --- a/scripts/start_vllm.py +++ b/scripts/start_vllm.py @@ -12,16 +12,21 @@ SCRIPT_DIR = Path(__file__).parent.resolve() BENCH_DIR = SCRIPT_DIR.parent / "benchmarks" OPT_DIR = Path("/opt") -# Check /opt first (Container), then local fallback + +# Check /opt first (Container), then local fallback, for the importable models.py config -if (OPT_DIR / 
"run_vllm_bench.py").exists(): +if (OPT_DIR / "models.py").exists(): sys.path.append(str(OPT_DIR)) else: sys.path.append(str(BENCH_DIR)) + # Also ensure current script dir is in path for local 'models' import if not already + sys.path.append(str(SCRIPT_DIR)) try: - from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN + import models + MODEL_TABLE = models.MODEL_TABLE + MODELS_TO_RUN = models.MODELS_TO_RUN except ImportError: - print("Error: Could not import run_vllm_bench.py config.") + print("Error: Could not import models.py config.") sys.exit(1) if (OPT_DIR / "max_context_results.json").exists(): diff --git a/scripts/start_vllm_cluster.py b/scripts/start_vllm_cluster.py index 8b28e41..938a582 100755 --- a/scripts/start_vllm_cluster.py +++ b/scripts/start_vllm_cluster.py @@ -13,16 +13,20 @@ SCRIPT_DIR = Path(__file__).parent.resolve() BENCH_DIR = SCRIPT_DIR.parent / "benchmarks" OPT_DIR = Path("/opt") + # Check /opt first (Container), then local fallback -if (OPT_DIR / "run_vllm_bench.py").exists(): +if (OPT_DIR / "models.py").exists(): sys.path.append(str(OPT_DIR)) else: sys.path.append(str(BENCH_DIR)) + sys.path.append(str(SCRIPT_DIR)) try: - from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN + import models + MODEL_TABLE = models.MODEL_TABLE + MODELS_TO_RUN = models.MODELS_TO_RUN except ImportError: - print("Error: Could not import run_vllm_bench.py config.") + print("Error: Could not import models.py config.") sys.exit(1) if (OPT_DIR / "max_context_results.json").exists():