feat: centralize model configurations and benchmark settings into a new models.py module and update Dockerfile and scripts to use it.
This commit is contained in:
+7
-3
@@ -125,8 +125,9 @@ RUN chmod -R a+rwX /opt && \
|
|||||||
COPY scripts/01-rocm-env-for-triton.sh /etc/profile.d/01-rocm-env-for-triton.sh
|
COPY scripts/01-rocm-env-for-triton.sh /etc/profile.d/01-rocm-env-for-triton.sh
|
||||||
COPY scripts/99-toolbox-banner.sh /etc/profile.d/99-toolbox-banner.sh
|
COPY scripts/99-toolbox-banner.sh /etc/profile.d/99-toolbox-banner.sh
|
||||||
COPY scripts/zz-venv-last.sh /etc/profile.d/zz-venv-last.sh
|
COPY scripts/zz-venv-last.sh /etc/profile.d/zz-venv-last.sh
|
||||||
COPY scripts/start_vllm.py /usr/local/bin/start-vllm
|
COPY scripts/start_vllm.py /opt/start-vllm
|
||||||
COPY scripts/start_vllm_cluster.py /usr/local/bin/start-vllm-cluster
|
COPY scripts/start_vllm_cluster.py /opt/start-vllm-cluster
|
||||||
|
COPY scripts/models.py /opt/models.py
|
||||||
COPY benchmarks/max_context_results.json /opt/max_context_results.json
|
COPY benchmarks/max_context_results.json /opt/max_context_results.json
|
||||||
COPY benchmarks/run_vllm_bench.py /opt/run_vllm_bench.py
|
COPY benchmarks/run_vllm_bench.py /opt/run_vllm_bench.py
|
||||||
COPY benchmarks/vllm_cluster_bench.py /opt/vllm_cluster_bench.py
|
COPY benchmarks/vllm_cluster_bench.py /opt/vllm_cluster_bench.py
|
||||||
@@ -134,7 +135,10 @@ COPY benchmarks/find_max_context.py /opt/find_max_context.py
|
|||||||
COPY rdma_cluster/compare_eth_vs_rdma.sh /opt/compare_eth_vs_rdma.sh
|
COPY rdma_cluster/compare_eth_vs_rdma.sh /opt/compare_eth_vs_rdma.sh
|
||||||
COPY scripts/configure_cluster.sh /opt/configure_cluster.sh
|
COPY scripts/configure_cluster.sh /opt/configure_cluster.sh
|
||||||
RUN chmod +x /opt/configure_cluster.sh
|
RUN chmod +x /opt/configure_cluster.sh
|
||||||
RUN chmod 0644 /etc/profile.d/*.sh && chmod +x /usr/local/bin/start-vllm && chmod +x /usr/local/bin/start-vllm-cluster && chmod +x /opt/vllm_cluster_bench.py && chmod +x /opt/compare_eth_vs_rdma.sh && chmod +x /opt/find_max_context.py && chmod 0644 /opt/max_context_results.json
|
RUN chmod +x /opt/start-vllm /opt/start-vllm-cluster /opt/vllm_cluster_bench.py /opt/compare_eth_vs_rdma.sh /opt/find_max_context.py /opt/run_vllm_bench.py && \
|
||||||
|
ln -s /opt/start-vllm /usr/local/bin/start-vllm && \
|
||||||
|
ln -s /opt/start-vllm-cluster /usr/local/bin/start-vllm-cluster && \
|
||||||
|
chmod 0644 /etc/profile.d/*.sh /opt/max_context_results.json /opt/models.py
|
||||||
RUN chmod 0644 /etc/profile.d/*.sh
|
RUN chmod 0644 /etc/profile.d/*.sh
|
||||||
RUN printf 'ulimit -S -c 0\n' > /etc/profile.d/90-nocoredump.sh && chmod 0644 /etc/profile.d/90-nocoredump.sh
|
RUN printf 'ulimit -S -c 0\n' > /etc/profile.d/90-nocoredump.sh && chmod 0644 /etc/profile.d/90-nocoredump.sh
|
||||||
|
|
||||||
|
|||||||
@@ -2,17 +2,32 @@
|
|||||||
import subprocess, time, json, sys, os, requests, argparse
|
import subprocess, time, json, sys, os, requests, argparse
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
# =========================
|
# =========================
|
||||||
# ⚙️ GLOBAL SETTINGS
|
# ⚙️ GLOBAL SETTINGS
|
||||||
# =========================
|
# =========================
|
||||||
|
|
||||||
# HARDWARE: 1x Strix Halo (128GB, RDNA 3.5)
|
try:
|
||||||
GPU_UTIL = "0.90"
|
import models
|
||||||
# 1. THROUGHPUT CONFIG
|
except ImportError:
|
||||||
OFF_NUM_PROMPTS = 200
|
# Not importable directly: when run locally, models.py may be in ../scripts.
|
||||||
OFF_FORCED_OUTPUT = "512"
|
# Or if running in /opt where models.py is alongside.
|
||||||
# Default fallback if not specified in MODEL_TABLE
|
# Try adding the current working directory to sys.path first.
|
||||||
DEFAULT_BATCH_TOKENS = "8192"
|
sys.path.append(os.getcwd())
|
||||||
|
try:
|
||||||
|
import models
|
||||||
|
except ImportError:
|
||||||
|
# Fallback for local structure: assuming this is in benchmarks/ and models is in scripts/
|
||||||
|
sys.path.append(str(Path(__file__).parent.parent / "scripts"))
|
||||||
|
import models
|
||||||
|
|
||||||
|
# Import from shared config
|
||||||
|
MODEL_TABLE = models.MODEL_TABLE
|
||||||
|
MODELS_TO_RUN = models.MODELS_TO_RUN
|
||||||
|
GPU_UTIL = models.GPU_UTIL
|
||||||
|
OFF_NUM_PROMPTS = models.OFF_NUM_PROMPTS
|
||||||
|
OFF_FORCED_OUTPUT = models.OFF_FORCED_OUTPUT
|
||||||
|
DEFAULT_BATCH_TOKENS = models.DEFAULT_BATCH_TOKENS
|
||||||
|
|
||||||
# Fallbacks
|
# Fallbacks
|
||||||
FALLBACK_INPUT_LEN = 1024
|
FALLBACK_INPUT_LEN = 1024
|
||||||
@@ -21,84 +36,6 @@ FALLBACK_OUTPUT_LEN = 512
|
|||||||
RESULTS_DIR = Path("benchmark_results")
|
RESULTS_DIR = Path("benchmark_results")
|
||||||
RESULTS_DIR.mkdir(exist_ok=True)
|
RESULTS_DIR.mkdir(exist_ok=True)
|
||||||
|
|
||||||
# =========================
|
|
||||||
# 🛠️ MODEL CONFIGURATION 🛠️
|
|
||||||
# =========================
|
|
||||||
|
|
||||||
MODEL_TABLE = {
|
|
||||||
# 1. Llama 3.1 8B Instruct
|
|
||||||
# MAD uses 131k tokens. We scale to 32k for 32GB VRAM safety.
|
|
||||||
"meta-llama/Meta-Llama-3.1-8B-Instruct": {
|
|
||||||
"trust_remote": False,
|
|
||||||
"valid_tp": [1, 2],
|
|
||||||
"max_num_seqs": "64",
|
|
||||||
"max_tokens": "32768"
|
|
||||||
},
|
|
||||||
|
|
||||||
"google/gemma-3-12b-it": {
|
|
||||||
"trust_remote": False,
|
|
||||||
"valid_tp": [1, 2],
|
|
||||||
"max_num_seqs": "64",
|
|
||||||
"max_tokens": "32768"
|
|
||||||
},
|
|
||||||
# 2. GPT-OSS 20B (MXFP4)
|
|
||||||
# MAD Row 0 uses 8192. We match this exactly.
|
|
||||||
"openai/gpt-oss-20b": {
|
|
||||||
"trust_remote": True,
|
|
||||||
"valid_tp": [1, 2],
|
|
||||||
"max_num_seqs": "64",
|
|
||||||
"max_tokens": "8192"
|
|
||||||
},
|
|
||||||
|
|
||||||
"openai/gpt-oss-120b": {
|
|
||||||
"trust_remote": True,
|
|
||||||
"valid_tp": [1],
|
|
||||||
"max_num_seqs": "64",
|
|
||||||
"max_tokens": "8192"
|
|
||||||
},
|
|
||||||
|
|
||||||
|
|
||||||
"Qwen/Qwen3-14B-AWQ": {
|
|
||||||
"trust_remote": True,
|
|
||||||
"valid_tp": [1], # Too big for single GPU
|
|
||||||
"max_num_seqs": "32", # Lower concurrency for safety
|
|
||||||
"max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive
|
|
||||||
"enforce_eager": False,
|
|
||||||
"env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error
|
|
||||||
},
|
|
||||||
|
|
||||||
# 4. Qwen 30B 4-bit
|
|
||||||
"cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit": {
|
|
||||||
"trust_remote": True,
|
|
||||||
"enforce_eager": False,
|
|
||||||
"valid_tp": [1, 2],
|
|
||||||
"max_num_seqs": "64",
|
|
||||||
"max_tokens": "32768"
|
|
||||||
},
|
|
||||||
|
|
||||||
# 5. Qwen 80B AWQ
|
|
||||||
# Size: ~48GB. Fits on 2x32GB (64GB). Leftover for Cache: ~16GB.
|
|
||||||
# Config: 20k ctx fits in that cache. Eager mode required for stability.
|
|
||||||
"dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": {
|
|
||||||
"trust_remote": True,
|
|
||||||
"valid_tp": [1], # Too big for single GPU
|
|
||||||
"max_num_seqs": "32", # Lower concurrency for safety
|
|
||||||
"max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive
|
|
||||||
"enforce_eager": True,
|
|
||||||
"env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error
|
|
||||||
},
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
MODELS_TO_RUN = [
|
|
||||||
"meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
||||||
"google/gemma-3-12b-it",
|
|
||||||
"Qwen/Qwen3-14B-AWQ",
|
|
||||||
"openai/gpt-oss-20b",
|
|
||||||
"openai/gpt-oss-120b",
|
|
||||||
"cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit",
|
|
||||||
"dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
|
|
||||||
]
|
|
||||||
|
|
||||||
# =========================
|
# =========================
|
||||||
# UTILS
|
# UTILS
|
||||||
|
|||||||
@@ -25,12 +25,21 @@ RESULTS_DIR.mkdir(exist_ok=True)
|
|||||||
# Since this is a new file in root/benchmarks? No, likely scripts/ or same dir.
|
# Since this is a new file in root/benchmarks? No, likely scripts/ or same dir.
|
||||||
# Let's assume it's in the same dir as run_vllm_bench.py.
|
# Let's assume it's in the same dir as run_vllm_bench.py.
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN
|
import models
|
||||||
except ImportError:
|
except ImportError:
|
||||||
# Fallback if run directly and path issues
|
# In /opt the import above works when '.' is on sys.path; otherwise add the working directory:
|
||||||
sys.path.append(os.path.dirname(__file__))
|
sys.path.append(os.getcwd())
|
||||||
from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN
|
try:
|
||||||
|
import models
|
||||||
|
# If this also fails, fall back to ../scripts for local development.
|
||||||
|
except ImportError:
|
||||||
|
sys.path.append(str(Path(__file__).parent.parent / "scripts"))
|
||||||
|
import models
|
||||||
|
|
||||||
|
MODEL_TABLE = models.MODEL_TABLE
|
||||||
|
MODELS_TO_RUN = models.MODELS_TO_RUN
|
||||||
|
|
||||||
# =========================
|
# =========================
|
||||||
# UTILS (Adapted for Cluster)
|
# UTILS (Adapted for Cluster)
|
||||||
|
|||||||
@@ -0,0 +1,98 @@
|
|||||||
|
MODEL_TABLE = {
|
||||||
|
# 1. Llama 3.1 8B Instruct
|
||||||
|
# MAD uses 131k tokens. We scale to 32k for 32GB VRAM safety.
|
||||||
|
"meta-llama/Meta-Llama-3.1-8B-Instruct": {
|
||||||
|
"trust_remote": False,
|
||||||
|
"valid_tp": [1, 2],
|
||||||
|
"max_num_seqs": "64",
|
||||||
|
"max_tokens": "32768"
|
||||||
|
},
|
||||||
|
|
||||||
|
"google/gemma-3-12b-it": {
|
||||||
|
"trust_remote": False,
|
||||||
|
"valid_tp": [1, 2],
|
||||||
|
"max_num_seqs": "64",
|
||||||
|
"max_tokens": "32768"
|
||||||
|
},
|
||||||
|
# 2. GPT-OSS 20B (MXFP4)
|
||||||
|
# MAD Row 0 uses 8192. We match this exactly.
|
||||||
|
"openai/gpt-oss-20b": {
|
||||||
|
"trust_remote": True,
|
||||||
|
"valid_tp": [1, 2],
|
||||||
|
"max_num_seqs": "64",
|
||||||
|
"max_tokens": "8192"
|
||||||
|
},
|
||||||
|
|
||||||
|
"openai/gpt-oss-120b": {
|
||||||
|
"trust_remote": True,
|
||||||
|
"valid_tp": [1],
|
||||||
|
"max_num_seqs": "64",
|
||||||
|
"max_tokens": "8192"
|
||||||
|
},
|
||||||
|
|
||||||
|
|
||||||
|
"Qwen/Qwen3-14B-AWQ": {
|
||||||
|
"trust_remote": True,
|
||||||
|
"valid_tp": [1], # Too big for single GPU
|
||||||
|
"max_num_seqs": "32", # Lower concurrency for safety
|
||||||
|
"max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive
|
||||||
|
"enforce_eager": False,
|
||||||
|
"env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error
|
||||||
|
},
|
||||||
|
|
||||||
|
# 4. Qwen 30B 4-bit
|
||||||
|
"btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit": {
|
||||||
|
"trust_remote": True,
|
||||||
|
"enforce_eager": False,
|
||||||
|
"valid_tp": [1, 2],
|
||||||
|
"max_num_seqs": "64",
|
||||||
|
"max_tokens": "32768"
|
||||||
|
},
|
||||||
|
|
||||||
|
"btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit": {
|
||||||
|
"trust_remote": True,
|
||||||
|
"enforce_eager": False,
|
||||||
|
"valid_tp": [1, 2],
|
||||||
|
"max_num_seqs": "64",
|
||||||
|
"max_tokens": "32768"
|
||||||
|
},
|
||||||
|
|
||||||
|
"zai-org/GLM-4.7-Flash": {
|
||||||
|
"trust_remote": True,
|
||||||
|
"enforce_eager": False,
|
||||||
|
"valid_tp": [1, 2],
|
||||||
|
"max_num_seqs": "64",
|
||||||
|
"max_tokens": "32768",
|
||||||
|
},
|
||||||
|
|
||||||
|
# 5. Qwen3-Next 80B (GPTQ Int4A16)
|
||||||
|
# Size: ~48GB. Fits on 2x32GB (64GB). Leftover for Cache: ~16GB.
|
||||||
|
# Config: 20k ctx fits in that cache. Eager mode required for stability.
|
||||||
|
"dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": {
|
||||||
|
"trust_remote": True,
|
||||||
|
"valid_tp": [1], # Too big for single GPU
|
||||||
|
"max_num_seqs": "32", # Lower concurrency for safety
|
||||||
|
"max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive
|
||||||
|
"enforce_eager": True,
|
||||||
|
"env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error
|
||||||
|
},
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
MODELS_TO_RUN = [
|
||||||
|
"meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"google/gemma-3-12b-it",
|
||||||
|
"Qwen/Qwen3-14B-AWQ",
|
||||||
|
"openai/gpt-oss-20b",
|
||||||
|
"openai/gpt-oss-120b",
|
||||||
|
"zai-org/GLM-4.7-Flash",
|
||||||
|
"btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
|
||||||
|
"btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
|
||||||
|
"dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Hardware / Global Defaults
|
||||||
|
GPU_UTIL = "0.90"
|
||||||
|
OFF_NUM_PROMPTS = 200
|
||||||
|
OFF_FORCED_OUTPUT = "512"
|
||||||
|
DEFAULT_BATCH_TOKENS = "8192"
|
||||||
@@ -12,16 +12,21 @@ SCRIPT_DIR = Path(__file__).parent.resolve()
|
|||||||
BENCH_DIR = SCRIPT_DIR.parent / "benchmarks"
|
BENCH_DIR = SCRIPT_DIR.parent / "benchmarks"
|
||||||
OPT_DIR = Path("/opt")
|
OPT_DIR = Path("/opt")
|
||||||
|
|
||||||
# Check /opt first (Container), then local fallback
|
|
||||||
|
# Check /opt first (Container), then local fallback for results file location
|
||||||
if (OPT_DIR / "run_vllm_bench.py").exists():
|
if (OPT_DIR / "run_vllm_bench.py").exists():
|
||||||
sys.path.append(str(OPT_DIR))
|
sys.path.append(str(OPT_DIR))
|
||||||
else:
|
else:
|
||||||
sys.path.append(str(BENCH_DIR))
|
sys.path.append(str(BENCH_DIR))
|
||||||
|
# Also ensure current script dir is in path for local 'models' import if not already
|
||||||
|
sys.path.append(str(SCRIPT_DIR))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN
|
import models
|
||||||
|
MODEL_TABLE = models.MODEL_TABLE
|
||||||
|
MODELS_TO_RUN = models.MODELS_TO_RUN
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("Error: Could not import run_vllm_bench.py config.")
|
print("Error: Could not import models.py config.")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
if (OPT_DIR / "max_context_results.json").exists():
|
if (OPT_DIR / "max_context_results.json").exists():
|
||||||
|
|||||||
@@ -13,16 +13,20 @@ SCRIPT_DIR = Path(__file__).parent.resolve()
|
|||||||
BENCH_DIR = SCRIPT_DIR.parent / "benchmarks"
|
BENCH_DIR = SCRIPT_DIR.parent / "benchmarks"
|
||||||
OPT_DIR = Path("/opt")
|
OPT_DIR = Path("/opt")
|
||||||
|
|
||||||
|
|
||||||
# Check /opt first (Container), then local fallback
|
# Check /opt first (Container), then local fallback
|
||||||
if (OPT_DIR / "run_vllm_bench.py").exists():
|
if (OPT_DIR / "run_vllm_bench.py").exists():
|
||||||
sys.path.append(str(OPT_DIR))
|
sys.path.append(str(OPT_DIR))
|
||||||
else:
|
else:
|
||||||
sys.path.append(str(BENCH_DIR))
|
sys.path.append(str(BENCH_DIR))
|
||||||
|
sys.path.append(str(SCRIPT_DIR))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN
|
import models
|
||||||
|
MODEL_TABLE = models.MODEL_TABLE
|
||||||
|
MODELS_TO_RUN = models.MODELS_TO_RUN
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("Error: Could not import run_vllm_bench.py config.")
|
print("Error: Could not import models.py config.")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
if (OPT_DIR / "max_context_results.json").exists():
|
if (OPT_DIR / "max_context_results.json").exists():
|
||||||
|
|||||||
Viittaa uudessa ongelmassa
Block a user