improve benchmarks

Dieser Commit ist enthalten in:
Donato Capitella
2026-02-25 09:29:46 +00:00
Ursprung a5a7b8fe04
Commit 6875f62ccf
6 geänderte Dateien mit 260 neuen und 42 gelöschten Zeilen
+7 -3
Datei anzeigen
@@ -2,13 +2,17 @@ import subprocess
import time
import os
def get_net_iface(ip_prefix="192.168.100"):
def get_net_iface(ip_prefix=None):
"""
Auto-detects the interface that serves the cluster network.
Assumes standard 192.168.100.x setup from start_vllm_cluster.py
By default the prefix is the first three octets of VLLM_HEAD_IP (192.168.100.1 if unset, the standard setup from start_vllm_cluster.py); pass ip_prefix to override it.
"""
if ip_prefix is None:
head_ip = os.getenv("VLLM_HEAD_IP", "192.168.100.1")
ip_prefix = ".".join(head_ip.split('.')[:3])
try:
# ip -o addr show | grep 192.168.100
# ip -o addr show | grep <ip_prefix>
cmd = f"ip -o addr show | grep {ip_prefix}"
res = subprocess.check_output(cmd, shell=True, text=True).strip()
# Output format: 2: eth0 inet 192.168.100.1/24 ...
+6 -5
Datei anzeigen
@@ -96,12 +96,13 @@ def check_ray_status():
def wait_for_cluster():
return cluster_manager.wait_for_cluster()
def nuke_vllm_cache():
def nuke_vllm_cache(head_ip):
# Clears the vLLM cache on the local (head) node only, as the original script did.
# cluster_manager also provides nuke_vllm_cache_cluster for a cluster-wide wipe;
# whether to switch to it is still an open question.
# Here we look up this machine's IP on the cluster interface and nuke only that node.
rdma = cluster_manager.get_net_iface()
prefix = ".".join(head_ip.split('.')[:3])
rdma = cluster_manager.get_net_iface(prefix)
local = cluster_manager.get_local_ip(rdma)
cluster_manager.nuke_vllm_cache_on_node(local, is_local=True)
@@ -244,7 +245,7 @@ def configure_and_launch_vllm(model_idx, head_ip):
subprocess.run(["clear"])
if clear_cache:
nuke_vllm_cache()
nuke_vllm_cache(head_ip)
# Environment Setup
# We need to set these variables in the current process before exec or pass them in env
@@ -340,8 +341,8 @@ def main():
check_dependencies()
# Default IPs
head_ip = "192.168.100.1"
worker_ip = "192.168.100.2"
head_ip = os.getenv("VLLM_HEAD_IP", "192.168.100.1")
worker_ip = os.getenv("VLLM_WORKER_IP", "192.168.100.2")
while True:
# Main Menu