feat: Display environment variables, allow choosing between RoCE and Ethernet, and show RCCL debug information
Esse commit está contido em:
@@ -23,6 +23,8 @@ except ImportError:
|
||||
# User requested specifically to test with TP=2 on the cluster.
|
||||
CLUSTER_TP = 2
|
||||
GPU_UTIL = "0.90"
|
||||
FORCE_ETH = False
|
||||
FORCE_DEBUG_NCCL = False
|
||||
|
||||
# THROUGHPUT CONFIG (Imported from models.py)
|
||||
OFF_NUM_PROMPTS = models.OFF_NUM_PROMPTS
|
||||
@@ -66,6 +68,15 @@ def log(msg): print(f"\n[CLUSTER-BENCH] {msg}")
|
||||
def restart_cluster():
|
||||
log("Restarting Ray Cluster (Clean State)...")
|
||||
|
||||
# Push config to env so cluster_manager picks it up for daemon injection
|
||||
os.environ["NCCL_IB_DISABLE"] = "1" if FORCE_ETH else "0"
|
||||
if FORCE_DEBUG_NCCL:
|
||||
os.environ["NCCL_DEBUG"] = "INFO"
|
||||
os.environ["NCCL_DEBUG_SUBSYS"] = "INIT,NET"
|
||||
else:
|
||||
os.environ.pop("NCCL_DEBUG", None)
|
||||
os.environ.pop("NCCL_DEBUG_SUBSYS", None)
|
||||
|
||||
# 1. Stop Cluster (Best Effort)
|
||||
cluster_manager.stop_cluster()
|
||||
|
||||
@@ -130,13 +141,17 @@ def get_cluster_env():
|
||||
env["GLOO_SOCKET_IFNAME"] = rdma_iface
|
||||
# RCCL specific
|
||||
env["NCCL_IB_GID_INDEX"] = "1"
|
||||
env["NCCL_IB_DISABLE"] = "0"
|
||||
env["NCCL_IB_DISABLE"] = "1" if FORCE_ETH else "0"
|
||||
env["NCCL_NET_GDR_LEVEL"] = "0"
|
||||
|
||||
# Stability for RDMA (Fix for high-throughput models like Gemma 3)
|
||||
env["NCCL_IB_TIMEOUT"] = "23" # ~32 seconds (default is 18/~1s)
|
||||
env["NCCL_IB_RETRY_CNT"] = "7" # Default is 3, increase for lossy networks
|
||||
|
||||
if FORCE_DEBUG_NCCL:
|
||||
env["NCCL_DEBUG"] = "INFO"
|
||||
env["NCCL_DEBUG_SUBSYS"] = "INIT,NET"
|
||||
|
||||
return env
|
||||
|
||||
def get_model_args(model):
|
||||
@@ -166,7 +181,8 @@ def get_model_args(model):
|
||||
def get_benchmark_output_file(model, output_dir):
|
||||
model_safe = model.replace("/", "_")
|
||||
output_dir_path = Path(output_dir)
|
||||
return output_dir_path / f"{model_safe}_cluster_tp{CLUSTER_TP}_throughput.json"
|
||||
eth_suffix = "_eth" if FORCE_ETH else ""
|
||||
return output_dir_path / f"{model_safe}_cluster_tp{CLUSTER_TP}{eth_suffix}_throughput.json"
|
||||
|
||||
def run_bench_set(model, backend_name, output_dir, extra_env=None):
|
||||
output_dir_path = Path(output_dir)
|
||||
@@ -244,7 +260,9 @@ def run_cluster_throughput(model):
|
||||
|
||||
|
||||
def print_summary():
|
||||
print(f"\n{'MODEL (TP=2)':<50} | {'Triton':<8} | {'ROCm':<8}")
|
||||
eth_suffix = "_eth" if FORCE_ETH else ""
|
||||
title_suffix = " (Ethernet ONLY)" if FORCE_ETH else ""
|
||||
print(f"\n{f'MODEL (TP=2){title_suffix}':<50} | {'Triton':<8} | {'ROCm':<8}")
|
||||
print("-" * 75)
|
||||
|
||||
for m in MODELS_TO_RUN:
|
||||
@@ -252,14 +270,14 @@ def print_summary():
|
||||
|
||||
# Default
|
||||
try:
|
||||
p1 = RESULTS_DIR / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json"
|
||||
p1 = RESULTS_DIR / f"{msafe}_cluster_tp{CLUSTER_TP}{eth_suffix}_throughput.json"
|
||||
d1 = json.loads(p1.read_text())
|
||||
val1 = f"{d1.get('tokens_per_second', 0):.1f}"
|
||||
except: val1 = "N/A"
|
||||
|
||||
# ROCm
|
||||
try:
|
||||
p2 = Path("benchmark_results_rocm") / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json"
|
||||
p2 = Path("benchmark_results_rocm") / f"{msafe}_cluster_tp{CLUSTER_TP}{eth_suffix}_throughput.json"
|
||||
d2 = json.loads(p2.read_text())
|
||||
val2 = f"{d2.get('tokens_per_second', 0):.1f}"
|
||||
except: val2 = "N/A"
|
||||
@@ -269,14 +287,19 @@ def print_summary():
|
||||
print("-" * 75)
|
||||
|
||||
if __name__ == "__main__":
|
||||
# if not check_ray_status():
|
||||
# log("ERROR: Ray Cluster not ready. Please start it with 'start-vllm-cluster' first.")
|
||||
# sys.exit(1)
|
||||
# We now handle this by restarting the cluster ourselves.
|
||||
pass
|
||||
parser = argparse.ArgumentParser(description="VLLM Cluster Benchmark")
|
||||
parser.add_argument("--eth-only", action="store_true", help="Run benchmark using only Ethernet (disable RDMA/RoCE)")
|
||||
parser.add_argument("--debug-nccl", action="store_true", help="Enable NCCL Debug logging (INFO level for Transport tracking)")
|
||||
args = parser.parse_args()
|
||||
|
||||
FORCE_ETH = args.eth_only
|
||||
FORCE_DEBUG_NCCL = args.debug_nccl
|
||||
|
||||
|
||||
log("Ray Cluster Detected. Starting Benchmarks (Dual Backend)...")
|
||||
if FORCE_ETH:
|
||||
log("Note: Ethernet ONLY mode enabled. RDMA/RoCE disabled.")
|
||||
if FORCE_DEBUG_NCCL:
|
||||
log("Note: NCCL Debug mode enabled (Transport Logging).")
|
||||
log("Note: Eager Mode (--enforce-eager) is ENABLED for cluster stability.")
|
||||
|
||||
for m in MODELS_TO_RUN:
|
||||
|
||||
@@ -31,34 +31,76 @@ def get_subnet_from_ip(ip):
|
||||
parts = ip.split('.')
|
||||
return f"{parts[0]}.{parts[1]}.{parts[2]}.0/24"
|
||||
|
||||
def stop_cluster(nodes=None):
|
||||
def stop_cluster(worker_ip=None):
|
||||
"""
|
||||
Stops Ray on the given nodes (list of IPs).
|
||||
If nodes is None, does nothing (caller should identify nodes first if needed,
|
||||
but typically for a clean start we might just rely on 'ray stop' on each setup).
|
||||
Actually, to be safe, we can try to stop local ray.
|
||||
Stops Ray locally and on the worker node if provided.
|
||||
"""
|
||||
print("Stopping Ray cluster locally...")
|
||||
subprocess.run(["ray", "stop", "--force"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||
|
||||
if worker_ip:
|
||||
print(f"Stopping Ray cluster on worker ({worker_ip})...")
|
||||
ssh_cmd = [
|
||||
"ssh", "-o", "StrictHostKeyChecking=no", worker_ip,
|
||||
"toolbox", "run", "-c", "vllm", "--", "ray", "stop", "--force"
|
||||
]
|
||||
try:
|
||||
subprocess.run(ssh_cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"Warning: Failed to stop worker node completely: {e}")
|
||||
|
||||
def setup_worker_node(worker_ip, head_ip):
|
||||
subnet = get_subnet_from_ip(worker_ip)
|
||||
|
||||
# Script to run on worker
|
||||
# Read overrides from current env
|
||||
nccl_disable_val = os.getenv("NCCL_IB_DISABLE", "0")
|
||||
nccl_debug_val = os.getenv("NCCL_DEBUG", "")
|
||||
|
||||
script = f"""
|
||||
source /etc/profile
|
||||
# Silece the kill command
|
||||
# Silence the kill command
|
||||
ray stop --force > /dev/null 2>&1 || true
|
||||
|
||||
# Calculate Interface dynamically
|
||||
RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
|
||||
|
||||
echo "\\n--- Ray Worker Environment ({worker_ip}) ---"
|
||||
echo "export RAY_DISABLE_METRICS=1"
|
||||
echo "export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1"
|
||||
echo "export RAY_memory_monitor_refresh_ms=0"
|
||||
echo "export VLLM_HOST_IP={worker_ip}"
|
||||
echo "export RDMA_IFACE=$RDMA_IFACE"
|
||||
echo "export NCCL_SOCKET_IFNAME=$RDMA_IFACE"
|
||||
echo "export GLOO_SOCKET_IFNAME=$RDMA_IFACE"
|
||||
echo "export NCCL_IB_TIMEOUT=23"
|
||||
echo "export NCCL_IB_RETRY_CNT=7"
|
||||
echo "export NCCL_IB_DISABLE={nccl_disable_val}"
|
||||
|
||||
export RAY_DISABLE_METRICS=1
|
||||
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||
export RAY_memory_monitor_refresh_ms=0
|
||||
export VLLM_HOST_IP={worker_ip}
|
||||
export RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
|
||||
export RDMA_IFACE=$RDMA_IFACE
|
||||
export NCCL_SOCKET_IFNAME=$RDMA_IFACE
|
||||
export GLOO_SOCKET_IFNAME=$RDMA_IFACE
|
||||
# Stability for RDMA
|
||||
export NCCL_IB_TIMEOUT=23
|
||||
export NCCL_IB_RETRY_CNT=7
|
||||
echo "Starting Ray Worker on {worker_ip} connecting to {head_ip}..."
|
||||
export NCCL_IB_DISABLE={nccl_disable_val}
|
||||
"""
|
||||
if nccl_debug_val:
|
||||
script += f"""
|
||||
echo "export NCCL_DEBUG={nccl_debug_val}"
|
||||
echo "export NCCL_DEBUG_SUBSYS=INIT,NET"
|
||||
export NCCL_DEBUG={nccl_debug_val}
|
||||
export NCCL_DEBUG_SUBSYS=INIT,NET
|
||||
"""
|
||||
|
||||
script += f"""
|
||||
echo "\\nStarting Ray Worker on {worker_ip} connecting to {head_ip}..."
|
||||
if [ "{nccl_disable_val}" = "1" ]; then
|
||||
echo "Note: Worker is configured with NCCL_IB_DISABLE=1 (Ethernet Forced)"
|
||||
fi
|
||||
ray start --address='{head_ip}:6379' --num-gpus=1 --num-cpus=8 --disable-usage-stats
|
||||
"""
|
||||
|
||||
@@ -83,20 +125,55 @@ def setup_head_node(head_ip):
|
||||
|
||||
print(f"Setting up Head Node ({head_ip})...")
|
||||
|
||||
# Read overrides from current env
|
||||
nccl_disable_val = os.getenv("NCCL_IB_DISABLE", "0")
|
||||
nccl_debug_val = os.getenv("NCCL_DEBUG", "")
|
||||
|
||||
script = f"""
|
||||
# Silence the kill command
|
||||
ray stop --force > /dev/null 2>&1 || true
|
||||
|
||||
# Calculate Interface dynamically
|
||||
RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
|
||||
|
||||
echo "\\n--- Ray Head Environment ({head_ip}) ---"
|
||||
echo "export RAY_DISABLE_METRICS=1"
|
||||
echo "export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1"
|
||||
echo "export RAY_memory_monitor_refresh_ms=0"
|
||||
echo "export VLLM_HOST_IP={head_ip}"
|
||||
echo "export RDMA_IFACE=$RDMA_IFACE"
|
||||
echo "export NCCL_SOCKET_IFNAME=$RDMA_IFACE"
|
||||
echo "export GLOO_SOCKET_IFNAME=$RDMA_IFACE"
|
||||
echo "export NCCL_IB_TIMEOUT=23"
|
||||
echo "export NCCL_IB_RETRY_CNT=7"
|
||||
echo "export NCCL_IB_DISABLE={nccl_disable_val}"
|
||||
|
||||
export RAY_DISABLE_METRICS=1
|
||||
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||
export RAY_memory_monitor_refresh_ms=0
|
||||
export VLLM_HOST_IP={head_ip}
|
||||
export RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
|
||||
export RDMA_IFACE=$RDMA_IFACE
|
||||
export NCCL_SOCKET_IFNAME=$RDMA_IFACE
|
||||
export GLOO_SOCKET_IFNAME=$RDMA_IFACE
|
||||
# Stability for RDMA
|
||||
export NCCL_IB_TIMEOUT=23
|
||||
export NCCL_IB_RETRY_CNT=7
|
||||
echo "Starting Ray Head on {head_ip}..."
|
||||
export NCCL_IB_DISABLE={nccl_disable_val}
|
||||
"""
|
||||
|
||||
if nccl_debug_val:
|
||||
script += f"""
|
||||
echo "export NCCL_DEBUG={nccl_debug_val}"
|
||||
echo "export NCCL_DEBUG_SUBSYS=INIT,NET"
|
||||
export NCCL_DEBUG={nccl_debug_val}
|
||||
export NCCL_DEBUG_SUBSYS=INIT,NET
|
||||
"""
|
||||
|
||||
script += f"""
|
||||
echo "\\nStarting Ray Head on {head_ip}..."
|
||||
if [ "{nccl_disable_val}" = "1" ]; then
|
||||
echo "Note: Head is configured with NCCL_IB_DISABLE=1 (Ethernet Forced)"
|
||||
fi
|
||||
ray start --head --port=6379 --node-ip-address={head_ip} --num-gpus=1 --num-cpus=8 --disable-usage-stats --include-dashboard=false
|
||||
"""
|
||||
|
||||
|
||||
@@ -266,7 +266,6 @@ def configure_and_launch_vllm(model_idx, head_ip):
|
||||
env["VLLM_HOST_IP"] = head_ip
|
||||
env["NCCL_SOCKET_IFNAME"] = rdma_iface
|
||||
env["NCCL_IB_GID_INDEX"] = "1"
|
||||
env["NCCL_IB_DISABLE"] = "0"
|
||||
env["NCCL_NET_GDR_LEVEL"] = "0"
|
||||
|
||||
# Also need this for Ray backend?
|
||||
@@ -297,7 +296,20 @@ def configure_and_launch_vllm(model_idx, head_ip):
|
||||
print(f" Config: TP={current_tp} | Seqs={current_seqs} | Ctx={current_ctx}")
|
||||
if use_eager:
|
||||
print(" Note: Eager Mode Enabled (Recommended for Cluster Stability)")
|
||||
print(f" Command: {' '.join(cmd)}")
|
||||
|
||||
print("\n --- Environment Variables ---")
|
||||
vars_to_print = [
|
||||
"RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES",
|
||||
"VLLM_HOST_IP",
|
||||
"NCCL_SOCKET_IFNAME",
|
||||
"NCCL_IB_GID_INDEX",
|
||||
"NCCL_NET_GDR_LEVEL"
|
||||
]
|
||||
for k in vars_to_print:
|
||||
if k in env:
|
||||
print(f" export {k}={env[k]}")
|
||||
|
||||
print(f"\n Command: {' '.join(cmd)}")
|
||||
print("="*60 + "\n")
|
||||
|
||||
# Exec
|
||||
@@ -335,21 +347,24 @@ def main():
|
||||
# Main Menu
|
||||
# 1. Configure IPs
|
||||
# 2. Start Cluster (Ray)
|
||||
# 3. Start VLLM
|
||||
# 4. Exit
|
||||
# 3. Stop Ray Cluster
|
||||
# 4. Ray Cluster Status
|
||||
# 5. Launch VLLM Serve
|
||||
# 6. Exit
|
||||
|
||||
choice = run_dialog([
|
||||
"--clear", "--backtitle", "AMD VLLM RCCL Cluster Manager",
|
||||
"--title", "Main Menu",
|
||||
"--menu", "Select Action:", "15", "60", "5",
|
||||
"--menu", "Select Action:", "16", "60", "6",
|
||||
"1", f"Configure IPs (Head: {head_ip}, Worker: {worker_ip})",
|
||||
"2", "Start Ray Cluster",
|
||||
"3", "Ray Cluster Status",
|
||||
"4", "Launch VLLM Serve",
|
||||
"5", "Exit"
|
||||
"3", "Stop Ray Cluster",
|
||||
"4", "Ray Cluster Status",
|
||||
"5", "Launch VLLM Serve",
|
||||
"6", "Exit"
|
||||
])
|
||||
|
||||
if not choice or choice == "5":
|
||||
if not choice or choice == "6":
|
||||
subprocess.run(["clear"])
|
||||
sys.exit(0)
|
||||
|
||||
@@ -359,25 +374,71 @@ def main():
|
||||
head_ip, worker_ip = res
|
||||
|
||||
elif choice == "2":
|
||||
subprocess.run(["clear"])
|
||||
print("= Starting Ray Cluster Setup =")
|
||||
# 1. Start Head
|
||||
if setup_head_node(head_ip):
|
||||
print("Head node started successfully. Waiting 5s before worker connection...")
|
||||
time.sleep(5)
|
||||
# 2. Start Worker
|
||||
if setup_worker_node(worker_ip, head_ip):
|
||||
# 3. Wait for full cluster
|
||||
wait_for_cluster()
|
||||
input("Press Enter to continue...")
|
||||
force_ethernet = False
|
||||
enable_nccl_debug = False
|
||||
|
||||
while True:
|
||||
eth_status = "YES" if force_ethernet else "NO"
|
||||
debug_status = "YES" if enable_nccl_debug else "NO"
|
||||
|
||||
c_choice = run_dialog([
|
||||
"--clear", "--backtitle", "AMD VLLM RCCL Cluster Manager",
|
||||
"--title", "Cluster Network Configuration",
|
||||
"--menu", "Set Network Parameters before starting Ray:", "15", "65", "3",
|
||||
"1", f"Force Ethernet (Disable RDMA/RoCE): {eth_status}",
|
||||
"2", f"Enable NCCL Debug Logging: {debug_status}",
|
||||
"3", "START CLUSTER"
|
||||
])
|
||||
if not c_choice: break
|
||||
|
||||
if c_choice == "1":
|
||||
force_ethernet = not force_ethernet
|
||||
elif c_choice == "2":
|
||||
enable_nccl_debug = not enable_nccl_debug
|
||||
elif c_choice == "3":
|
||||
os.environ["NCCL_IB_DISABLE"] = "1" if force_ethernet else "0"
|
||||
if enable_nccl_debug:
|
||||
os.environ["NCCL_DEBUG"] = "INFO"
|
||||
os.environ["NCCL_DEBUG_SUBSYS"] = "INIT,NET"
|
||||
else:
|
||||
os.environ.pop("NCCL_DEBUG", None)
|
||||
os.environ.pop("NCCL_DEBUG_SUBSYS", None)
|
||||
|
||||
subprocess.run(["clear"])
|
||||
print("= Starting Ray Cluster Setup =")
|
||||
# 1. Start Head
|
||||
if setup_head_node(head_ip):
|
||||
print("Head node started successfully. Waiting 5s before worker connection...")
|
||||
time.sleep(5)
|
||||
# 2. Start Worker
|
||||
if setup_worker_node(worker_ip, head_ip):
|
||||
# 3. Wait for full cluster
|
||||
wait_for_cluster()
|
||||
input("Press Enter to continue...")
|
||||
break
|
||||
|
||||
elif choice == "3":
|
||||
subprocess.run(["clear"])
|
||||
print("= Ray Cluster Status =")
|
||||
subprocess.run(["ray", "status"])
|
||||
input("\nPress Enter to continue...")
|
||||
|
||||
elif choice == "3":
|
||||
subprocess.run(["clear"])
|
||||
print("= Stopping Ray Cluster =")
|
||||
cluster_manager.stop_cluster(worker_ip)
|
||||
input("\nPress Enter to continue...")
|
||||
|
||||
elif choice == "4":
|
||||
subprocess.run(["clear"])
|
||||
print("= Ray Cluster Status =")
|
||||
res = subprocess.run(["ray", "status"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
||||
if res.returncode != 0:
|
||||
print("\n[!] Cluster is Offline or Unreachable.")
|
||||
print("Please start the cluster first via Option 2 (Start Ray Cluster).")
|
||||
else:
|
||||
print(res.stdout)
|
||||
input("\nPress Enter to continue...")
|
||||
|
||||
elif choice == "5":
|
||||
# Select Model
|
||||
menu_items = []
|
||||
for i, m_id in enumerate(MODELS_TO_RUN):
|
||||
|
||||
Referência em uma Nova Issue
Bloquear um usuário