feat: Display environment variables, allow choosing between RoCE and Ethernet, and show RCCL debug information

このコミットが含まれているのは:
Donato Capitella
2026-02-22 20:07:34 +00:00
コミット 91b6dbc270
3個のファイルの変更205行の追加44行の削除
+34 -11
ファイルの表示
@@ -23,6 +23,8 @@ except ImportError:
# User requested specifically to test with TP=2 on the cluster. # User requested specifically to test with TP=2 on the cluster.
CLUSTER_TP = 2 CLUSTER_TP = 2
GPU_UTIL = "0.90" GPU_UTIL = "0.90"
FORCE_ETH = False
FORCE_DEBUG_NCCL = False
# THROUGHPUT CONFIG (Imported from models.py) # THROUGHPUT CONFIG (Imported from models.py)
OFF_NUM_PROMPTS = models.OFF_NUM_PROMPTS OFF_NUM_PROMPTS = models.OFF_NUM_PROMPTS
@@ -66,6 +68,15 @@ def log(msg): print(f"\n[CLUSTER-BENCH] {msg}")
def restart_cluster(): def restart_cluster():
log("Restarting Ray Cluster (Clean State)...") log("Restarting Ray Cluster (Clean State)...")
# Push config to env so cluster_manager picks it up for daemon injection
os.environ["NCCL_IB_DISABLE"] = "1" if FORCE_ETH else "0"
if FORCE_DEBUG_NCCL:
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["NCCL_DEBUG_SUBSYS"] = "INIT,NET"
else:
os.environ.pop("NCCL_DEBUG", None)
os.environ.pop("NCCL_DEBUG_SUBSYS", None)
# 1. Stop Cluster (Best Effort) # 1. Stop Cluster (Best Effort)
cluster_manager.stop_cluster() cluster_manager.stop_cluster()
@@ -130,13 +141,17 @@ def get_cluster_env():
env["GLOO_SOCKET_IFNAME"] = rdma_iface env["GLOO_SOCKET_IFNAME"] = rdma_iface
# RCCL specific # RCCL specific
env["NCCL_IB_GID_INDEX"] = "1" env["NCCL_IB_GID_INDEX"] = "1"
env["NCCL_IB_DISABLE"] = "0" env["NCCL_IB_DISABLE"] = "1" if FORCE_ETH else "0"
env["NCCL_NET_GDR_LEVEL"] = "0" env["NCCL_NET_GDR_LEVEL"] = "0"
# Stability for RDMA (Fix for high-throughput models like Gemma 3) # Stability for RDMA (Fix for high-throughput models like Gemma 3)
env["NCCL_IB_TIMEOUT"] = "23" # ~32 seconds (default is 18/~1s) env["NCCL_IB_TIMEOUT"] = "23" # ~32 seconds (default is 18/~1s)
env["NCCL_IB_RETRY_CNT"] = "7" # Default is 3, increase for lossy networks env["NCCL_IB_RETRY_CNT"] = "7" # Default is 3, increase for lossy networks
if FORCE_DEBUG_NCCL:
env["NCCL_DEBUG"] = "INFO"
env["NCCL_DEBUG_SUBSYS"] = "INIT,NET"
return env return env
def get_model_args(model): def get_model_args(model):
@@ -166,7 +181,8 @@ def get_model_args(model):
def get_benchmark_output_file(model, output_dir): def get_benchmark_output_file(model, output_dir):
model_safe = model.replace("/", "_") model_safe = model.replace("/", "_")
output_dir_path = Path(output_dir) output_dir_path = Path(output_dir)
return output_dir_path / f"{model_safe}_cluster_tp{CLUSTER_TP}_throughput.json" eth_suffix = "_eth" if FORCE_ETH else ""
return output_dir_path / f"{model_safe}_cluster_tp{CLUSTER_TP}{eth_suffix}_throughput.json"
def run_bench_set(model, backend_name, output_dir, extra_env=None): def run_bench_set(model, backend_name, output_dir, extra_env=None):
output_dir_path = Path(output_dir) output_dir_path = Path(output_dir)
@@ -244,7 +260,9 @@ def run_cluster_throughput(model):
def print_summary(): def print_summary():
print(f"\n{'MODEL (TP=2)':<50} | {'Triton':<8} | {'ROCm':<8}") eth_suffix = "_eth" if FORCE_ETH else ""
title_suffix = " (Ethernet ONLY)" if FORCE_ETH else ""
print(f"\n{f'MODEL (TP=2){title_suffix}':<50} | {'Triton':<8} | {'ROCm':<8}")
print("-" * 75) print("-" * 75)
for m in MODELS_TO_RUN: for m in MODELS_TO_RUN:
@@ -252,14 +270,14 @@ def print_summary():
# Default # Default
try: try:
p1 = RESULTS_DIR / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json" p1 = RESULTS_DIR / f"{msafe}_cluster_tp{CLUSTER_TP}{eth_suffix}_throughput.json"
d1 = json.loads(p1.read_text()) d1 = json.loads(p1.read_text())
val1 = f"{d1.get('tokens_per_second', 0):.1f}" val1 = f"{d1.get('tokens_per_second', 0):.1f}"
except: val1 = "N/A" except: val1 = "N/A"
# ROCm # ROCm
try: try:
p2 = Path("benchmark_results_rocm") / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json" p2 = Path("benchmark_results_rocm") / f"{msafe}_cluster_tp{CLUSTER_TP}{eth_suffix}_throughput.json"
d2 = json.loads(p2.read_text()) d2 = json.loads(p2.read_text())
val2 = f"{d2.get('tokens_per_second', 0):.1f}" val2 = f"{d2.get('tokens_per_second', 0):.1f}"
except: val2 = "N/A" except: val2 = "N/A"
@@ -269,14 +287,19 @@ def print_summary():
print("-" * 75) print("-" * 75)
if __name__ == "__main__": if __name__ == "__main__":
# if not check_ray_status(): parser = argparse.ArgumentParser(description="VLLM Cluster Benchmark")
# log("ERROR: Ray Cluster not ready. Please start it with 'start-vllm-cluster' first.") parser.add_argument("--eth-only", action="store_true", help="Run benchmark using only Ethernet (disable RDMA/RoCE)")
# sys.exit(1) parser.add_argument("--debug-nccl", action="store_true", help="Enable NCCL Debug logging (INFO level for Transport tracking)")
# We now handle this by restarting the cluster ourselves. args = parser.parse_args()
pass
FORCE_ETH = args.eth_only
FORCE_DEBUG_NCCL = args.debug_nccl
log("Ray Cluster Detected. Starting Benchmarks (Dual Backend)...") log("Ray Cluster Detected. Starting Benchmarks (Dual Backend)...")
if FORCE_ETH:
log("Note: Ethernet ONLY mode enabled. RDMA/RoCE disabled.")
if FORCE_DEBUG_NCCL:
log("Note: NCCL Debug mode enabled (Transport Logging).")
log("Note: Eager Mode (--enforce-eager) is ENABLED for cluster stability.") log("Note: Eager Mode (--enforce-eager) is ENABLED for cluster stability.")
for m in MODELS_TO_RUN: for m in MODELS_TO_RUN:
+88 -11
ファイルの表示
@@ -31,34 +31,76 @@ def get_subnet_from_ip(ip):
parts = ip.split('.') parts = ip.split('.')
return f"{parts[0]}.{parts[1]}.{parts[2]}.0/24" return f"{parts[0]}.{parts[1]}.{parts[2]}.0/24"
def stop_cluster(nodes=None): def stop_cluster(worker_ip=None):
""" """
Stops Ray on the given nodes (list of IPs). Stops Ray locally and on the worker node if provided.
If nodes is None, does nothing (caller should identify nodes first if needed,
but typically for a clean start we might just rely on 'ray stop' on each setup).
Actually, to be safe, we can try to stop local ray.
""" """
print("Stopping Ray cluster locally...")
subprocess.run(["ray", "stop", "--force"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) subprocess.run(["ray", "stop", "--force"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
if worker_ip:
print(f"Stopping Ray cluster on worker ({worker_ip})...")
ssh_cmd = [
"ssh", "-o", "StrictHostKeyChecking=no", worker_ip,
"toolbox", "run", "-c", "vllm", "--", "ray", "stop", "--force"
]
try:
subprocess.run(ssh_cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
except subprocess.CalledProcessError as e:
print(f"Warning: Failed to stop worker node completely: {e}")
def setup_worker_node(worker_ip, head_ip): def setup_worker_node(worker_ip, head_ip):
subnet = get_subnet_from_ip(worker_ip) subnet = get_subnet_from_ip(worker_ip)
# Script to run on worker # Read overrides from current env
nccl_disable_val = os.getenv("NCCL_IB_DISABLE", "0")
nccl_debug_val = os.getenv("NCCL_DEBUG", "")
script = f""" script = f"""
source /etc/profile source /etc/profile
# Silece the kill command # Silence the kill command
ray stop --force > /dev/null 2>&1 || true ray stop --force > /dev/null 2>&1 || true
# Calculate Interface dynamically
RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
echo "\\n--- Ray Worker Environment ({worker_ip}) ---"
echo "export RAY_DISABLE_METRICS=1"
echo "export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1"
echo "export RAY_memory_monitor_refresh_ms=0"
echo "export VLLM_HOST_IP={worker_ip}"
echo "export RDMA_IFACE=$RDMA_IFACE"
echo "export NCCL_SOCKET_IFNAME=$RDMA_IFACE"
echo "export GLOO_SOCKET_IFNAME=$RDMA_IFACE"
echo "export NCCL_IB_TIMEOUT=23"
echo "export NCCL_IB_RETRY_CNT=7"
echo "export NCCL_IB_DISABLE={nccl_disable_val}"
export RAY_DISABLE_METRICS=1 export RAY_DISABLE_METRICS=1
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
export RAY_memory_monitor_refresh_ms=0 export RAY_memory_monitor_refresh_ms=0
export VLLM_HOST_IP={worker_ip} export VLLM_HOST_IP={worker_ip}
export RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1) export RDMA_IFACE=$RDMA_IFACE
export NCCL_SOCKET_IFNAME=$RDMA_IFACE export NCCL_SOCKET_IFNAME=$RDMA_IFACE
export GLOO_SOCKET_IFNAME=$RDMA_IFACE export GLOO_SOCKET_IFNAME=$RDMA_IFACE
# Stability for RDMA # Stability for RDMA
export NCCL_IB_TIMEOUT=23 export NCCL_IB_TIMEOUT=23
export NCCL_IB_RETRY_CNT=7 export NCCL_IB_RETRY_CNT=7
echo "Starting Ray Worker on {worker_ip} connecting to {head_ip}..." export NCCL_IB_DISABLE={nccl_disable_val}
"""
if nccl_debug_val:
script += f"""
echo "export NCCL_DEBUG={nccl_debug_val}"
echo "export NCCL_DEBUG_SUBSYS=INIT,NET"
export NCCL_DEBUG={nccl_debug_val}
export NCCL_DEBUG_SUBSYS=INIT,NET
"""
script += f"""
echo "\\nStarting Ray Worker on {worker_ip} connecting to {head_ip}..."
if [ "{nccl_disable_val}" = "1" ]; then
echo "Note: Worker is configured with NCCL_IB_DISABLE=1 (Ethernet Forced)"
fi
ray start --address='{head_ip}:6379' --num-gpus=1 --num-cpus=8 --disable-usage-stats ray start --address='{head_ip}:6379' --num-gpus=1 --num-cpus=8 --disable-usage-stats
""" """
@@ -83,20 +125,55 @@ def setup_head_node(head_ip):
print(f"Setting up Head Node ({head_ip})...") print(f"Setting up Head Node ({head_ip})...")
# Read overrides from current env
nccl_disable_val = os.getenv("NCCL_IB_DISABLE", "0")
nccl_debug_val = os.getenv("NCCL_DEBUG", "")
script = f""" script = f"""
# Silence the kill command # Silence the kill command
ray stop --force > /dev/null 2>&1 || true ray stop --force > /dev/null 2>&1 || true
# Calculate Interface dynamically
RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
echo "\\n--- Ray Head Environment ({head_ip}) ---"
echo "export RAY_DISABLE_METRICS=1"
echo "export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1"
echo "export RAY_memory_monitor_refresh_ms=0"
echo "export VLLM_HOST_IP={head_ip}"
echo "export RDMA_IFACE=$RDMA_IFACE"
echo "export NCCL_SOCKET_IFNAME=$RDMA_IFACE"
echo "export GLOO_SOCKET_IFNAME=$RDMA_IFACE"
echo "export NCCL_IB_TIMEOUT=23"
echo "export NCCL_IB_RETRY_CNT=7"
echo "export NCCL_IB_DISABLE={nccl_disable_val}"
export RAY_DISABLE_METRICS=1 export RAY_DISABLE_METRICS=1
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
export RAY_memory_monitor_refresh_ms=0 export RAY_memory_monitor_refresh_ms=0
export VLLM_HOST_IP={head_ip} export VLLM_HOST_IP={head_ip}
export RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1) export RDMA_IFACE=$RDMA_IFACE
export NCCL_SOCKET_IFNAME=$RDMA_IFACE export NCCL_SOCKET_IFNAME=$RDMA_IFACE
export GLOO_SOCKET_IFNAME=$RDMA_IFACE export GLOO_SOCKET_IFNAME=$RDMA_IFACE
# Stability for RDMA # Stability for RDMA
export NCCL_IB_TIMEOUT=23 export NCCL_IB_TIMEOUT=23
export NCCL_IB_RETRY_CNT=7 export NCCL_IB_RETRY_CNT=7
echo "Starting Ray Head on {head_ip}..." export NCCL_IB_DISABLE={nccl_disable_val}
"""
if nccl_debug_val:
script += f"""
echo "export NCCL_DEBUG={nccl_debug_val}"
echo "export NCCL_DEBUG_SUBSYS=INIT,NET"
export NCCL_DEBUG={nccl_debug_val}
export NCCL_DEBUG_SUBSYS=INIT,NET
"""
script += f"""
echo "\\nStarting Ray Head on {head_ip}..."
if [ "{nccl_disable_val}" = "1" ]; then
echo "Note: Head is configured with NCCL_IB_DISABLE=1 (Ethernet Forced)"
fi
ray start --head --port=6379 --node-ip-address={head_ip} --num-gpus=1 --num-cpus=8 --disable-usage-stats --include-dashboard=false ray start --head --port=6379 --node-ip-address={head_ip} --num-gpus=1 --num-cpus=8 --disable-usage-stats --include-dashboard=false
""" """
+83 -22
ファイルの表示
@@ -266,7 +266,6 @@ def configure_and_launch_vllm(model_idx, head_ip):
env["VLLM_HOST_IP"] = head_ip env["VLLM_HOST_IP"] = head_ip
env["NCCL_SOCKET_IFNAME"] = rdma_iface env["NCCL_SOCKET_IFNAME"] = rdma_iface
env["NCCL_IB_GID_INDEX"] = "1" env["NCCL_IB_GID_INDEX"] = "1"
env["NCCL_IB_DISABLE"] = "0"
env["NCCL_NET_GDR_LEVEL"] = "0" env["NCCL_NET_GDR_LEVEL"] = "0"
# Also need this for Ray backend? # Also need this for Ray backend?
@@ -297,7 +296,20 @@ def configure_and_launch_vllm(model_idx, head_ip):
print(f" Config: TP={current_tp} | Seqs={current_seqs} | Ctx={current_ctx}") print(f" Config: TP={current_tp} | Seqs={current_seqs} | Ctx={current_ctx}")
if use_eager: if use_eager:
print(" Note: Eager Mode Enabled (Recommended for Cluster Stability)") print(" Note: Eager Mode Enabled (Recommended for Cluster Stability)")
print(f" Command: {' '.join(cmd)}")
print("\n --- Environment Variables ---")
vars_to_print = [
"RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES",
"VLLM_HOST_IP",
"NCCL_SOCKET_IFNAME",
"NCCL_IB_GID_INDEX",
"NCCL_NET_GDR_LEVEL"
]
for k in vars_to_print:
if k in env:
print(f" export {k}={env[k]}")
print(f"\n Command: {' '.join(cmd)}")
print("="*60 + "\n") print("="*60 + "\n")
# Exec # Exec
@@ -335,21 +347,24 @@ def main():
# Main Menu # Main Menu
# 1. Configure IPs # 1. Configure IPs
# 2. Start Cluster (Ray) # 2. Start Cluster (Ray)
# 3. Start VLLM # 3. Stop Ray Cluster
# 4. Exit # 4. Ray Cluster Status
# 5. Launch VLLM Serve
# 6. Exit
choice = run_dialog([ choice = run_dialog([
"--clear", "--backtitle", "AMD VLLM RCCL Cluster Manager", "--clear", "--backtitle", "AMD VLLM RCCL Cluster Manager",
"--title", "Main Menu", "--title", "Main Menu",
"--menu", "Select Action:", "15", "60", "5", "--menu", "Select Action:", "16", "60", "6",
"1", f"Configure IPs (Head: {head_ip}, Worker: {worker_ip})", "1", f"Configure IPs (Head: {head_ip}, Worker: {worker_ip})",
"2", "Start Ray Cluster", "2", "Start Ray Cluster",
"3", "Ray Cluster Status", "3", "Stop Ray Cluster",
"4", "Launch VLLM Serve", "4", "Ray Cluster Status",
"5", "Exit" "5", "Launch VLLM Serve",
"6", "Exit"
]) ])
if not choice or choice == "5": if not choice or choice == "6":
subprocess.run(["clear"]) subprocess.run(["clear"])
sys.exit(0) sys.exit(0)
@@ -359,25 +374,71 @@ def main():
head_ip, worker_ip = res head_ip, worker_ip = res
elif choice == "2": elif choice == "2":
subprocess.run(["clear"]) force_ethernet = False
print("= Starting Ray Cluster Setup =") enable_nccl_debug = False
# 1. Start Head
if setup_head_node(head_ip): while True:
print("Head node started successfully. Waiting 5s before worker connection...") eth_status = "YES" if force_ethernet else "NO"
time.sleep(5) debug_status = "YES" if enable_nccl_debug else "NO"
# 2. Start Worker
if setup_worker_node(worker_ip, head_ip): c_choice = run_dialog([
# 3. Wait for full cluster "--clear", "--backtitle", "AMD VLLM RCCL Cluster Manager",
wait_for_cluster() "--title", "Cluster Network Configuration",
input("Press Enter to continue...") "--menu", "Set Network Parameters before starting Ray:", "15", "65", "3",
"1", f"Force Ethernet (Disable RDMA/RoCE): {eth_status}",
"2", f"Enable NCCL Debug Logging: {debug_status}",
"3", "START CLUSTER"
])
if not c_choice: break
if c_choice == "1":
force_ethernet = not force_ethernet
elif c_choice == "2":
enable_nccl_debug = not enable_nccl_debug
elif c_choice == "3":
os.environ["NCCL_IB_DISABLE"] = "1" if force_ethernet else "0"
if enable_nccl_debug:
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["NCCL_DEBUG_SUBSYS"] = "INIT,NET"
else:
os.environ.pop("NCCL_DEBUG", None)
os.environ.pop("NCCL_DEBUG_SUBSYS", None)
subprocess.run(["clear"])
print("= Starting Ray Cluster Setup =")
# 1. Start Head
if setup_head_node(head_ip):
print("Head node started successfully. Waiting 5s before worker connection...")
time.sleep(5)
# 2. Start Worker
if setup_worker_node(worker_ip, head_ip):
# 3. Wait for full cluster
wait_for_cluster()
input("Press Enter to continue...")
break
elif choice == "3":
subprocess.run(["clear"])
print("= Ray Cluster Status =") print("= Ray Cluster Status =")
subprocess.run(["ray", "status"]) subprocess.run(["ray", "status"])
input("\nPress Enter to continue...") input("\nPress Enter to continue...")
elif choice == "3":
subprocess.run(["clear"])
print("= Stopping Ray Cluster =")
cluster_manager.stop_cluster(worker_ip)
input("\nPress Enter to continue...")
elif choice == "4": elif choice == "4":
subprocess.run(["clear"])
print("= Ray Cluster Status =")
res = subprocess.run(["ray", "status"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
if res.returncode != 0:
print("\n[!] Cluster is Offline or Unreachable.")
print("Please start the cluster first via Option 2 (Start Ray Cluster).")
else:
print(res.stdout)
input("\nPress Enter to continue...")
elif choice == "5":
# Select Model # Select Model
menu_items = [] menu_items = []
for i, m_id in enumerate(MODELS_TO_RUN): for i, m_id in enumerate(MODELS_TO_RUN):