feat: Display environment variables, allow choosing between RoCE and Ethernet, and show RCCL debug information

Esse commit está contido em:
Donato Capitella
2026-02-22 20:07:34 +00:00
commit 91b6dbc270
3 arquivos alterados com 205 adições e 44 exclusões
+34 -11
Ver Arquivo
@@ -23,6 +23,8 @@ except ImportError:
# User requested specifically to test with TP=2 on the cluster.
CLUSTER_TP = 2
GPU_UTIL = "0.90"
FORCE_ETH = False
FORCE_DEBUG_NCCL = False
# THROUGHPUT CONFIG (Imported from models.py)
OFF_NUM_PROMPTS = models.OFF_NUM_PROMPTS
@@ -66,6 +68,15 @@ def log(msg): print(f"\n[CLUSTER-BENCH] {msg}")
def restart_cluster():
log("Restarting Ray Cluster (Clean State)...")
# Push config to env so cluster_manager picks it up for daemon injection
os.environ["NCCL_IB_DISABLE"] = "1" if FORCE_ETH else "0"
if FORCE_DEBUG_NCCL:
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["NCCL_DEBUG_SUBSYS"] = "INIT,NET"
else:
os.environ.pop("NCCL_DEBUG", None)
os.environ.pop("NCCL_DEBUG_SUBSYS", None)
# 1. Stop Cluster (Best Effort)
cluster_manager.stop_cluster()
@@ -130,13 +141,17 @@ def get_cluster_env():
env["GLOO_SOCKET_IFNAME"] = rdma_iface
# RCCL specific
env["NCCL_IB_GID_INDEX"] = "1"
env["NCCL_IB_DISABLE"] = "0"
env["NCCL_IB_DISABLE"] = "1" if FORCE_ETH else "0"
env["NCCL_NET_GDR_LEVEL"] = "0"
# Stability for RDMA (Fix for high-throughput models like Gemma 3)
env["NCCL_IB_TIMEOUT"] = "23" # ~32 seconds (default is 18/~1s)
env["NCCL_IB_RETRY_CNT"] = "7" # Default is 3, increase for lossy networks
if FORCE_DEBUG_NCCL:
env["NCCL_DEBUG"] = "INFO"
env["NCCL_DEBUG_SUBSYS"] = "INIT,NET"
return env
def get_model_args(model):
@@ -166,7 +181,8 @@ def get_model_args(model):
def get_benchmark_output_file(model, output_dir):
    """Return the path of the throughput-result JSON file for ``model``.

    The file name is built from the model id (with "/" replaced by "_"),
    the module-level ``CLUSTER_TP`` value and, when the module-level
    ``FORCE_ETH`` flag is set, an ``_eth`` marker, which gives
    Ethernet-only runs a file name distinct from the default runs.

    Args:
        model: Model identifier, e.g. "org/name".
        output_dir: Directory (str or path-like) that holds the results.

    Returns:
        A ``pathlib.Path`` pointing at the result file inside ``output_dir``.
    """
    model_safe = model.replace("/", "_")
    # Only one return: the suffix must be computed before the path is built,
    # otherwise the Ethernet marker never reaches the file name.
    eth_suffix = "_eth" if FORCE_ETH else ""
    file_name = f"{model_safe}_cluster_tp{CLUSTER_TP}{eth_suffix}_throughput.json"
    return Path(output_dir) / file_name
def run_bench_set(model, backend_name, output_dir, extra_env=None):
output_dir_path = Path(output_dir)
@@ -244,7 +260,9 @@ def run_cluster_throughput(model):
def print_summary():
print(f"\n{'MODEL (TP=2)':<50} | {'Triton':<8} | {'ROCm':<8}")
eth_suffix = "_eth" if FORCE_ETH else ""
title_suffix = " (Ethernet ONLY)" if FORCE_ETH else ""
print(f"\n{f'MODEL (TP=2){title_suffix}':<50} | {'Triton':<8} | {'ROCm':<8}")
print("-" * 75)
for m in MODELS_TO_RUN:
@@ -252,14 +270,14 @@ def print_summary():
# Default
try:
p1 = RESULTS_DIR / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json"
p1 = RESULTS_DIR / f"{msafe}_cluster_tp{CLUSTER_TP}{eth_suffix}_throughput.json"
d1 = json.loads(p1.read_text())
val1 = f"{d1.get('tokens_per_second', 0):.1f}"
except: val1 = "N/A"
# ROCm
try:
p2 = Path("benchmark_results_rocm") / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json"
p2 = Path("benchmark_results_rocm") / f"{msafe}_cluster_tp{CLUSTER_TP}{eth_suffix}_throughput.json"
d2 = json.loads(p2.read_text())
val2 = f"{d2.get('tokens_per_second', 0):.1f}"
except: val2 = "N/A"
@@ -269,14 +287,19 @@ def print_summary():
print("-" * 75)
if __name__ == "__main__":
# if not check_ray_status():
# log("ERROR: Ray Cluster not ready. Please start it with 'start-vllm-cluster' first.")
# sys.exit(1)
# We now handle this by restarting the cluster ourselves.
pass
parser = argparse.ArgumentParser(description="VLLM Cluster Benchmark")
parser.add_argument("--eth-only", action="store_true", help="Run benchmark using only Ethernet (disable RDMA/RoCE)")
parser.add_argument("--debug-nccl", action="store_true", help="Enable NCCL Debug logging (INFO level for Transport tracking)")
args = parser.parse_args()
FORCE_ETH = args.eth_only
FORCE_DEBUG_NCCL = args.debug_nccl
log("Ray Cluster Detected. Starting Benchmarks (Dual Backend)...")
if FORCE_ETH:
log("Note: Ethernet ONLY mode enabled. RDMA/RoCE disabled.")
if FORCE_DEBUG_NCCL:
log("Note: NCCL Debug mode enabled (Transport Logging).")
log("Note: Eager Mode (--enforce-eager) is ENABLED for cluster stability.")
for m in MODELS_TO_RUN:
+88 -11
Ver Arquivo
@@ -31,34 +31,76 @@ def get_subnet_from_ip(ip):
parts = ip.split('.')
return f"{parts[0]}.{parts[1]}.{parts[2]}.0/24"
def stop_cluster(worker_ip=None):
    """
    Stops Ray locally and on the worker node if provided.

    Both stops are best effort: a command that cannot be launched or that
    exits with an error is reported as a warning instead of raising.

    Args:
        worker_ip: Address of the worker node reached over SSH. When it is
            None (or empty) only the local Ray instance is stopped.
    """
    print("Stopping Ray cluster locally...")
    try:
        # Exit status is intentionally ignored: Ray may simply not be running.
        subprocess.run(
            ["ray", "stop", "--force"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError as e:
        # Raised when the `ray` executable cannot be started at all.
        print(f"Warning: Failed to run 'ray stop' locally: {e}")

    if not worker_ip:
        return

    print(f"Stopping Ray cluster on worker ({worker_ip})...")
    # The remote command is run through `toolbox run -c vllm`.
    ssh_cmd = [
        "ssh", "-o", "StrictHostKeyChecking=no", worker_ip,
        "toolbox", "run", "-c", "vllm", "--", "ray", "stop", "--force",
    ]
    try:
        subprocess.run(
            ssh_cmd,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (subprocess.CalledProcessError, OSError) as e:
        print(f"Warning: Failed to stop worker node completely: {e}")
def setup_worker_node(worker_ip, head_ip):
subnet = get_subnet_from_ip(worker_ip)
# Script to run on worker
# Read overrides from current env
nccl_disable_val = os.getenv("NCCL_IB_DISABLE", "0")
nccl_debug_val = os.getenv("NCCL_DEBUG", "")
script = f"""
source /etc/profile
# Silece the kill command
# Silence the kill command
ray stop --force > /dev/null 2>&1 || true
# Calculate Interface dynamically
RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
echo "\\n--- Ray Worker Environment ({worker_ip}) ---"
echo "export RAY_DISABLE_METRICS=1"
echo "export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1"
echo "export RAY_memory_monitor_refresh_ms=0"
echo "export VLLM_HOST_IP={worker_ip}"
echo "export RDMA_IFACE=$RDMA_IFACE"
echo "export NCCL_SOCKET_IFNAME=$RDMA_IFACE"
echo "export GLOO_SOCKET_IFNAME=$RDMA_IFACE"
echo "export NCCL_IB_TIMEOUT=23"
echo "export NCCL_IB_RETRY_CNT=7"
echo "export NCCL_IB_DISABLE={nccl_disable_val}"
export RAY_DISABLE_METRICS=1
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
export RAY_memory_monitor_refresh_ms=0
export VLLM_HOST_IP={worker_ip}
export RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
export RDMA_IFACE=$RDMA_IFACE
export NCCL_SOCKET_IFNAME=$RDMA_IFACE
export GLOO_SOCKET_IFNAME=$RDMA_IFACE
# Stability for RDMA
export NCCL_IB_TIMEOUT=23
export NCCL_IB_RETRY_CNT=7
echo "Starting Ray Worker on {worker_ip} connecting to {head_ip}..."
export NCCL_IB_DISABLE={nccl_disable_val}
"""
if nccl_debug_val:
script += f"""
echo "export NCCL_DEBUG={nccl_debug_val}"
echo "export NCCL_DEBUG_SUBSYS=INIT,NET"
export NCCL_DEBUG={nccl_debug_val}
export NCCL_DEBUG_SUBSYS=INIT,NET
"""
script += f"""
echo "\\nStarting Ray Worker on {worker_ip} connecting to {head_ip}..."
if [ "{nccl_disable_val}" = "1" ]; then
echo "Note: Worker is configured with NCCL_IB_DISABLE=1 (Ethernet Forced)"
fi
ray start --address='{head_ip}:6379' --num-gpus=1 --num-cpus=8 --disable-usage-stats
"""
@@ -83,20 +125,55 @@ def setup_head_node(head_ip):
print(f"Setting up Head Node ({head_ip})...")
# Read overrides from current env
nccl_disable_val = os.getenv("NCCL_IB_DISABLE", "0")
nccl_debug_val = os.getenv("NCCL_DEBUG", "")
script = f"""
# Silence the kill command
ray stop --force > /dev/null 2>&1 || true
# Calculate Interface dynamically
RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
echo "\\n--- Ray Head Environment ({head_ip}) ---"
echo "export RAY_DISABLE_METRICS=1"
echo "export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1"
echo "export RAY_memory_monitor_refresh_ms=0"
echo "export VLLM_HOST_IP={head_ip}"
echo "export RDMA_IFACE=$RDMA_IFACE"
echo "export NCCL_SOCKET_IFNAME=$RDMA_IFACE"
echo "export GLOO_SOCKET_IFNAME=$RDMA_IFACE"
echo "export NCCL_IB_TIMEOUT=23"
echo "export NCCL_IB_RETRY_CNT=7"
echo "export NCCL_IB_DISABLE={nccl_disable_val}"
export RAY_DISABLE_METRICS=1
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
export RAY_memory_monitor_refresh_ms=0
export VLLM_HOST_IP={head_ip}
export RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
export RDMA_IFACE=$RDMA_IFACE
export NCCL_SOCKET_IFNAME=$RDMA_IFACE
export GLOO_SOCKET_IFNAME=$RDMA_IFACE
# Stability for RDMA
export NCCL_IB_TIMEOUT=23
export NCCL_IB_RETRY_CNT=7
echo "Starting Ray Head on {head_ip}..."
export NCCL_IB_DISABLE={nccl_disable_val}
"""
if nccl_debug_val:
script += f"""
echo "export NCCL_DEBUG={nccl_debug_val}"
echo "export NCCL_DEBUG_SUBSYS=INIT,NET"
export NCCL_DEBUG={nccl_debug_val}
export NCCL_DEBUG_SUBSYS=INIT,NET
"""
script += f"""
echo "\\nStarting Ray Head on {head_ip}..."
if [ "{nccl_disable_val}" = "1" ]; then
echo "Note: Head is configured with NCCL_IB_DISABLE=1 (Ethernet Forced)"
fi
ray start --head --port=6379 --node-ip-address={head_ip} --num-gpus=1 --num-cpus=8 --disable-usage-stats --include-dashboard=false
"""
+83 -22
Ver Arquivo
@@ -266,7 +266,6 @@ def configure_and_launch_vllm(model_idx, head_ip):
env["VLLM_HOST_IP"] = head_ip
env["NCCL_SOCKET_IFNAME"] = rdma_iface
env["NCCL_IB_GID_INDEX"] = "1"
env["NCCL_IB_DISABLE"] = "0"
env["NCCL_NET_GDR_LEVEL"] = "0"
# Also need this for Ray backend?
@@ -297,7 +296,20 @@ def configure_and_launch_vllm(model_idx, head_ip):
print(f" Config: TP={current_tp} | Seqs={current_seqs} | Ctx={current_ctx}")
if use_eager:
print(" Note: Eager Mode Enabled (Recommended for Cluster Stability)")
print(f" Command: {' '.join(cmd)}")
print("\n --- Environment Variables ---")
vars_to_print = [
"RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES",
"VLLM_HOST_IP",
"NCCL_SOCKET_IFNAME",
"NCCL_IB_GID_INDEX",
"NCCL_NET_GDR_LEVEL"
]
for k in vars_to_print:
if k in env:
print(f" export {k}={env[k]}")
print(f"\n Command: {' '.join(cmd)}")
print("="*60 + "\n")
# Exec
@@ -335,21 +347,24 @@ def main():
# Main Menu
# 1. Configure IPs
# 2. Start Cluster (Ray)
# 3. Start VLLM
# 4. Exit
# 3. Stop Ray Cluster
# 4. Ray Cluster Status
# 5. Launch VLLM Serve
# 6. Exit
choice = run_dialog([
"--clear", "--backtitle", "AMD VLLM RCCL Cluster Manager",
"--title", "Main Menu",
"--menu", "Select Action:", "15", "60", "5",
"--menu", "Select Action:", "16", "60", "6",
"1", f"Configure IPs (Head: {head_ip}, Worker: {worker_ip})",
"2", "Start Ray Cluster",
"3", "Ray Cluster Status",
"4", "Launch VLLM Serve",
"5", "Exit"
"3", "Stop Ray Cluster",
"4", "Ray Cluster Status",
"5", "Launch VLLM Serve",
"6", "Exit"
])
if not choice or choice == "5":
if not choice or choice == "6":
subprocess.run(["clear"])
sys.exit(0)
@@ -359,25 +374,71 @@ def main():
head_ip, worker_ip = res
elif choice == "2":
subprocess.run(["clear"])
print("= Starting Ray Cluster Setup =")
# 1. Start Head
if setup_head_node(head_ip):
print("Head node started successfully. Waiting 5s before worker connection...")
time.sleep(5)
# 2. Start Worker
if setup_worker_node(worker_ip, head_ip):
# 3. Wait for full cluster
wait_for_cluster()
input("Press Enter to continue...")
force_ethernet = False
enable_nccl_debug = False
while True:
eth_status = "YES" if force_ethernet else "NO"
debug_status = "YES" if enable_nccl_debug else "NO"
c_choice = run_dialog([
"--clear", "--backtitle", "AMD VLLM RCCL Cluster Manager",
"--title", "Cluster Network Configuration",
"--menu", "Set Network Parameters before starting Ray:", "15", "65", "3",
"1", f"Force Ethernet (Disable RDMA/RoCE): {eth_status}",
"2", f"Enable NCCL Debug Logging: {debug_status}",
"3", "START CLUSTER"
])
if not c_choice: break
if c_choice == "1":
force_ethernet = not force_ethernet
elif c_choice == "2":
enable_nccl_debug = not enable_nccl_debug
elif c_choice == "3":
os.environ["NCCL_IB_DISABLE"] = "1" if force_ethernet else "0"
if enable_nccl_debug:
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["NCCL_DEBUG_SUBSYS"] = "INIT,NET"
else:
os.environ.pop("NCCL_DEBUG", None)
os.environ.pop("NCCL_DEBUG_SUBSYS", None)
subprocess.run(["clear"])
print("= Starting Ray Cluster Setup =")
# 1. Start Head
if setup_head_node(head_ip):
print("Head node started successfully. Waiting 5s before worker connection...")
time.sleep(5)
# 2. Start Worker
if setup_worker_node(worker_ip, head_ip):
# 3. Wait for full cluster
wait_for_cluster()
input("Press Enter to continue...")
break
elif choice == "3":
subprocess.run(["clear"])
print("= Ray Cluster Status =")
subprocess.run(["ray", "status"])
input("\nPress Enter to continue...")
elif choice == "3":
subprocess.run(["clear"])
print("= Stopping Ray Cluster =")
cluster_manager.stop_cluster(worker_ip)
input("\nPress Enter to continue...")
elif choice == "4":
subprocess.run(["clear"])
print("= Ray Cluster Status =")
res = subprocess.run(["ray", "status"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
if res.returncode != 0:
print("\n[!] Cluster is Offline or Unreachable.")
print("Please start the cluster first via Option 2 (Start Ray Cluster).")
else:
print(res.stdout)
input("\nPress Enter to continue...")
elif choice == "5":
# Select Model
menu_items = []
for i, m_id in enumerate(MODELS_TO_RUN):