feat: Display environment variables and allow to choose between RoCE/Ethernet and show RCCL debug information

This commit is contained in:
Donato Capitella
2026-02-22 20:07:34 +00:00
orang tua 4a5d6c7855
melakukan 91b6dbc270
3 mengubah file dengan 205 tambahan dan 44 penghapusan
+34 -11
Melihat File
@@ -23,6 +23,8 @@ except ImportError:
# User requested specifically to test with TP=2 on the cluster.
CLUSTER_TP = 2
GPU_UTIL = "0.90"
FORCE_ETH = False
FORCE_DEBUG_NCCL = False
# THROUGHPUT CONFIG (Imported from models.py)
OFF_NUM_PROMPTS = models.OFF_NUM_PROMPTS
@@ -66,6 +68,15 @@ def log(msg): print(f"\n[CLUSTER-BENCH] {msg}")
def restart_cluster():
log("Restarting Ray Cluster (Clean State)...")
# Push config to env so cluster_manager picks it up for daemon injection
os.environ["NCCL_IB_DISABLE"] = "1" if FORCE_ETH else "0"
if FORCE_DEBUG_NCCL:
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["NCCL_DEBUG_SUBSYS"] = "INIT,NET"
else:
os.environ.pop("NCCL_DEBUG", None)
os.environ.pop("NCCL_DEBUG_SUBSYS", None)
# 1. Stop Cluster (Best Effort)
cluster_manager.stop_cluster()
@@ -130,13 +141,17 @@ def get_cluster_env():
env["GLOO_SOCKET_IFNAME"] = rdma_iface
# RCCL specific
env["NCCL_IB_GID_INDEX"] = "1"
env["NCCL_IB_DISABLE"] = "0"
env["NCCL_IB_DISABLE"] = "1" if FORCE_ETH else "0"
env["NCCL_NET_GDR_LEVEL"] = "0"
# Stability for RDMA (Fix for high-throughput models like Gemma 3)
env["NCCL_IB_TIMEOUT"] = "23" # ~32 seconds (default is 18/~1s)
env["NCCL_IB_RETRY_CNT"] = "7" # Default is 3, increase for lossy networks
if FORCE_DEBUG_NCCL:
env["NCCL_DEBUG"] = "INFO"
env["NCCL_DEBUG_SUBSYS"] = "INIT,NET"
return env
def get_model_args(model):
@@ -166,7 +181,8 @@ def get_model_args(model):
def get_benchmark_output_file(model, output_dir):
model_safe = model.replace("/", "_")
output_dir_path = Path(output_dir)
return output_dir_path / f"{model_safe}_cluster_tp{CLUSTER_TP}_throughput.json"
eth_suffix = "_eth" if FORCE_ETH else ""
return output_dir_path / f"{model_safe}_cluster_tp{CLUSTER_TP}{eth_suffix}_throughput.json"
def run_bench_set(model, backend_name, output_dir, extra_env=None):
output_dir_path = Path(output_dir)
@@ -244,7 +260,9 @@ def run_cluster_throughput(model):
def print_summary():
print(f"\n{'MODEL (TP=2)':<50} | {'Triton':<8} | {'ROCm':<8}")
eth_suffix = "_eth" if FORCE_ETH else ""
title_suffix = " (Ethernet ONLY)" if FORCE_ETH else ""
print(f"\n{f'MODEL (TP=2){title_suffix}':<50} | {'Triton':<8} | {'ROCm':<8}")
print("-" * 75)
for m in MODELS_TO_RUN:
@@ -252,14 +270,14 @@ def print_summary():
# Default
try:
p1 = RESULTS_DIR / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json"
p1 = RESULTS_DIR / f"{msafe}_cluster_tp{CLUSTER_TP}{eth_suffix}_throughput.json"
d1 = json.loads(p1.read_text())
val1 = f"{d1.get('tokens_per_second', 0):.1f}"
except: val1 = "N/A"
# ROCm
try:
p2 = Path("benchmark_results_rocm") / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json"
p2 = Path("benchmark_results_rocm") / f"{msafe}_cluster_tp{CLUSTER_TP}{eth_suffix}_throughput.json"
d2 = json.loads(p2.read_text())
val2 = f"{d2.get('tokens_per_second', 0):.1f}"
except: val2 = "N/A"
@@ -269,14 +287,19 @@ def print_summary():
print("-" * 75)
if __name__ == "__main__":
# if not check_ray_status():
# log("ERROR: Ray Cluster not ready. Please start it with 'start-vllm-cluster' first.")
# sys.exit(1)
# We now handle this by restarting the cluster ourselves.
pass
parser = argparse.ArgumentParser(description="VLLM Cluster Benchmark")
parser.add_argument("--eth-only", action="store_true", help="Run benchmark using only Ethernet (disable RDMA/RoCE)")
parser.add_argument("--debug-nccl", action="store_true", help="Enable NCCL Debug logging (INFO level for Transport tracking)")
args = parser.parse_args()
FORCE_ETH = args.eth_only
FORCE_DEBUG_NCCL = args.debug_nccl
log("Ray Cluster Detected. Starting Benchmarks (Dual Backend)...")
if FORCE_ETH:
log("Note: Ethernet ONLY mode enabled. RDMA/RoCE disabled.")
if FORCE_DEBUG_NCCL:
log("Note: NCCL Debug mode enabled (Transport Logging).")
log("Note: Eager Mode (--enforce-eager) is ENABLED for cluster stability.")
for m in MODELS_TO_RUN: