From 0d8afba0935edd7ea5c6971294fa4ed0a6ec573d Mon Sep 17 00:00:00 2001 From: Donato Capitella Date: Sun, 1 Feb 2026 21:52:48 +0000 Subject: [PATCH] feat: Add `RAY_DISABLE_METRICS=1` to disable Ray metrics across cluster configurations and scripts. --- rdma_cluster/troubleshooting_rccl.md | 2 ++ scripts/configure_cluster.sh | 2 ++ scripts/start_vllm_cluster.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/rdma_cluster/troubleshooting_rccl.md b/rdma_cluster/troubleshooting_rccl.md index 8903f26..e57501d 100644 --- a/rdma_cluster/troubleshooting_rccl.md +++ b/rdma_cluster/troubleshooting_rccl.md @@ -117,6 +117,7 @@ export GLOO_SOCKET_IFNAME=$RDMA_IFACE export NCCL_SOCKET_IFNAME=$RDMA_IFACE # Prevent Ray from masking the APU (Strix Halo Requirement) +export RAY_DISABLE_METRICS=1 export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 # Start Head @@ -132,6 +133,7 @@ ray stop --force export RDMA_IFACE=$(ip -o addr show to 192.168.100.0/24 | awk '{print $2}' | head -n1) export GLOO_SOCKET_IFNAME=$RDMA_IFACE export NCCL_SOCKET_IFNAME=$RDMA_IFACE +export RAY_DISABLE_METRICS=1 export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ray start --address='192.168.100.1:6379' --num-gpus=1 --num-cpus=8 --disable-usage-stats diff --git a/scripts/configure_cluster.sh b/scripts/configure_cluster.sh index 31aad9d..49fcc1f 100755 --- a/scripts/configure_cluster.sh +++ b/scripts/configure_cluster.sh @@ -54,6 +54,7 @@ setup_head() { ray stop --force # Critical Config + export RAY_DISABLE_METRICS=1 export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 export RAY_memory_monitor_refresh_ms=0 export VLLM_HOST_IP=$HEAD_IP @@ -78,6 +79,7 @@ setup_worker() { ray stop --force # Critical Config + export RAY_DISABLE_METRICS=1 export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 export RAY_memory_monitor_refresh_ms=0 export VLLM_HOST_IP=$WORKER_IP diff --git a/scripts/start_vllm_cluster.py b/scripts/start_vllm_cluster.py index 938a582..9fcfb7f 100755 --- a/scripts/start_vllm_cluster.py +++ b/scripts/start_vllm_cluster.py @@ -141,6 +141,7 @@ def setup_worker_node(worker_ip, head_ip): source /etc/profile # Silece the kill command ray stop --force > /dev/null 2>&1 || true + export RAY_DISABLE_METRICS=1 export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 export RAY_memory_monitor_refresh_ms=0 export VLLM_HOST_IP={worker_ip} @@ -175,6 +176,7 @@ def setup_head_node(head_ip): script = f""" # Silence the kill command ray stop --force > /dev/null 2>&1 || true + export RAY_DISABLE_METRICS=1 export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 export RAY_memory_monitor_refresh_ms=0 export VLLM_HOST_IP={head_ip}