feat: Add RAY_DISABLE_METRICS=1 to disable Ray metrics across cluster configurations and scripts.
This commit is contained in:
@@ -117,6 +117,7 @@ export GLOO_SOCKET_IFNAME=$RDMA_IFACE
|
|||||||
export NCCL_SOCKET_IFNAME=$RDMA_IFACE
|
export NCCL_SOCKET_IFNAME=$RDMA_IFACE
|
||||||
|
|
||||||
# Prevent Ray from masking the APU (Strix Halo Requirement)
|
# Prevent Ray from masking the APU (Strix Halo Requirement)
|
||||||
|
export RAY_DISABLE_METRICS=1
|
||||||
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||||
|
|
||||||
# Start Head
|
# Start Head
|
||||||
@@ -132,6 +133,7 @@ ray stop --force
|
|||||||
export RDMA_IFACE=$(ip -o addr show to 192.168.100.0/24 | awk '{print $2}' | head -n1)
|
export RDMA_IFACE=$(ip -o addr show to 192.168.100.0/24 | awk '{print $2}' | head -n1)
|
||||||
export GLOO_SOCKET_IFNAME=$RDMA_IFACE
|
export GLOO_SOCKET_IFNAME=$RDMA_IFACE
|
||||||
export NCCL_SOCKET_IFNAME=$RDMA_IFACE
|
export NCCL_SOCKET_IFNAME=$RDMA_IFACE
|
||||||
|
export RAY_DISABLE_METRICS=1
|
||||||
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||||
|
|
||||||
ray start --address='192.168.100.1:6379' --num-gpus=1 --num-cpus=8 --disable-usage-stats
|
ray start --address='192.168.100.1:6379' --num-gpus=1 --num-cpus=8 --disable-usage-stats
|
||||||
|
|||||||
@@ -54,6 +54,7 @@ setup_head() {
|
|||||||
ray stop --force
|
ray stop --force
|
||||||
|
|
||||||
# Critical Config
|
# Critical Config
|
||||||
|
export RAY_DISABLE_METRICS=1
|
||||||
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||||
export RAY_memory_monitor_refresh_ms=0
|
export RAY_memory_monitor_refresh_ms=0
|
||||||
export VLLM_HOST_IP=$HEAD_IP
|
export VLLM_HOST_IP=$HEAD_IP
|
||||||
@@ -78,6 +79,7 @@ setup_worker() {
|
|||||||
ray stop --force
|
ray stop --force
|
||||||
|
|
||||||
# Critical Config
|
# Critical Config
|
||||||
|
export RAY_DISABLE_METRICS=1
|
||||||
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||||
export RAY_memory_monitor_refresh_ms=0
|
export RAY_memory_monitor_refresh_ms=0
|
||||||
export VLLM_HOST_IP=$WORKER_IP
|
export VLLM_HOST_IP=$WORKER_IP
|
||||||
|
|||||||
@@ -141,6 +141,7 @@ def setup_worker_node(worker_ip, head_ip):
|
|||||||
source /etc/profile
|
source /etc/profile
|
||||||
# Silence the kill command
|
# Silence the kill command
|
||||||
ray stop --force > /dev/null 2>&1 || true
|
ray stop --force > /dev/null 2>&1 || true
|
||||||
|
export RAY_DISABLE_METRICS=1
|
||||||
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||||
export RAY_memory_monitor_refresh_ms=0
|
export RAY_memory_monitor_refresh_ms=0
|
||||||
export VLLM_HOST_IP={worker_ip}
|
export VLLM_HOST_IP={worker_ip}
|
||||||
@@ -175,6 +176,7 @@ def setup_head_node(head_ip):
|
|||||||
script = f"""
|
script = f"""
|
||||||
# Silence the kill command
|
# Silence the kill command
|
||||||
ray stop --force > /dev/null 2>&1 || true
|
ray stop --force > /dev/null 2>&1 || true
|
||||||
|
export RAY_DISABLE_METRICS=1
|
||||||
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||||
export RAY_memory_monitor_refresh_ms=0
|
export RAY_memory_monitor_refresh_ms=0
|
||||||
export VLLM_HOST_IP={head_ip}
|
export VLLM_HOST_IP={head_ip}
|
||||||
|
|||||||
Verwijs in nieuw issue
Block a user