feat: Configure ROCm attention via --attention-backend CLI argument, disable the Ray dashboard, and make eager mode configurable for cluster benchmarks.
이 커밋은 다음에 포함됨:
@@ -59,7 +59,7 @@ def setup_worker_node(worker_ip, head_ip):
|
||||
export NCCL_IB_TIMEOUT=23
|
||||
export NCCL_IB_RETRY_CNT=7
|
||||
echo "Starting Ray Worker on {worker_ip} connecting to {head_ip}..."
|
||||
-ray start --address='{head_ip}:6379' --num-gpus=1 --num-cpus=8 --disable-usage-stats
|
||||
+ray start --address='{head_ip}:6379' --num-gpus=1 --num-cpus=8 --disable-usage-stats --include-dashboard=false
|
||||
"""
|
||||
|
||||
print(f"Setting up Worker Node ({worker_ip})...")
|
||||
@@ -97,7 +97,7 @@ def setup_head_node(head_ip):
|
||||
export NCCL_IB_TIMEOUT=23
|
||||
export NCCL_IB_RETRY_CNT=7
|
||||
echo "Starting Ray Head on {head_ip}..."
|
||||
-ray start --head --port=6379 --node-ip-address={head_ip} --num-gpus=1 --num-cpus=8 --disable-usage-stats
|
||||
+ray start --head --port=6379 --node-ip-address={head_ip} --num-gpus=1 --num-cpus=8 --disable-usage-stats --include-dashboard=false
|
||||
"""
|
||||
|
||||
try:
|
||||
|
||||
+1
-3
@@ -309,9 +309,7 @@ def configure_and_launch(model_idx, gpu_count):
|
||||
env.update(config.get("env", {}))
|
||||
|
||||
if use_rocm_attn:
|
||||
-env["VLLM_V1_USE_PREFILL_DECODE_ATTENTION"] = "1"
|
||||
-env["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
|
||||
-# Optional: Explicitly mention these in print
|
||||
+cmd.extend(["--attention-backend", "ROCM_ATTN"])
|
||||
|
||||
|
||||
print("\n" + "="*60)
|
||||
|
||||
새 이슈에서 참조
사용자 차단