diff --git a/Dockerfile b/Dockerfile index 8a15d4b..c209bb2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,7 @@ RUN dnf -y install --setopt=install_weak_deps=False --nodocs \ gcc gcc-c++ binutils make ffmpeg-free \ cmake ninja-build aria2c tar xz vim nano dialog \ libdrm-devel zlib-devel openssl-devel pgrep \ - numactl-devel gperftools-libs iproute libibverbs-utils patch \ + numactl-devel gperftools-libs iproute libibverbs-utils patch perftest ping iperf3 \ && dnf clean all && rm -rf /var/cache/dnf/* # 2. Install "TheRock" ROCm SDK (Tarball Method) diff --git a/rdma_cluster/compare_eth_vs_rdma.sh b/rdma_cluster/compare_eth_vs_rdma.sh new file mode 100755 index 0000000..bf9330e --- /dev/null +++ b/rdma_cluster/compare_eth_vs_rdma.sh @@ -0,0 +1,90 @@ +#!/usr/bin/env bash + +# -------- dynamic config -------- +HOST_ROCE="192.168.100.2" +HOST_ETH="192.168.1.127" + +# Automatically detect local and remote RDMA device names +RDMA_DEV_LOCAL=$(ibv_devices | awk 'NR==3 {print $1}') +RDMA_DEV_REMOTE=$(ssh fw2 "ibv_devices | awk 'NR==3 {print \$1}'") + +WORKDIR="/tmp/rdma_bench" +mkdir -p "$WORKDIR" + +# -------- helpers -------- +parse_ping_avg() { + grep rtt "$1" | awk -F'/' '{print $5}' +} + +parse_iperf_gbps() { + grep receiver "$1" | tail -n1 | awk ' + { + val=$(NF-2); + unit=$(NF-1); + if (unit=="Mbits/sec") printf "%.2f", val/1000; + else if (unit=="Gbits/sec") printf "%.2f", val; + else print "N/A"; + }' +} + +parse_rdma_lat_us() { + val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $6}') + echo "${val:-0}" +} + +parse_rdma_bw_mib() { + val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $4}') + echo "${val:-0}" +} + +# -------- normal ethernet -------- +ping -c 10 "$HOST_ETH" > "$WORKDIR/ping_eth.txt" +ssh fw2 "iperf3 -s -1" >/dev/null 2>&1 & +sleep 1 +iperf3 -c "$HOST_ETH" -P 8 -t 10 > "$WORKDIR/iperf_eth.txt" + +# -------- roce ethernet (tcp) -------- +ping -c 10 "$HOST_ROCE" > "$WORKDIR/ping_roce.txt" +ssh fw2 "iperf3 -s -1" 
>/dev/null 2>&1 & +sleep 1 +iperf3 -c "$HOST_ROCE" -P 8 -t 10 > "$WORKDIR/iperf_roce.txt" + +# -------- rdma latency -------- +ssh fw2 "ib_send_lat --rdma_cm -d $RDMA_DEV_REMOTE" > "$WORKDIR/rdma_lat_srv.txt" 2>&1 & +sleep 2 +ib_send_lat --rdma_cm -d "$RDMA_DEV_LOCAL" "$HOST_ROCE" > "$WORKDIR/rdma_lat_cli.txt" 2>&1 + +# -------- rdma bandwidth (maximized) -------- +# We use -x 1 because show_gids confirmed RoCE v2 is at Index 1 +ssh fw2 "ib_write_bw -a -x 1 -q 8 -m 4096" > "$WORKDIR/rdma_bw_srv.txt" 2>&1 & +sleep 2 +ib_write_bw -a -x 1 -q 8 -m 4096 "$HOST_ROCE" > "$WORKDIR/rdma_bw_cli.txt" 2>&1 + +# -------- parse -------- +ETH_LAT=$(parse_ping_avg "$WORKDIR/ping_eth.txt") +ETH_BW=$(parse_iperf_gbps "$WORKDIR/iperf_eth.txt") + +ROCE_LAT=$(parse_ping_avg "$WORKDIR/ping_roce.txt") +ROCE_BW=$(parse_iperf_gbps "$WORKDIR/iperf_roce.txt") + +RDMA_LAT=$(parse_rdma_lat_us "$WORKDIR/rdma_lat_cli.txt") +RDMA_BW_MIB=$(parse_rdma_bw_mib "$WORKDIR/rdma_bw_cli.txt") +RDMA_BW_GBPS=$(python3 - <, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['all', '+sparse_attn_indexer'], 'splitting_ops': [], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 0, 'cudagraph_capture_sizes': [], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False, 'fuse_act_padding': False}, 'max_cudagraph_capture_size': 0, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'static_all_moe_layers': []} 
+(EngineCore_DP0 pid=17822) 2026-01-31 14:20:59,483 INFO worker.py:1821 -- Connecting to existing Ray cluster at address: 192.168.100.1:6379... +(EngineCore_DP0 pid=17822) 2026-01-31 14:20:59,492 INFO worker.py:2007 -- Connected to Ray cluster. +(EngineCore_DP0 pid=17822) /opt/venv/lib64/python3.13/site-packages/ray/_private/worker.py:2046: FutureWarning: Tip: In future versions of Ray, Ray will no longer override accelerator visible devices env var if num_gpus=0 or num_gpus=None (default). To enable this behavior and turn off this error message, set RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO=0 +(EngineCore_DP0 pid=17822) warnings.warn( +(EngineCore_DP0 pid=17822) INFO 01-31 14:20:59 [ray_utils.py:402] No current placement group found. Creating a new placement group. +(EngineCore_DP0 pid=17822) WARNING 01-31 14:20:59 [ray_utils.py:213] tensor_parallel_size=2 is bigger than a reserved number of GPUs (1 GPUs) in a node 5a3b1f77fbfa9166383761707a12bb40f0a03d4f635e26acf703b0cc. Tensor parallel workers can be spread out to 2+ nodes which can degrade the performance unless you have fast interconnect across nodes, like Infiniband. To resolve this issue, make sure you have more than 2 GPUs available at each node. +(EngineCore_DP0 pid=17822) WARNING 01-31 14:20:59 [ray_utils.py:213] tensor_parallel_size=2 is bigger than a reserved number of GPUs (1 GPUs) in a node 709eb6806677f4171ba13b3fc6aee20da41d68f3ef22acbce6fce967. Tensor parallel workers can be spread out to 2+ nodes which can degrade the performance unless you have fast interconnect across nodes, like Infiniband. To resolve this issue, make sure you have more than 2 GPUs available at each node. 
+(EngineCore_DP0 pid=17822) INFO 01-31 14:21:02 [ray_env.py:66] RAY_NON_CARRY_OVER_ENV_VARS from config: set() +(EngineCore_DP0 pid=17822) INFO 01-31 14:21:02 [ray_env.py:69] Copying the following environment variables to workers: ['VLLM_ROCM_USE_AITER', 'VLLM_WORKER_MULTIPROC_METHOD', 'MAX_JOBS', 'LD_LIBRARY_PATH', 'VLLM_USE_TRITON_AWQ', 'VLLM_TARGET_DEVICE', 'VLLM_ROCM_USE_AITER_MOE'] +(EngineCore_DP0 pid=17822) INFO 01-31 14:21:02 [ray_env.py:74] If certain env vars should NOT be copied, add them to /home/kyuz0/.config/vllm/ray_non_carry_over_env_vars.json file +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) WARNING 01-31 14:21:03 [worker_base.py:301] Missing `shared_worker_lock` argument from executor. This argument is needed for mm_processor_cache_type='shm'. +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) INFO 01-31 14:21:03 [parallel_state.py:1234] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://192.168.100.1:49483 backend=nccl +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) INFO 01-31 14:21:03 [pynccl.py:111] vLLM is using nccl==2.27.7 +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] EngineCore failed to start. 
+(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] Traceback (most recent call last): +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/engine/core.py", line 937, in run_engine_core +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/engine/core.py", line 691, in __init__ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] super().__init__( +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] vllm_config, +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ...<3 lines>... +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] internal_dp_balancing, +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ) +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/engine/core.py", line 105, in __init__ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] self.model_executor = executor_class(vllm_config) +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ~~~~~~~~~~~~~~^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/executor/abstract.py", line 101, in __init__ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] self._init_executor() +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ~~~~~~~~~~~~~~~~~~~^^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 
[core.py:946] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/executor/ray_executor.py", line 97, in _init_executor +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] self._init_workers_ray(placement_group) +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/executor/ray_executor.py", line 366, in _init_workers_ray +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] self.collective_rpc("init_device") +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/executor/ray_executor.py", line 489, in collective_rpc +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] return ray.get(ray_worker_outputs, timeout=timeout) +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] return fn(*args, **kwargs) +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/ray/_private/client_mode_hook.py", line 104, in wrapper +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] return func(*args, **kwargs) +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/ray/_private/worker.py", line 2967, in get +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] values, debugger_breakpoint = worker.get_objects( +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) 
ERROR 01-31 14:21:03 [core.py:946] object_refs, timeout=timeout, _tensor_transport=_tensor_transport +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ) +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/ray/_private/worker.py", line 1015, in get_objects +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] raise value.as_instanceof_cause() +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ray.exceptions.RayTaskError(AcceleratorError): ray::RayWorkerWrapper.execute_method() (pid=17931, ip=192.168.100.1, actor_id=e2ed7f9c6fbd6274f30c736001000000, repr=) +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/worker_base.py", line 345, in execute_method +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] raise e +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/worker_base.py", line 334, in execute_method +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] return run_method(self, method, args, kwargs) +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/serial_utils.py", line 459, in run_method +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] return func(*args, **kwargs) +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/worker_base.py", line 326, in init_device +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] self.worker.init_device() # type: ignore +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ~~~~~~~~~~~~~~~~~~~~~~~^^ +(EngineCore_DP0 
pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/gpu_worker.py", line 221, in init_device +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] init_worker_distributed_environment( +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] self.vllm_config, +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ...<3 lines>... +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] current_platform.dist_backend, +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ) +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/gpu_worker.py", line 959, in init_worker_distributed_environment +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ensure_model_parallel_initialized( +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] parallel_config.tensor_parallel_size, +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ...<2 lines>... 
+(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] parallel_config.decode_context_parallel_size, +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ) +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/parallel_state.py", line 1472, in ensure_model_parallel_initialized +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] initialize_model_parallel( +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ~~~~~~~~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] tensor_model_parallel_size, +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ...<3 lines>... +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] backend, +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^^^^^^^^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ) +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/parallel_state.py", line 1369, in initialize_model_parallel +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] _TP = init_model_parallel_group( +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] group_ranks, +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ...<3 lines>... 
+(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] group_name="tp", +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ) +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/parallel_state.py", line 1089, in init_model_parallel_group +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] return GroupCoordinator( +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] group_ranks=group_ranks, +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ...<4 lines>... +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] group_name=group_name, +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ) +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/parallel_state.py", line 362, in __init__ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] self.device_communicator = device_comm_cls( +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] cpu_group=self.cpu_group, +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ...<2 lines>... 
+(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] unique_name=self.unique_name, +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ) +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/device_communicators/cuda_communicator.py", line 58, in __init__ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] self.pynccl_comm = PyNcclCommunicator( +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] group=self.cpu_group, +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] device=self.device, +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ) +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] ^ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/device_communicators/pynccl.py", line 145, in __init__ +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] data = torch.zeros(1, device=device) +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] torch.AcceleratorError: HIP error: invalid kernel file +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] Search for `hipErrorInvalidKernelFile' in https://rocm.docs.amd.com/projects/HIP/en/latest/index.html for more information. +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] HIP kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] For debugging consider passing AMD_SERIALIZE_KERNEL=3 +(EngineCore_DP0 pid=17822) ERROR 01-31 14:21:03 [core.py:946] Device-side assertion tracking was not enabled by user. +(EngineCore_DP0 pid=17822) Process EngineCore_DP0: +(EngineCore_DP0 pid=17822) Traceback (most recent call last): +(EngineCore_DP0 pid=17822) File "/usr/lib64/python3.13/multiprocessing/process.py", line 313, in _bootstrap +(EngineCore_DP0 pid=17822) self.run() +(EngineCore_DP0 pid=17822) ~~~~~~~~^^ +(EngineCore_DP0 pid=17822) File "/usr/lib64/python3.13/multiprocessing/process.py", line 108, in run +(EngineCore_DP0 pid=17822) self._target(*self._args, **self._kwargs) +(EngineCore_DP0 pid=17822) ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/engine/core.py", line 950, in run_engine_core +(EngineCore_DP0 pid=17822) raise e +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/engine/core.py", line 937, in run_engine_core +(EngineCore_DP0 pid=17822) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/engine/core.py", line 691, in __init__ +(EngineCore_DP0 pid=17822) super().__init__( +(EngineCore_DP0 pid=17822) ~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) vllm_config, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ...<3 lines>... 
+(EngineCore_DP0 pid=17822) internal_dp_balancing, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ) +(EngineCore_DP0 pid=17822) ^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/engine/core.py", line 105, in __init__ +(EngineCore_DP0 pid=17822) self.model_executor = executor_class(vllm_config) +(EngineCore_DP0 pid=17822) ~~~~~~~~~~~~~~^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/executor/abstract.py", line 101, in __init__ +(EngineCore_DP0 pid=17822) self._init_executor() +(EngineCore_DP0 pid=17822) ~~~~~~~~~~~~~~~~~~~^^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/executor/ray_executor.py", line 97, in _init_executor +(EngineCore_DP0 pid=17822) self._init_workers_ray(placement_group) +(EngineCore_DP0 pid=17822) ~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/executor/ray_executor.py", line 366, in _init_workers_ray +(EngineCore_DP0 pid=17822) self.collective_rpc("init_device") +(EngineCore_DP0 pid=17822) ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/executor/ray_executor.py", line 489, in collective_rpc +(EngineCore_DP0 pid=17822) return ray.get(ray_worker_outputs, timeout=timeout) +(EngineCore_DP0 pid=17822) ~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper +(EngineCore_DP0 pid=17822) return fn(*args, **kwargs) +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/ray/_private/client_mode_hook.py", line 104, in wrapper +(EngineCore_DP0 pid=17822) return func(*args, **kwargs) +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/ray/_private/worker.py", line 2967, in get +(EngineCore_DP0 
pid=17822) values, debugger_breakpoint = worker.get_objects( +(EngineCore_DP0 pid=17822) ~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) object_refs, timeout=timeout, _tensor_transport=_tensor_transport +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ) +(EngineCore_DP0 pid=17822) ^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/ray/_private/worker.py", line 1015, in get_objects +(EngineCore_DP0 pid=17822) raise value.as_instanceof_cause() +(EngineCore_DP0 pid=17822) ray.exceptions.RayTaskError(AcceleratorError): ray::RayWorkerWrapper.execute_method() (pid=17931, ip=192.168.100.1, actor_id=e2ed7f9c6fbd6274f30c736001000000, repr=) +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/worker_base.py", line 345, in execute_method +(EngineCore_DP0 pid=17822) raise e +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/worker_base.py", line 334, in execute_method +(EngineCore_DP0 pid=17822) return run_method(self, method, args, kwargs) +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/serial_utils.py", line 459, in run_method +(EngineCore_DP0 pid=17822) return func(*args, **kwargs) +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/worker_base.py", line 326, in init_device +(EngineCore_DP0 pid=17822) self.worker.init_device() # type: ignore +(EngineCore_DP0 pid=17822) ~~~~~~~~~~~~~~~~~~~~~~~^^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/gpu_worker.py", line 221, in init_device +(EngineCore_DP0 pid=17822) init_worker_distributed_environment( +(EngineCore_DP0 pid=17822) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) self.vllm_config, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ...<3 lines>... 
+(EngineCore_DP0 pid=17822) current_platform.dist_backend, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ) +(EngineCore_DP0 pid=17822) ^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/gpu_worker.py", line 959, in init_worker_distributed_environment +(EngineCore_DP0 pid=17822) ensure_model_parallel_initialized( +(EngineCore_DP0 pid=17822) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) parallel_config.tensor_parallel_size, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ...<2 lines>... +(EngineCore_DP0 pid=17822) parallel_config.decode_context_parallel_size, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ) +(EngineCore_DP0 pid=17822) ^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/parallel_state.py", line 1472, in ensure_model_parallel_initialized +(EngineCore_DP0 pid=17822) initialize_model_parallel( +(EngineCore_DP0 pid=17822) ~~~~~~~~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) tensor_model_parallel_size, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ...<3 lines>... +(EngineCore_DP0 pid=17822) backend, +(EngineCore_DP0 pid=17822) ^^^^^^^^ +(EngineCore_DP0 pid=17822) ) +(EngineCore_DP0 pid=17822) ^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/parallel_state.py", line 1369, in initialize_model_parallel +(EngineCore_DP0 pid=17822) _TP = init_model_parallel_group( +(EngineCore_DP0 pid=17822) group_ranks, +(EngineCore_DP0 pid=17822) ...<3 lines>... 
+(EngineCore_DP0 pid=17822) group_name="tp", +(EngineCore_DP0 pid=17822) ) +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/parallel_state.py", line 1089, in init_model_parallel_group +(EngineCore_DP0 pid=17822) return GroupCoordinator( +(EngineCore_DP0 pid=17822) group_ranks=group_ranks, +(EngineCore_DP0 pid=17822) ...<4 lines>... +(EngineCore_DP0 pid=17822) group_name=group_name, +(EngineCore_DP0 pid=17822) ) +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/parallel_state.py", line 362, in __init__ +(EngineCore_DP0 pid=17822) self.device_communicator = device_comm_cls( +(EngineCore_DP0 pid=17822) ~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) cpu_group=self.cpu_group, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ...<2 lines>... +(EngineCore_DP0 pid=17822) unique_name=self.unique_name, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ) +(EngineCore_DP0 pid=17822) ^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/device_communicators/cuda_communicator.py", line 58, in __init__ +(EngineCore_DP0 pid=17822) self.pynccl_comm = PyNcclCommunicator( +(EngineCore_DP0 pid=17822) ~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) group=self.cpu_group, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) device=self.device, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ) +(EngineCore_DP0 pid=17822) ^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/device_communicators/pynccl.py", line 145, in __init__ +(EngineCore_DP0 pid=17822) data = torch.zeros(1, device=device) +(EngineCore_DP0 pid=17822) torch.AcceleratorError: HIP error: invalid kernel file +(EngineCore_DP0 pid=17822) Search for `hipErrorInvalidKernelFile' in https://rocm.docs.amd.com/projects/HIP/en/latest/index.html for more 
information. +(EngineCore_DP0 pid=17822) HIP kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +(EngineCore_DP0 pid=17822) For debugging consider passing AMD_SERIALIZE_KERNEL=3 +(EngineCore_DP0 pid=17822) Device-side assertion tracking was not enabled by user. +(EngineCore_DP0 pid=17822) INFO 01-31 14:21:03 [ray_executor.py:120] Shutting down Ray distributed executor. If you see error log from logging.cc regarding SIGTERM received, please ignore because this is the expected termination process in Ray. +(EngineCore_DP0 pid=17822) 2026-01-31 14:21:03,366 ERROR worker.py:433 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::RayWorkerWrapper.execute_method() (pid=9409, ip=192.168.100.2, actor_id=4ec5a2c97a621462a00d303801000000, repr=) +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/worker_base.py", line 345, in execute_method +(EngineCore_DP0 pid=17822) raise e +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/worker_base.py", line 334, in execute_method +(EngineCore_DP0 pid=17822) return run_method(self, method, args, kwargs) +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/serial_utils.py", line 459, in run_method +(EngineCore_DP0 pid=17822) return func(*args, **kwargs) +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/worker_base.py", line 326, in init_device +(EngineCore_DP0 pid=17822) self.worker.init_device() # type: ignore +(EngineCore_DP0 pid=17822) ~~~~~~~~~~~~~~~~~~~~~~~^^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/gpu_worker.py", line 221, in init_device +(EngineCore_DP0 pid=17822) init_worker_distributed_environment( +(EngineCore_DP0 pid=17822) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) self.vllm_config, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^ 
+(EngineCore_DP0 pid=17822) ...<3 lines>... +(EngineCore_DP0 pid=17822) current_platform.dist_backend, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ) +(EngineCore_DP0 pid=17822) ^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/gpu_worker.py", line 959, in init_worker_distributed_environment +(EngineCore_DP0 pid=17822) ensure_model_parallel_initialized( +(EngineCore_DP0 pid=17822) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) parallel_config.tensor_parallel_size, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ...<2 lines>... +(EngineCore_DP0 pid=17822) parallel_config.decode_context_parallel_size, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ) +(EngineCore_DP0 pid=17822) ^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/parallel_state.py", line 1472, in ensure_model_parallel_initialized +(EngineCore_DP0 pid=17822) initialize_model_parallel( +(EngineCore_DP0 pid=17822) ~~~~~~~~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) tensor_model_parallel_size, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ...<3 lines>... +(EngineCore_DP0 pid=17822) backend, +(EngineCore_DP0 pid=17822) ^^^^^^^^ +(EngineCore_DP0 pid=17822) ) +(EngineCore_DP0 pid=17822) ^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/parallel_state.py", line 1369, in initialize_model_parallel +(EngineCore_DP0 pid=17822) _TP = init_model_parallel_group( +(EngineCore_DP0 pid=17822) group_ranks, +(EngineCore_DP0 pid=17822) ...<3 lines>... 
+(EngineCore_DP0 pid=17822) group_name="tp", +(EngineCore_DP0 pid=17822) ) +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/parallel_state.py", line 1089, in init_model_parallel_group +(EngineCore_DP0 pid=17822) return GroupCoordinator( +(EngineCore_DP0 pid=17822) group_ranks=group_ranks, +(EngineCore_DP0 pid=17822) ...<4 lines>... +(EngineCore_DP0 pid=17822) group_name=group_name, +(EngineCore_DP0 pid=17822) ) +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/parallel_state.py", line 362, in __init__ +(EngineCore_DP0 pid=17822) self.device_communicator = device_comm_cls( +(EngineCore_DP0 pid=17822) ~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) cpu_group=self.cpu_group, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ...<2 lines>... +(EngineCore_DP0 pid=17822) unique_name=self.unique_name, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ) +(EngineCore_DP0 pid=17822) ^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/device_communicators/cuda_communicator.py", line 58, in __init__ +(EngineCore_DP0 pid=17822) self.pynccl_comm = PyNcclCommunicator( +(EngineCore_DP0 pid=17822) ~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) group=self.cpu_group, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) device=self.device, +(EngineCore_DP0 pid=17822) ^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) ) +(EngineCore_DP0 pid=17822) ^ +(EngineCore_DP0 pid=17822) File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/device_communicators/pynccl.py", line 145, in __init__ +(EngineCore_DP0 pid=17822) data = torch.zeros(1, device=device) +(EngineCore_DP0 pid=17822) torch.AcceleratorError: HIP error: invalid kernel file +(EngineCore_DP0 pid=17822) Search for `hipErrorInvalidKernelFile' in https://rocm.docs.amd.com/projects/HIP/en/latest/index.html for more 
information. +(EngineCore_DP0 pid=17822) HIP kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +(EngineCore_DP0 pid=17822) For debugging consider passing AMD_SERIALIZE_KERNEL=3 +(EngineCore_DP0 pid=17822) Device-side assertion tracking was not enabled by user. +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] Error executing method 'init_device'. This might cause deadlock in distributed execution. +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] Traceback (most recent call last): +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/worker_base.py", line 334, in execute_method +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] return run_method(self, method, args, kwargs) +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/serial_utils.py", line 459, in run_method +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] return func(*args, **kwargs) +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/ray/util/tracing/tracing_helper.py", line 461, in _resume_span +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] return method(self, *_args, **_kwargs) +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] File 
"/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/worker_base.py", line 326, in init_device +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] self.worker.init_device() # type: ignore +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ~~~~~~~~~~~~~~~~~~~~~~~^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/gpu_worker.py", line 221, in init_device +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] init_worker_distributed_environment( +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] self.vllm_config, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ...<3 lines>... 
+(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] current_platform.dist_backend, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ) +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/gpu_worker.py", line 959, in init_worker_distributed_environment +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ensure_model_parallel_initialized( +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] parallel_config.tensor_parallel_size, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ...<2 lines>... 
+(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] parallel_config.decode_context_parallel_size, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ) +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/parallel_state.py", line 1472, in ensure_model_parallel_initialized +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] initialize_model_parallel( +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ~~~~~~~~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] tensor_model_parallel_size, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ...<3 lines>... 
+(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] backend, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ) +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/parallel_state.py", line 1369, in initialize_model_parallel +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] _TP = init_model_parallel_group( +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] group_ranks, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ...<3 lines>... 
+(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] group_name="tp", +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ) +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/parallel_state.py", line 1089, in init_model_parallel_group +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] return GroupCoordinator( +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] group_ranks=group_ranks, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ...<4 lines>... +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] group_name=group_name, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ) +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/parallel_state.py", line 362, in __init__ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] self.device_communicator = device_comm_cls( +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] cpu_group=self.cpu_group, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) 
(RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ...<2 lines>... +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] unique_name=self.unique_name, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ) +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/device_communicators/cuda_communicator.py", line 58, in __init__ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] self.pynccl_comm = PyNcclCommunicator( +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] group=self.cpu_group, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] device=self.device, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ) +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] ^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper 
pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/device_communicators/pynccl.py", line 145, in __init__ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] data = torch.zeros(1, device=device) +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] torch.AcceleratorError: HIP error: invalid kernel file +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] Search for `hipErrorInvalidKernelFile' in https://rocm.docs.amd.com/projects/HIP/en/latest/index.html for more information. +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] HIP kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] For debugging consider passing AMD_SERIALIZE_KERNEL=3 +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) ERROR 01-31 14:21:03 [worker_base.py:344] Device-side assertion tracking was not enabled by user. +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) WARNING 01-31 14:21:03 [worker_base.py:301] Missing `shared_worker_lock` argument from executor. This argument is needed for mm_processor_cache_type='shm'. +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=9409, ip=192.168.100.2) INFO 01-31 14:21:03 [parallel_state.py:1234] world_size=2 rank=1 local_rank=0 distributed_init_method=tcp://192.168.100.1:49483 backend=nccl +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] Error executing method 'init_device'. This might cause deadlock in distributed execution. 
+(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] Traceback (most recent call last): +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/worker_base.py", line 334, in execute_method +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] return run_method(self, method, args, kwargs) +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/serial_utils.py", line 459, in run_method +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] return func(*args, **kwargs) +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/ray/util/tracing/tracing_helper.py", line 461, in _resume_span +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] return method(self, *_args, **_kwargs) +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/gpu_worker.py", line 221, in init_device [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.) 
+(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] self.worker.init_device() # type: ignore +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ~~~~~~~~~~~~~~~~~~~~~~~^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] init_worker_distributed_environment( +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] self.vllm_config, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ...<2 lines>... [repeated 6x across cluster] +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] current_platform.dist_backend, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ) [repeated 7x across cluster] +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ^ [repeated 5x across cluster] +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/vllm/v1/worker/gpu_worker.py", line 959, in init_worker_distributed_environment +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ensure_model_parallel_initialized( +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 
14:21:03 [worker_base.py:344] parallel_config.tensor_parallel_size, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] parallel_config.decode_context_parallel_size, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/parallel_state.py", line 1472, in ensure_model_parallel_initialized +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] initialize_model_parallel( +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ~~~~~~~~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] tensor_model_parallel_size, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] backend, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/parallel_state.py", line 1369, in initialize_model_parallel +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] _TP = init_model_parallel_group( +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] group_ranks, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 
[worker_base.py:344] group_name="tp", +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/parallel_state.py", line 1089, in init_model_parallel_group +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] return GroupCoordinator( +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] group_ranks=group_ranks, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] group_name=group_name, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] File "/opt/venv/lib64/python3.13/site-packages/vllm/distributed/device_communicators/pynccl.py", line 145, in __init__ [repeated 3x across cluster] +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] self.device_communicator = device_comm_cls( +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] cpu_group=self.cpu_group, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] unique_name=self.unique_name, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] self.pynccl_comm = PyNcclCommunicator( +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ~~~~~~~~~~~~~~~~~~^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 
[worker_base.py:344] group=self.cpu_group, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] device=self.device, +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] ^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] data = torch.zeros(1, device=device) +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] torch.AcceleratorError: HIP error: invalid kernel file +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] Search for `hipErrorInvalidKernelFile' in https://rocm.docs.amd.com/projects/HIP/en/latest/index.html for more information. +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] HIP kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] For debugging consider passing AMD_SERIALIZE_KERNEL=3 +(EngineCore_DP0 pid=17822) (RayWorkerWrapper pid=17931) ERROR 01-31 14:21:03 [worker_base.py:344] Device-side assertion tracking was not enabled by user. + +``` + +### 4.1 - Possible reasons + +This `invalid kernel file` error might be related to RCCL not supporting gfx1151. There was a PR for it that was never merged: + +https://github.com/ROCm/rccl/pull/2075 + +Instead of being merged, the PR was closed when AMD moved ROCm to rocm-systems. Right now there is a feature request for this support: + +https://github.com/ROCm/rocm-systems/issues/2788 + +Again, I do not know whether this is the cause, or whether the `invalid kernel file` error is a different issue altogether. \ No newline at end of file