Install flash_attn, as it is now needed by vLLM

2025-11-30 17:49:29 +00:00
@@ -63,6 +63,15 @@ RUN python -m pip install \

 WORKDIR /opt

+# Flash-Attention: build and install ROCm's fork from its main_perf branch.
+# The checkout is deleted in the same RUN step so it never lands in a layer.
+ENV FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
+
+RUN git clone --branch main_perf https://github.com/ROCm/flash-attention.git && \
+    cd flash-attention && \
+    python setup.py install && \
+    cd /opt && rm -rf /opt/flash-attention
+
 # 6. Clone vLLM
 RUN git clone https://github.com/vllm-project/vllm.git /opt/vllm
 WORKDIR /opt/vllm
@@ -1,3 +1,4 @@
 # Required for Strix Halo / RDNA3.5 on vLLM
 export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1
+export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"  # same value the Dockerfile sets via ENV
 export VLLM_TARGET_DEVICE=rocm