From fedfa3c6821e528c75344660b2137773543f7c75 Mon Sep 17 00:00:00 2001
From: Donato Capitella
Date: Mon, 23 Feb 2026 11:43:44 +0000
Subject: [PATCH] Trying fix for ROCm/llvm loop unrolling bug, to see if
 performance improves on custom compiled kernels

---
Notes (ignored by git-am, not part of the commit message):

Passes `-mllvm --amdgpu-unroll-threshold-local=600` to the three builds
touched below:
  1. flash-attention (`python setup.py install`), through the CXXFLAGS and
     HIPCXXFLAGS environment variables;
  2. the `pip wheel` build configured through CMAKE_ARGS, through
     CMAKE_CXX_FLAGS and CMAKE_HIP_FLAGS;
  3. the `cmake` + `make` HIP build, through CMAKE_CXX_FLAGS and
     CMAKE_HIP_FLAGS.

NOTE(review): whether each build system actually forwards these variables
to the device compile step is not visible from this patch; confirm in the
verbose build logs.
NOTE(review): the leading indentation of the Dockerfile lines in the hunks
was re-created; if the context does not match the target file, apply with
`git apply --ignore-whitespace`.

 Dockerfile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 50620b9..f3ba481 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -38,6 +38,8 @@ ENV FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
 RUN git clone https://github.com/ROCm/flash-attention.git &&\
     cd flash-attention &&\
     git checkout main_perf &&\
+    export CXXFLAGS="-mllvm --amdgpu-unroll-threshold-local=600" && \
+    export HIPCXXFLAGS="-mllvm --amdgpu-unroll-threshold-local=600" && \
     python setup.py install && \
     cd /opt && rm -rf /opt/flash-attention
 
@@ -68,7 +70,7 @@ ENV CXX="/opt/rocm/llvm/bin/clang++"
 
 RUN export HIP_DEVICE_LIB_PATH=$(find /opt/rocm -type d -name bitcode -print -quit) && \
     echo "Compiling with Bitcode: $HIP_DEVICE_LIB_PATH" && \
-    export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1151 -DHIP_ARCHITECTURES=gfx1151" && \
+    export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1151 -DHIP_ARCHITECTURES=gfx1151 -DCMAKE_CXX_FLAGS='-mllvm --amdgpu-unroll-threshold-local=600' -DCMAKE_HIP_FLAGS='-mllvm --amdgpu-unroll-threshold-local=600'" && \
     python -m pip wheel --no-build-isolation --no-deps -w /tmp/dist -v . && \
     python -m pip install /tmp/dist/*.whl
 
@@ -90,6 +92,8 @@ RUN cmake -S . \
     -DCOMPUTE_BACKEND=hip \
     -DCMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
     -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
+    -DCMAKE_CXX_FLAGS="-mllvm --amdgpu-unroll-threshold-local=600" \
+    -DCMAKE_HIP_FLAGS="-mllvm --amdgpu-unroll-threshold-local=600" \
     && \
     make -j$(nproc) && \
     python -m pip install --no-cache-dir . --no-build-isolation --no-deps