Trying a fix for the ROCm/LLVM loop-unrolling bug, to see if performance improves on custom-compiled kernels
此提交包含在:
+5
-1
@@ -38,6 +38,8 @@ ENV FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
|
|||||||
RUN git clone https://github.com/ROCm/flash-attention.git &&\
|
RUN git clone https://github.com/ROCm/flash-attention.git &&\
|
||||||
cd flash-attention &&\
|
cd flash-attention &&\
|
||||||
git checkout main_perf &&\
|
git checkout main_perf &&\
|
||||||
|
export CXXFLAGS="-mllvm --amdgpu-unroll-threshold-local=600" && \
|
||||||
|
export HIPCXXFLAGS="-mllvm --amdgpu-unroll-threshold-local=600" && \
|
||||||
python setup.py install && \
|
python setup.py install && \
|
||||||
cd /opt && rm -rf /opt/flash-attention
|
cd /opt && rm -rf /opt/flash-attention
|
||||||
|
|
||||||
@@ -68,7 +70,7 @@ ENV CXX="/opt/rocm/llvm/bin/clang++"
|
|||||||
|
|
||||||
RUN export HIP_DEVICE_LIB_PATH=$(find /opt/rocm -type d -name bitcode -print -quit) && \
|
RUN export HIP_DEVICE_LIB_PATH=$(find /opt/rocm -type d -name bitcode -print -quit) && \
|
||||||
echo "Compiling with Bitcode: $HIP_DEVICE_LIB_PATH" && \
|
echo "Compiling with Bitcode: $HIP_DEVICE_LIB_PATH" && \
|
||||||
export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1151 -DHIP_ARCHITECTURES=gfx1151" && \
|
export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1151 -DHIP_ARCHITECTURES=gfx1151 -DCMAKE_CXX_FLAGS='-mllvm --amdgpu-unroll-threshold-local=600' -DCMAKE_HIP_FLAGS='-mllvm --amdgpu-unroll-threshold-local=600'" && \
|
||||||
python -m pip wheel --no-build-isolation --no-deps -w /tmp/dist -v . && \
|
python -m pip wheel --no-build-isolation --no-deps -w /tmp/dist -v . && \
|
||||||
python -m pip install /tmp/dist/*.whl
|
python -m pip install /tmp/dist/*.whl
|
||||||
|
|
||||||
@@ -90,6 +92,8 @@ RUN cmake -S . \
|
|||||||
-DCOMPUTE_BACKEND=hip \
|
-DCOMPUTE_BACKEND=hip \
|
||||||
-DCMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
|
-DCMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
|
||||||
-DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
|
-DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
|
||||||
|
-DCMAKE_CXX_FLAGS="-mllvm --amdgpu-unroll-threshold-local=600" \
|
||||||
|
-DCMAKE_HIP_FLAGS="-mllvm --amdgpu-unroll-threshold-local=600" \
|
||||||
&& \
|
&& \
|
||||||
make -j$(nproc) && \
|
make -j$(nproc) && \
|
||||||
python -m pip install --no-cache-dir . --no-build-isolation --no-deps
|
python -m pip install --no-cache-dir . --no-build-isolation --no-deps
|
||||||
|
|||||||
新增問題並參考
封鎖使用者