### LLaMACpp Builder Container with Vulkan for GPUs
### Multi-stage: download stage with pre-built binaries, runtime stage with only runtime libraries
###
### BUILD:  podman build -t llamacpp:vulkan-amd64 -f llama-vulkan.Containerfile .
### Export: podman save -o /home/duckpage/llamacpp-vulkan-amd64.tar localhost/llamacpp:vulkan-amd64
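###
### RUN (example sketch, not taken from the original file): the Vulkan backend needs the host GPU
### exposed inside the container; on AMD/Intel GPUs with Podman that is usually the DRI render node.
### The published port and the models mount below are assumptions -- match them to entrypoint.sh
### and config.preset.yaml:
###   podman run --rm -it --device /dev/dri -p 8080:8080 -v "$(pwd)/models:/app/models" localhost/llamacpp:vulkan-amd64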

ARG UBUNTU_VERSION=24.04

### Download image
FROM ubuntu:${UBUNTU_VERSION} AS download

RUN apt-get update \
    && apt-get install -y curl unzip grep sed \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /tmp
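
# Resolve the latest release tag by following the GitHub /releases/latest redirect and reading
# its Location header, then download the matching prebuilt Vulkan x64 archive for that tag.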
RUN VERSION=$(curl -s -I https://github.com/ggml-org/llama.cpp/releases/latest | grep -i location | sed 's|.*/tag/||' | tr -d '\r') \
    && echo "Latest llama.cpp version: $VERSION" \
    && curl -L https://github.com/ggml-org/llama.cpp/releases/download/${VERSION}/llama-${VERSION}-bin-ubuntu-vulkan-x64.zip -o llama.zip \
    && unzip llama.zip \
    && rm llama.zip \
    && if [ -d llama-* ]; then mv llama-*/* . && rmdir llama-*; elif [ -d build ]; then mv build/* . && rmdir build; fi \
    && if [ -d bin ]; then mv bin/* . && rmdir bin; fi # flatten further

# Collect the shared libraries for the slim base stage and the whole tree for the full stage
RUN mkdir -p /app/lib /app/full \
    && find . -name "*.so" -exec cp {} /app/lib \; \
    && cp -r * /app/full 2>/dev/null || true \
    && ls -la /app/full # list contents

## Base image
FROM ubuntu:${UBUNTU_VERSION} AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl nano ca-certificates wget \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=download /app/lib/ /app

### Full
FROM base AS full

COPY --from=download /app/full /app

RUN chmod +x /app/llama-server

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
        libvulkan-dev \
        git \
        python3-pip \
        python3 \
        python3-wheel \
    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -U "huggingface_hub[cli]" \
    && if [ -f requirements.txt ]; then pip install --break-system-packages -r requirements.txt; fi \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete
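
# Example (sketch, not part of the original file): the huggingface_hub CLI installed above can
# pull GGUF weights into the image or a mounted volume; the repo id and file name are placeholders:
#   huggingface-cli download <repo-id> <file>.gguf --local-dir /app/models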

# -------- Model args (prefer Q6 to keep mmap on and avoid load issues) --------
ARG GENERAL_FAST_MODEL="-m models/gemma-3-1b-it-Q5_K_M.gguf"
ARG GENERAL_MODEL="-m models/gpt-oss-20b-Q4_K_M.gguf"

ARG CHAT_MODEL="-m models/Qwen3-VL-30B-A3B-Q4_K_S.gguf"

ARG CODER_MODEL="-m models/Qwen3-Coder-30B-A3B-Instruct-Q6_K.gguf"

ARG EMBEDDING_FAST_MODEL="-m models/embeddinggemma-300M-Q8_0.gguf"
ARG EMBEDDING_MODEL="-m models/bge-code-v1-q6_k.gguf"

# -------- Runtime defaults --------
ARG GENERAL_CONTEXT_SIZE=16384
ARG GENERAL_GPU_LAYERS=99
ARG GENERAL_MAX_TOKENS=512

ARG CODER_CONTEXT_SIZE=131072
ARG CODER_GPU_LAYERS=99
ARG CODER_MAX_TOKENS=512
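
# Any of the ARGs above can be overridden at build time, e.g. (model path is only an example):
#   podman build --build-arg CODER_CONTEXT_SIZE=65536 \
#       --build-arg CODER_MODEL="-m models/your-model.gguf" \
#       -t llamacpp:vulkan-amd64 -f llama-vulkan.Containerfile .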

ENV GENERAL_FAST_MODEL=${GENERAL_FAST_MODEL}
ENV GENERAL_MODEL=${GENERAL_MODEL}
# CHAT_MODEL is declared as an ARG above but was never exported; exported here for
# consistency with the other model ARGs (assumed oversight)
ENV CHAT_MODEL=${CHAT_MODEL}
ENV CODER_MODEL=${CODER_MODEL}
ENV EMBEDDING_FAST_MODEL=${EMBEDDING_FAST_MODEL}
ENV EMBEDDING_MODEL=${EMBEDDING_MODEL}

ENV GENERAL_CONTEXT_SIZE=${GENERAL_CONTEXT_SIZE}
ENV GENERAL_GPU_LAYERS=${GENERAL_GPU_LAYERS}
ENV GENERAL_MAX_TOKENS=${GENERAL_MAX_TOKENS}
ENV CODER_CONTEXT_SIZE=${CODER_CONTEXT_SIZE}
ENV CODER_GPU_LAYERS=${CODER_GPU_LAYERS}
ENV CODER_MAX_TOKENS=${CODER_MAX_TOKENS}

# -------- llama-swap --------
# llama-swap is a small proxy that starts and stops llama-server instances on demand,
# swapping models per request according to its YAML config (see config.preset.yaml below)
RUN curl -L https://github.com/mostlygeek/llama-swap/releases/download/v165/llama-swap_165_linux_amd64.tar.gz -o /tmp/llama-swap.tar.gz \
    && tar -xzf /tmp/llama-swap.tar.gz -C /app \
    && rm /tmp/llama-swap.tar.gz

# -------- start/stop scripts --------
# Note: --threads -1 --threads-batch -1 is used so llama.cpp autotunes the thread counts itself

COPY ./Scripts/ /app/Scripts/
RUN chmod +x /app/Scripts/*.sh

# -------- Copy preset config file --------
COPY ./config.preset.yaml /app/config.preset.yaml
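
# A minimal sketch of what a llama-swap preset entry might look like (illustrative only; the
# model name, flags, and whether/where the ${...} variables get expanded are assumptions that
# depend on entrypoint.sh, not the actual contents of config.preset.yaml):
#   models:
#     "coder":
#       cmd: >
#         /app/llama-server ${CODER_MODEL}
#         -c ${CODER_CONTEXT_SIZE} -ngl ${CODER_GPU_LAYERS} -n ${CODER_MAX_TOKENS}
#         --threads -1 --threads-batch -1 --port ${PORT}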

# -------- Copy entrypoint script --------
COPY ./entrypoint.sh /app/entrypoint.sh
RUN chmod +x /app/entrypoint.sh

ENTRYPOINT ["/app/entrypoint.sh"]
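
# Smoke test once the container is up (sketch; the host port and model names depend on your
# run command and on config.preset.yaml):
#   curl http://localhost:8080/v1/models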