### LLaMACpp Builder Container with Vulkan for GPUs
### Multi-stage: download stage with pre-built binaries, runtime stage with only runtime libraries
###
### BUILD:  podman build -t llamacpp:vulkan-amd64 -f llama-vulkan.Containerfile .
### Export: podman save -o /home/duckpage/llamacpp-vulkan-amd64.tar localhost/llamacpp:vulkan-amd64

ARG UBUNTU_VERSION=24.04

### Download image
FROM ubuntu:${UBUNTU_VERSION} AS download

RUN apt-get update \
    && apt-get install -y curl unzip grep sed \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /tmp

# Resolve the latest release tag from the GitHub redirect, then fetch the pre-built Vulkan x64 bundle
RUN VERSION=$(curl -s -I https://github.com/ggml-org/llama.cpp/releases/latest | grep -i location | sed 's|.*/tag/||' | tr -d '\r') \
    && echo "Latest llama.cpp version: $VERSION" \
    && curl -L https://github.com/ggml-org/llama.cpp/releases/download/${VERSION}/llama-${VERSION}-bin-ubuntu-vulkan-x64.zip -o llama.zip \
    && unzip llama.zip \
    && rm llama.zip \
    && if [ -d llama-* ]; then mv llama-*/* . && rmdir llama-*; elif [ -d build ]; then mv build/* . && rmdir build; fi \
    && if [ -d bin ]; then mv bin/* . && rmdir bin; fi # flatten further

RUN mkdir -p /app/lib /app/full \
    && find . -name "*.so" -exec cp {} /app/lib \; \
    && cp -r * /app/full 2>/dev/null || true \
    && ls -la /app/full # list contents

## Base image
FROM ubuntu:${UBUNTU_VERSION} AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl nano ca-certificates wget \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=download /app/lib/ /app

### Full
FROM base AS full

COPY --from=download /app/full /app
RUN chmod +x /app/llama-server

WORKDIR /app

# libvulkan-dev provides the Vulkan loader; the GPU's ICD (e.g. Mesa) must still be available at runtime
RUN apt-get update \
    && apt-get install -y \
        libvulkan-dev \
        git \
        python3-pip \
        python3 \
        python3-wheel \
    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -U "huggingface_hub[cli]" \
    && if [ -f requirements.txt ]; then pip install --break-system-packages -r requirements.txt; fi \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

# -------- Model args (prefer Q6 to keep mmap on and avoid load issues) --------
ARG GENERAL_FAST_MODEL="-m models/gemma-3-1b-it-Q5_K_M.gguf"
ARG GENERAL_MODEL="-m models/gpt-oss-20b-Q4_K_M.gguf"
ARG CHAT_MODEL="-m models/Qwen3-VL-30B-A3B-Q4_K_S.gguf"
ARG CODER_MODEL="-m models/Qwen3-Coder-30B-A3B-Instruct-Q6_K.gguf"
ARG EMBEDDING_FAST_MODEL="-m models/embeddinggemma-300M-Q8_0.gguf"
ARG EMBEDDING_MODEL="-m models/bge-code-v1-q6_k.gguf"

# -------- Runtime defaults --------
ARG GENERAL_CONTEXT_SIZE=16384
ARG GENERAL_GPU_LAYERS=99
ARG GENERAL_MAX_TOKENS=512
ARG CODER_CONTEXT_SIZE=131072
ARG CODER_GPU_LAYERS=99
ARG CODER_MAX_TOKENS=512

ENV GENERAL_FAST_MODEL=${GENERAL_FAST_MODEL}
ENV GENERAL_MODEL=${GENERAL_MODEL}
ENV CHAT_MODEL=${CHAT_MODEL}
ENV CODER_MODEL=${CODER_MODEL}
ENV EMBEDDING_FAST_MODEL=${EMBEDDING_FAST_MODEL}
ENV EMBEDDING_MODEL=${EMBEDDING_MODEL}
ENV GENERAL_CONTEXT_SIZE=${GENERAL_CONTEXT_SIZE}
ENV GENERAL_GPU_LAYERS=${GENERAL_GPU_LAYERS}
ENV GENERAL_MAX_TOKENS=${GENERAL_MAX_TOKENS}
ENV CODER_CONTEXT_SIZE=${CODER_CONTEXT_SIZE}
ENV CODER_GPU_LAYERS=${CODER_GPU_LAYERS}
ENV CODER_MAX_TOKENS=${CODER_MAX_TOKENS}

# -------- llama-swap --------
RUN curl -L https://github.com/mostlygeek/llama-swap/releases/download/v165/llama-swap_165_linux_amd64.tar.gz -o /tmp/llama-swap.tar.gz \
    && tar -xzf /tmp/llama-swap.tar.gz -C /app \
    && rm /tmp/llama-swap.tar.gz
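
# llama-swap proxies OpenAI-style requests and starts/stops llama-server instances on demand,
# driven by the config.preset.yaml copied below. Illustrative sketch of a single model entry,
# assuming llama-swap's "models:"/"cmd:" schema and that the entrypoint expands the *_MODEL /
# *_CONTEXT_SIZE / *_GPU_LAYERS env vars into the preset (the repo's config.preset.yaml is
# the actual source of truth):
#
#   models:
#     "coder":
#       cmd: |
#         /app/llama-server --port ${PORT}
#         ${CODER_MODEL} -c ${CODER_CONTEXT_SIZE} -ngl ${CODER_GPU_LAYERS}
#         --threads -1 --threads-batch -1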
# -------- start/stop scripts --------
# Note: --threads -1 --threads-batch -1 is used so llama.cpp autotunes the thread counts
COPY ./Scripts/ /app/Scripts/
RUN chmod +x /app/Scripts/*.sh

# -------- Copy preset config file --------
COPY ./config.preset.yaml /app/config.preset.yaml

# -------- Copy entrypoint script --------
COPY ./entrypoint.sh /app/entrypoint.sh
RUN chmod +x /app/entrypoint.sh

ENTRYPOINT ["/app/entrypoint.sh"]
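
### RUN (illustrative example, not from this repo; the device path, published port, and model
### mount are assumptions for a typical AMD Vulkan host with llama-swap on its default 8080 port):
###   podman run -d --name llamacpp --device /dev/dri -p 8080:8080 \
###     -v /path/to/models:/app/models:Z localhost/llamacpp:vulkan-amd64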