### LLaMACpp Multi-Instance Container with Nginx Load Balancer
### Based on llama-throughput-lab for maximum throughput
### Multiple llama-server instances + nginx for load balancing
###
### BUILD: podman build -t llamacpp:vulkan-multi-amd64 -f llamacpp-multi.Containerfile .
### EXPORT: podman save -o /home/badstorm/llamacpp-vulkan-multi-amd64.tar localhost/llamacpp:vulkan-multi-amd64
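###
### RUN (illustrative; exact GPU flags depend on the host, Vulkan needs the DRI device mapped in):
### podman run -d --device /dev/dri -p 8090:8090 localhost/llamacpp:vulkan-multi-amd64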
FROM ubuntu:24.04

USER root
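# 8090 is the nginx load-balancer entry point; 9000-9003 are the individual llama-server backends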
EXPOSE 8090 9000 9001 9002 9003

RUN apt-get update \
&& apt-get install -y curl tar grep sed git ffmpeg nano python3-pip python3 python3-wheel nginx supervisor \
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -U "huggingface_hub[cli,hf_transfer]" \
&& if [ -f requirements.txt ]; then pip install --break-system-packages -r requirements.txt; fi \
&& apt-get autoremove -y \
&& apt-get clean \
&& rm -rf /tmp/* /var/tmp/* \
&& rm -rf /var/lib/apt/lists/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
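# Note: --break-system-packages is needed because Ubuntu 24.04 marks its system
# Python as externally managed (PEP 668); acceptable inside a single-purpose container.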
WORKDIR /app

RUN VERSION=$(curl -s https://api.github.com/repos/ggml-org/llama.cpp/releases/latest | grep '"tag_name"' | head -1 | sed 's/.*"tag_name": "\([^"]*\)".*/\1/') \
&& echo "Latest llama.cpp version: $VERSION" \
&& curl -L https://github.com/ggml-org/llama.cpp/releases/download/${VERSION}/llama-${VERSION}-bin-ubuntu-vulkan-x64.tar.gz -o llama.tar.gz \
&& tar -xzf llama.tar.gz -C . --strip-components=1 \
&& rm llama.tar.gz
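# Note: the release lookup above always tracks the newest tag, so builds are not
# reproducible; to pin, replace the API call with a fixed VERSION (llama.cpp
# release tags look like bNNNN).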
RUN chmod +x /app/llama-server

# Create startup script for multiple instances
RUN mkdir -p /app/bin /var/log && printf '#!/bin/bash\nset -e\n\nINSTANCES=${LLAMA_INSTANCES:-2}\nBASE_PORT=${LLAMA_BASE_PORT:-9000}\nPARALLEL=${LLAMA_ARG_PARALLEL:-32}\nTHREADS=${LLAMA_ARG_THREADS:-16}\nBATCH_SIZE=${LLAMA_ARG_BATCH_SIZE:-2048}\nUBATCH=${LLAMA_ARG_UBATCH:-512}\nCTX_SIZE=${LLAMA_ARG_CTX_SIZE:-131072}\nHF_REPO=${LLAMA_ARG_HF_REPO:-unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q2_K}\nHOST=${LLAMA_ARG_HOST:-0.0.0.0}\n\necho "Starting $INSTANCES llama-server instances on ports $BASE_PORT-$((BASE_PORT+INSTANCES-1))"\n\nfor ((i=0; i<INSTANCES; i++)); do\n PORT=$((BASE_PORT + i))\n echo "Starting instance $((i+1))/$INSTANCES on port $PORT..."\n /app/llama-server \\\n --host $HOST \\\n --port $PORT \\\n --parallel $PARALLEL \\\n --threads $THREADS \\\n --batch-size $BATCH_SIZE \\\n --ubatch-size $UBATCH \\\n --ctx-size $CTX_SIZE \\\n --hf-repo $HF_REPO \\\n --no-mmap \\\n --log-disable \\\n > /var/log/llama-server-$PORT.log 2>&1 &\n sleep 2\ndone\n\necho "All instances started. Monitoring logs..."\ntail -f /var/log/llama-server-*.log &\nwait\n' > /app/bin/start-multi-servers.sh && chmod +x /app/bin/start-multi-servers.sh
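# Example (hypothetical invocation): run four backends to fill the nginx upstream defined below:
# podman run -d -e LLAMA_INSTANCES=4 --device /dev/dri -p 8090:8090 localhost/llamacpp:vulkan-multi-amd64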
# Create nginx config template
RUN mkdir -p /etc/nginx/conf.d && printf 'upstream llama_backend {\n least_conn;\n server 127.0.0.1:9000 max_fails=3 fail_timeout=30s;\n server 127.0.0.1:9001 max_fails=3 fail_timeout=30s;\n server 127.0.0.1:9002 max_fails=3 fail_timeout=30s;\n server 127.0.0.1:9003 max_fails=3 fail_timeout=30s;\n}\n\nserver {\n listen 8090;\n server_name _;\n \n client_max_body_size 512M;\n \n location / {\n proxy_pass http://llama_backend;\n proxy_http_version 1.1;\n proxy_set_header Upgrade $http_upgrade;\n proxy_set_header Connection "upgrade";\n proxy_set_header Host $host;\n proxy_set_header X-Real-IP $remote_addr;\n proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;\n proxy_set_header X-Forwarded-Proto $scheme;\n proxy_buffering off;\n proxy_request_buffering off;\n proxy_read_timeout 600s;\n proxy_connect_timeout 30s;\n }\n \n location /health {\n access_log off;\n return 200 "healthy\\n";\n add_header Content-Type text/plain;\n }\n}\n' > /etc/nginx/conf.d/llama-upstream.conf
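# The upstream lists all four exposed backend ports; with the default LLAMA_INSTANCES=2,
# the two idle backends fail max_fails checks and nginx retries requests on a live server.
# Smoke test once up (the /health location is answered by nginx itself):
# curl http://localhost:8090/health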
# Create supervisor config for managing both nginx and servers
RUN mkdir -p /etc/supervisor/conf.d && printf '[supervisord]\nnodaemon=true\nlogfile=/var/log/supervisor/supervisord.log\n\n[program:nginx]\ncommand=/usr/sbin/nginx -g "daemon off;"\nautostart=true\nautorestart=true\nstderr_logfile=/var/log/nginx/error.log\nstdout_logfile=/var/log/nginx/access.log\n\n[program:llama-servers]\ncommand=/app/bin/start-multi-servers.sh\nautostart=true\nautorestart=false\nstderr_logfile=/var/log/llama-servers.log\nstdout_logfile=/var/log/llama-servers.log\n' > /etc/supervisor/conf.d/llama-multi.conf
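# Logs inside the container: /var/log/supervisor/supervisord.log, /var/log/llama-servers.log,
# and one /var/log/llama-server-<port>.log per backend, e.g.:
# podman exec <container> tail -f /var/log/llama-servers.log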
WORKDIR /app
ENV PATH=/app:/app/bin:$PATH
ENV LD_LIBRARY_PATH=/app:$LD_LIBRARY_PATH
ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV LLAMA_INSTANCES=2
ENV LLAMA_BASE_PORT=9000
ENV LLAMA_ARG_PARALLEL=32
ENV LLAMA_ARG_THREADS=16
ENV LLAMA_ARG_BATCH_SIZE=2048
ENV LLAMA_ARG_UBATCH=512
ENV LLAMA_ARG_CTX_SIZE=131072
ENV LLAMA_ARG_HF_REPO=unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q2_K
ENV LLAMA_ARG_HOST=0.0.0.0
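# Any of the defaults above can be overridden at run time, e.g. (hypothetical values):
# podman run -d -e LLAMA_ARG_CTX_SIZE=32768 -e LLAMA_ARG_THREADS=8 ... localhost/llamacpp:vulkan-multi-amd64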
ENTRYPOINT ["/usr/bin/supervisord"]
CMD ["-c", "/etc/supervisor/conf.d/llama-multi.conf"]