Split config files

2026-02-05 23:27:16 +01:00
--- a/Services/llamacpp-multi.Containerfile
+++ b/Services/llamacpp-multi.Containerfile
@@ -1,61 +0,0 @@
 ### LLaMACpp Multi-Instance Container with Nginx Load Balancer
 ### Based on llama-throughput-lab for maximum throughput
 ### Multiple llama-server instances + nginx for load balancing
 ###
 ### BUILD: podman build -t llamacpp:vulkan-multi-amd64 -f llamacpp-multi.Containerfile .
 ### Export: podman save -o /home/badstorm/llamacpp-vulkan-multi-amd64.tar localhost/llamacpp:vulkan-multi-amd64
 FROM ubuntu:24.04
 USER root
 EXPOSE 8090 9000 9001 9002 9003
 RUN apt-get update \
    && apt-get install -y curl tar grep sed git ffmpeg nano python3-pip python3 python3-wheel nginx supervisor \
    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -U "huggingface_hub[cli]" \
    && if [ -f requirements.txt ]; then pip install --break-system-packages -r requirements.txt; fi \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && rm -rf /var/lib/apt/lists/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete
 WORKDIR /app
 RUN VERSION=$(curl -s https://api.github.com/repos/ggml-org/llama.cpp/releases/latest | grep '"tag_name"' | head -1 | sed 's/.*"tag_name": "\([^"]*\)".*/\1/') \
    && echo "Last llama.cpp version: $VERSION" \
    && curl -L https://github.com/ggml-org/llama.cpp/releases/download/${VERSION}/llama-${VERSION}-bin-ubuntu-vulkan-x64.tar.gz -o llama.tar.gz \
    && tar -xzf llama.tar.gz -C . --strip-components=1 \
    && rm llama.tar.gz
 RUN chmod +x /app/llama-server
 # Create startup script for multiple instances
 RUN mkdir -p /app/bin /var/log && printf '#!/bin/bash\nset -e\n\nINSTANCES=${LLAMA_INSTANCES:-2}\nBASE_PORT=${LLAMA_BASE_PORT:-9000}\nREADY_TIMEOUT=${LLAMA_READY_TIMEOUT:-600}\n\necho "Starting $INSTANCES llama-server instances on ports $BASE_PORT-$((BASE_PORT+INSTANCES-1))"\n\nfor ((i=0; i<INSTANCES; i++)); do\n    PORT=$((BASE_PORT + i))\n    echo "Starting instance $((i+1))/$INSTANCES on port $PORT..."\n    LLAMA_ARG_PORT=$PORT /app/llama-server \\\n        > /var/log/llama-server-$PORT.log 2>&1 &\n    sleep 3\ndone\n\necho "Waiting for servers to be ready..."\nfor ((i=0; i<INSTANCES; i++)); do\n    PORT=$((BASE_PORT + i))\n    elapsed=0\n    while [ $elapsed -lt $READY_TIMEOUT ]; do\n        if curl -s http://127.0.0.1:$PORT/health > /dev/null 2>&1; then\n            echo "Instance on port $PORT is ready"\n            break\n        fi\n        sleep 5\n        elapsed=$((elapsed + 5))\n    done\n    if [ $elapsed -ge $READY_TIMEOUT ]; then\n        echo "ERROR: Server on port $PORT did not become ready after ${READY_TIMEOUT}s"\n    fi\ndone\n\necho "All instances ready. Monitoring logs..."\ntail -f /var/log/llama-server-*.log &\nwait\n' > /app/bin/start-multi-servers.sh && chmod +x /app/bin/start-multi-servers.sh
 # Create nginx config template
 RUN mkdir -p /etc/nginx/conf.d && printf 'upstream llama_backend {\n    least_conn;\n    server 127.0.0.1:9000 max_fails=3 fail_timeout=30s;\n    server 127.0.0.1:9001 max_fails=3 fail_timeout=30s;\n    server 127.0.0.1:9002 max_fails=3 fail_timeout=30s;\n    server 127.0.0.1:9003 max_fails=3 fail_timeout=30s;\n}\n\nserver {\n    listen 8090;\n    server_name _;\n    \n    client_max_body_size 512M;\n    \n    location / {\n        proxy_pass http://llama_backend;\n        proxy_http_version 1.1;\n        proxy_set_header Upgrade $http_upgrade;\n        proxy_set_header Connection "upgrade";\n        proxy_set_header Host $host;\n        proxy_set_header X-Real-IP $remote_addr;\n        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;\n        proxy_set_header X-Forwarded-Proto $scheme;\n        proxy_buffering off;\n        proxy_request_buffering off;\n        proxy_read_timeout 600s;\n        proxy_connect_timeout 30s;\n    }\n    \n    location /health {\n        access_log off;\n        return 200 "healthy\\n";\n        add_header Content-Type text/plain;\n    }\n}\n' > /etc/nginx/conf.d/llama-upstream.conf
 # Create supervisor config for managing both nginx and servers
 RUN mkdir -p /etc/supervisor/conf.d && printf '[supervisord]\nnodaemon=true\nlogfile=/var/log/supervisor/supervisord.log\n\n[program:nginx]\ncommand=/usr/sbin/nginx -g "daemon off;"\nautostart=true\nautorestart=true\nstderr_logfile=/var/log/nginx/error.log\nstdout_logfile=/var/log/nginx/access.log\n\n[program:llama-servers]\ncommand=/app/bin/start-multi-servers.sh\nautostart=true\nautorestart=false\nstderr_logfile=/var/log/llama-servers.log\nstdout_logfile=/var/log/llama-servers.log\n' > /etc/supervisor/conf.d/llama-multi.conf
 WORKDIR /app
 ENV PATH=/app:/app/bin:$PATH
 ENV LD_LIBRARY_PATH=/app:$LD_LIBRARY_PATH
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
 ENV LLAMA_INSTANCES=2
 ENV LLAMA_BASE_PORT=9000
 ENV LLAMA_ARG_PARALLEL=32
 ENV LLAMA_ARG_THREADS=16
 ENV LLAMA_ARG_BATCH_SIZE=2048
 ENV LLAMA_ARG_CTX_SIZE=131072
 ENV LLAMA_ARG_HF_REPO=unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q2_K
 ENV LLAMA_ARG_HOST=0.0.0.0
 ENV LLAMA_READY_TIMEOUT=600
 ENTRYPOINT ["/usr/bin/supervisord"]
 CMD ["-c", "/etc/supervisor/conf.d/llama-multi.conf"]
--- a/Services/llamacpp-multi/README.md
+++ b/Services/llamacpp-multi/README.md
@@ -0,0 +1,129 @@
 # LLaMACpp Multi-Instance Setup
 Guida per configurare e scalare il numero di istanze di llama-server con load balancing nginx.
 ## Struttura Attuale
 - **4 istanze** di llama-server (porte 9000-9003)
 - **Nginx** come load balancer (porta 8090)
 - **Supervisor** per gestire tutti i processi
 ## Aggiungere Istanze
 Se vuoi aumentare il numero di istanze, segui questi step:
 ### 1. Modifica il Containerfile
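 File: `llamacpp-multi.Containerfile` (attenzione: se `LLAMA_INSTANCES` è impostata anche nel file `.container`, quel valore sovrascrive quello del Containerfile, quindi va aggiornato anche lì)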
 Cambia:
 ```dockerfile
 ENV LLAMA_INSTANCES=4
 ```
 Con il numero di istanze desiderato (es. 6):
 ```dockerfile
 ENV LLAMA_INSTANCES=6
 ```
 ### 2. Aggiorna la Configurazione Nginx
 File: `llama-upstream.conf`
 Aggiungi i server sulle nuove porte nel blocco `upstream llama_backend`:
 ```nginx
 upstream llama_backend {
    least_conn;
    server 127.0.0.1:9000 max_fails=3 fail_timeout=30s;
    server 127.0.0.1:9001 max_fails=3 fail_timeout=30s;
    server 127.0.0.1:9002 max_fails=3 fail_timeout=30s;
    server 127.0.0.1:9003 max_fails=3 fail_timeout=30s;
    server 127.0.0.1:9004 max_fails=3 fail_timeout=30s;  # NUOVO
    server 127.0.0.1:9005 max_fails=3 fail_timeout=30s;  # NUOVO
 }
 ```
 ### 3. Aggiorna il Containerfile con le porte esposte
 File: `llamacpp-multi.Containerfile`
 Aggiungi le nuove porte:
 ```dockerfile
 EXPOSE 8090 9000 9001 9002 9003 9004 9005
 ```
 ### 4. Ricompila il Container
 ```bash
 cd /home/badstorm/Source/bdi/bdi_podman_serverconf/Services/llamacpp-multi
 podman build -t llamacpp:vulkan-multi-amd64 -f llamacpp-multi.Containerfile .
 ```
 ### 5. Riavvia il Servizio
 ```bash
 systemctl restart llamacpp-multi
 ```
 ## Considerazioni di Risorse
 Ogni istanza consuma:
 - **~8GB VRAM** (dipende dal modello e da `LLAMA_ARG_CTX_SIZE`)
 - **~1-2 CPU core** (dipende dal carico)
 **Con GPU AMD Radeon (RENOIR):**
 - 2 istanze: ✅ Stabile
 - 4 istanze: ⚠️ Funziona ma monitorare memoria
 - 6+ istanze: ❌ Probabilmente VRAM insufficiente
 Monitora con:
 ```bash
 podman stats llamacpp-multi
 ```
 ## Variabili di Ambiente Modificabili
 Nel file `.container` puoi sovrascrivere:
 ```ini
 Environment=LLAMA_ARG_PARALLEL=32
 Environment=LLAMA_ARG_THREADS=16
 Environment=LLAMA_ARG_BATCH_SIZE=2048
 Environment=LLAMA_ARG_CTX_SIZE=131072
 Environment=LLAMA_ARG_HF_REPO=unsloth/Qwen3-Coder-Next-GGUF:Q2_K_XL
 Environment=LLAMA_READY_TIMEOUT=600
 ```
 ## Testing
 Una volta avviate le istanze, testa:
 ```bash
 curl http://localhost:8090/v1/models
 ```
 Dovresti vedere il modello elencato se tutte le istanze sono pronte.
 Test di carico (concurrent requests):
 ```bash
 for i in {1..10}; do
  curl -X POST http://localhost:8090/completion \
    -H "Content-Type: application/json" \
    -d '{"prompt": "Once upon a time", "n_predict": 64}' &
 done
 wait
 ```
 ## Troubleshooting
 **502 Bad Gateway:**
 ```bash
 podman exec llamacpp-multi tail -f /var/log/llama-server-9000.log
 ```
 **Timeout Ready:**
 Aumenta `LLAMA_READY_TIMEOUT` se il modello impiega più di 10 minuti a caricare.
 **Out of Memory:**
 Riduci `LLAMA_ARG_PARALLEL`, `LLAMA_ARG_BATCH_SIZE`, o `LLAMA_ARG_CTX_SIZE`.
--- a/Services/llamacpp-multi/llama-multi.conf
+++ b/Services/llamacpp-multi/llama-multi.conf
@@ -0,0 +1,17 @@
 ; Supervisor configuration for the multi-instance llama.cpp container.
 ; It manages two programs: the nginx load balancer and the launcher script
 ; that starts the llama-server instances.
 [supervisord]
 ; Run supervisord in the foreground instead of daemonizing.
 nodaemon=true
 logfile=/var/log/supervisor/supervisord.log
 [program:nginx]
 ; "daemon off;" keeps nginx in the foreground so supervisor can track it.
 command=/usr/sbin/nginx -g "daemon off;"
 autostart=true
 ; nginx is restarted automatically if it exits.
 autorestart=true
 stderr_logfile=/var/log/nginx/error.log
 stdout_logfile=/var/log/nginx/access.log
 [program:llama-servers]
 ; Launcher script: starts the llama-server instances and then tails their logs.
 command=/app/bin/start-multi-servers.sh
 autostart=true
 ; The launcher is NOT restarted automatically if it exits.
 autorestart=false
 ; stdout and stderr of the launcher are both written to the same file.
 stderr_logfile=/var/log/llama-servers.log
 stdout_logfile=/var/log/llama-servers.log
--- a/Services/llamacpp-multi/llama-upstream.conf
+++ b/Services/llamacpp-multi/llama-upstream.conf
@@ -0,0 +1,35 @@
 # Pool of local llama-server instances. least_conn routes each request to the
 # server with the fewest active connections.
 # NOTE: the port list is static. It has to match the ports actually started by
 # start-multi-servers.sh (LLAMA_BASE_PORT .. LLAMA_BASE_PORT+LLAMA_INSTANCES-1);
 # a port with no instance behind it is only taken out of rotation after
 # max_fails failed attempts within fail_timeout.
 upstream llama_backend {
    least_conn;
    server 127.0.0.1:9000 max_fails=3 fail_timeout=30s;
    server 127.0.0.1:9001 max_fails=3 fail_timeout=30s;
    server 127.0.0.1:9002 max_fails=3 fail_timeout=30s;
    server 127.0.0.1:9003 max_fails=3 fail_timeout=30s;
 }
 # Public listener: proxies every request to the llama_backend upstream.
 server {
    listen 8090;
    server_name _;
    # Accept request bodies up to 512 MiB.
    client_max_body_size 512M;
    location / {
        proxy_pass http://llama_backend;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        # Pass request and response data through without buffering so that
        # streamed responses reach the client as they are produced.
        proxy_buffering off;
        proxy_request_buffering off;
        # Allow up to 600s between reads from a backend before giving up.
        proxy_read_timeout 600s;
        proxy_connect_timeout 30s;
    }
    # Static probe answered by nginx itself; it does NOT check the backends.
    location /health {
        access_log off;
        # Set the MIME type with default_type: `return` already emits a
        # Content-Type header, so add_header would send a second, conflicting one.
        default_type text/plain;
        return 200 "healthy\n";
    }
 }
--- a/Services/llamacpp-multi/llamacpp-multi.Containerfile
+++ b/Services/llamacpp-multi/llamacpp-multi.Containerfile
@@ -0,0 +1,62 @@
 ### LLaMACpp Multi-Instance Container with Nginx Load Balancer
 ### Based on llama-throughput-lab for maximum throughput
 ### Multiple llama-server instances + nginx for load balancing
 ###
 ### BUILD: podman build -t llamacpp:vulkan-multi-amd64 -f llamacpp-multi.Containerfile .
 ### Export: podman save -o /home/badstorm/llamacpp-vulkan-multi-amd64.tar localhost/llamacpp:vulkan-multi-amd64
 FROM ubuntu:24.04
 USER root
 # 8090 = nginx load balancer, 9000-9003 = llama-server instances.
 # Keep this list in sync with llama-upstream.conf and LLAMA_INSTANCES below.
 EXPOSE 8090 9000 9001 9002 9003
 # System packages, Python tooling, nginx and supervisor; caches are removed in
 # the same layer to keep the image small.
 # NOTE(review): nothing copies a requirements.txt into the image before this
 # step, so the requirements.txt branch is presumably a no-op -- confirm.
 RUN apt-get update \
    && apt-get install -y curl tar grep sed git ffmpeg nano python3-pip python3 python3-wheel nginx supervisor \
    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -U "huggingface_hub[cli]" \
    && if [ -f requirements.txt ]; then pip install --break-system-packages -r requirements.txt; fi \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && rm -rf /var/lib/apt/lists/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete
 WORKDIR /app
 # Download the latest llama.cpp Vulkan release. curl -f makes HTTP errors
 # (e.g. GitHub API rate limiting, missing asset) fail the build immediately,
 # and the empty-VERSION check prevents building a malformed download URL.
 RUN VERSION=$(curl -fsS https://api.github.com/repos/ggml-org/llama.cpp/releases/latest | grep '"tag_name"' | head -1 | sed 's/.*"tag_name": "\([^"]*\)".*/\1/') \
    && test -n "$VERSION" \
    && echo "Last llama.cpp version: $VERSION" \
    && curl -fL https://github.com/ggml-org/llama.cpp/releases/download/${VERSION}/llama-${VERSION}-bin-ubuntu-vulkan-x64.tar.gz -o llama.tar.gz \
    && tar -xzf llama.tar.gz -C . --strip-components=1 \
    && rm llama.tar.gz
 RUN chmod +x /app/llama-server
 # Copy startup script for multiple instances
 COPY start-multi-servers.sh /app/bin/
 RUN chmod +x /app/bin/start-multi-servers.sh
 # Copy nginx config
 COPY llama-upstream.conf /etc/nginx/conf.d/
 # Copy supervisor config
 COPY llama-multi.conf /etc/supervisor/conf.d/
 WORKDIR /app
 ENV PATH=/app:/app/bin:$PATH
 # LD_LIBRARY_PATH is not defined earlier in this file, so appending
 # ":$LD_LIBRARY_PATH" would leave a trailing empty entry, which the dynamic
 # loader treats as the current directory. Set it explicitly instead.
 ENV LD_LIBRARY_PATH=/app
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
 # Defaults read by start-multi-servers.sh (LLAMA_INSTANCES, LLAMA_BASE_PORT,
 # LLAMA_READY_TIMEOUT); the LLAMA_ARG_* values are left in the environment for
 # llama-server. All of them can be overridden at run time.
 ENV LLAMA_INSTANCES=4
 ENV LLAMA_BASE_PORT=9000
 ENV LLAMA_ARG_PARALLEL=32
 ENV LLAMA_ARG_THREADS=16
 ENV LLAMA_ARG_BATCH_SIZE=2048
 ENV LLAMA_ARG_CTX_SIZE=131072
 ENV LLAMA_ARG_HF_REPO=unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q2_K
 ENV LLAMA_ARG_HOST=0.0.0.0
 ENV LLAMA_READY_TIMEOUT=600
 # supervisord runs nginx and the launcher script (see llama-multi.conf).
 ENTRYPOINT ["/usr/bin/supervisord"]
 CMD ["-c", "/etc/supervisor/conf.d/llama-multi.conf"]
--- a/Services/llamacpp-multi/llamacpp-multi.container
+++ b/Services/llamacpp-multi/llamacpp-multi.container
@@ -14,15 +14,14 @@ PodmanArgs=--group-add=keep-groups --ipc=host
 SecurityLabelType=container_runtime_t
 # Multi-instance configuration (throughput optimized)
-Environment=LLAMA_INSTANCES=2
+Environment=LLAMA_INSTANCES=4
 Environment=LLAMA_BASE_PORT=9000
 Environment=LLAMA_ARG_HOST=0.0.0.0
 Environment=LLAMA_ARG_PARALLEL=32
 Environment=LLAMA_ARG_THREADS=16
 Environment=LLAMA_ARG_BATCH_SIZE=2048
 Environment=LLAMA_ARG_UBATCH=512
 Environment=LLAMA_ARG_CTX_SIZE=131072
-Environment=LLAMA_ARG_HF_REPO=unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q2_K
+Environment=LLAMA_ARG_HF_REPO=unsloth/Qwen3-Coder-Next-GGUF:Q2_K_XL
 # HF
 Environment=HF_HOME=/root/.cache/huggingface
--- a/Services/llamacpp-multi/start-multi-servers.sh
+++ b/Services/llamacpp-multi/start-multi-servers.sh
@@ -0,0 +1,37 @@
 #!/bin/bash
 # Start several llama-server instances on consecutive ports, wait until each
 # one reports healthy, then stream their logs.
 #
 # Environment:
 #   LLAMA_INSTANCES      number of instances to start (default: 2)
 #   LLAMA_BASE_PORT      port of the first instance (default: 9000)
 #   LLAMA_READY_TIMEOUT  per-instance readiness timeout, seconds (default: 600)
 #
 # llama-server is started without arguments; only LLAMA_ARG_PORT is set per
 # instance here, the rest of its configuration is expected to come from the
 # inherited environment.
 set -euo pipefail
 INSTANCES=${LLAMA_INSTANCES:-2}
 BASE_PORT=${LLAMA_BASE_PORT:-9000}
 READY_TIMEOUT=${LLAMA_READY_TIMEOUT:-600}
 # Seconds between two readiness probes of the same instance.
 POLL_INTERVAL=5
 # PIDs of the started instances, indexed like the ports (BASE_PORT + index).
 pids=()
 echo "Starting $INSTANCES llama-server instances on ports $BASE_PORT-$((BASE_PORT+INSTANCES-1))"
 for ((i = 0; i < INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    echo "Starting instance $((i+1))/$INSTANCES on port $port..."
    LLAMA_ARG_PORT=$port /app/llama-server \
        > "/var/log/llama-server-$port.log" 2>&1 &
    pids+=("$!")
    # Stagger the launches so the instances do not all initialise at once.
    sleep 3
 done
 echo "Waiting for servers to be ready..."
 not_ready=0
 for ((i = 0; i < INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    pid=${pids[i]}
    elapsed=0
    ready=0
    while ((elapsed < READY_TIMEOUT)); do
        # -f makes curl fail on HTTP error statuses, so an instance that is
        # listening but still answers /health with an error (e.g. while the
        # model is loading) is not mistaken for a ready one.
        if curl -sf "http://127.0.0.1:$port/health" > /dev/null 2>&1; then
            ready=1
            break
        fi
        # Stop polling early if the process has already exited.
        if ! kill -0 "$pid" 2> /dev/null; then
            break
        fi
        sleep "$POLL_INTERVAL"
        elapsed=$((elapsed + POLL_INTERVAL))
    done
    if ((ready)); then
        echo "Instance on port $port is ready"
    elif ! kill -0 "$pid" 2> /dev/null; then
        echo "ERROR: Server on port $port exited before becoming ready (see /var/log/llama-server-$port.log)" >&2
        not_ready=$((not_ready + 1))
    else
        echo "ERROR: Server on port $port did not become ready after ${READY_TIMEOUT}s" >&2
        not_ready=$((not_ready + 1))
    fi
 done
 # Report the real outcome instead of unconditionally claiming success.
 if ((not_ready == 0)); then
    echo "All instances ready. Monitoring logs..."
 else
    echo "WARNING: $not_ready of $INSTANCES instances are not ready. Monitoring logs..." >&2
 fi
 # Keep the script in the foreground: follow the per-instance logs and block
 # on the background jobs.
 tail -f /var/log/llama-server-*.log &
 wait