diff --git a/Services/llamacpp-multi.Containerfile b/Services/llamacpp-multi.Containerfile deleted file mode 100644 index a61c1dd..0000000 --- a/Services/llamacpp-multi.Containerfile +++ /dev/null @@ -1,61 +0,0 @@ -### LLaMACpp Multi-Instance Container with Nginx Load Balancer -### Based on llama-throughput-lab for maximum throughput -### Multiple llama-server instances + nginx for load balancing -### -### BUILD: podman build -t llamacpp:vulkan-multi-amd64 -f llamacpp-multi.Containerfile . -### Export: podman save -o /home/badstorm/llamacpp-vulkan-multi-amd64.tar localhost/llamacpp:vulkan-multi-amd64 - - -FROM ubuntu:24.04 - -USER root -EXPOSE 8090 9000 9001 9002 9003 - -RUN apt-get update \ - && apt-get install -y curl tar grep sed git ffmpeg nano python3-pip python3 python3-wheel nginx supervisor \ - && pip install --break-system-packages --upgrade setuptools \ - && pip install --break-system-packages -U "huggingface_hub[cli]" \ - && if [ -f requirements.txt ]; then pip install --break-system-packages -r requirements.txt; fi \ - && apt autoremove -y \ - && apt clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && rm -rf /var/lib/apt/lists/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete - -WORKDIR /app - -RUN VERSION=$(curl -s https://api.github.com/repos/ggml-org/llama.cpp/releases/latest | grep '"tag_name"' | head -1 | sed 's/.*"tag_name": "\([^"]*\)".*/\1/') \ - && echo "Last llama.cpp version: $VERSION" \ - && curl -L https://github.com/ggml-org/llama.cpp/releases/download/${VERSION}/llama-${VERSION}-bin-ubuntu-vulkan-x64.tar.gz -o llama.tar.gz \ - && tar -xzf llama.tar.gz -C . 
--strip-components=1 \ - && rm llama.tar.gz - -RUN chmod +x /app/llama-server - -# Create startup script for multiple instances -RUN mkdir -p /app/bin /var/log && printf '#!/bin/bash\nset -e\n\nINSTANCES=${LLAMA_INSTANCES:-2}\nBASE_PORT=${LLAMA_BASE_PORT:-9000}\nREADY_TIMEOUT=${LLAMA_READY_TIMEOUT:-600}\n\necho "Starting $INSTANCES llama-server instances on ports $BASE_PORT-$((BASE_PORT+INSTANCES-1))"\n\nfor ((i=0; i /var/log/llama-server-$PORT.log 2>&1 &\n sleep 3\ndone\n\necho "Waiting for servers to be ready..."\nfor ((i=0; i /dev/null 2>&1; then\n echo "Instance on port $PORT is ready"\n break\n fi\n sleep 5\n elapsed=$((elapsed + 5))\n done\n if [ $elapsed -ge $READY_TIMEOUT ]; then\n echo "ERROR: Server on port $PORT did not become ready after ${READY_TIMEOUT}s"\n fi\ndone\n\necho "All instances ready. Monitoring logs..."\ntail -f /var/log/llama-server-*.log &\nwait\n' > /app/bin/start-multi-servers.sh && chmod +x /app/bin/start-multi-servers.sh - -# Create nginx config template -RUN mkdir -p /etc/nginx/conf.d && printf 'upstream llama_backend {\n least_conn;\n server 127.0.0.1:9000 max_fails=3 fail_timeout=30s;\n server 127.0.0.1:9001 max_fails=3 fail_timeout=30s;\n server 127.0.0.1:9002 max_fails=3 fail_timeout=30s;\n server 127.0.0.1:9003 max_fails=3 fail_timeout=30s;\n}\n\nserver {\n listen 8090;\n server_name _;\n \n client_max_body_size 512M;\n \n location / {\n proxy_pass http://llama_backend;\n proxy_http_version 1.1;\n proxy_set_header Upgrade $http_upgrade;\n proxy_set_header Connection "upgrade";\n proxy_set_header Host $host;\n proxy_set_header X-Real-IP $remote_addr;\n proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;\n proxy_set_header X-Forwarded-Proto $scheme;\n proxy_buffering off;\n proxy_request_buffering off;\n proxy_read_timeout 600s;\n proxy_connect_timeout 30s;\n }\n \n location /health {\n access_log off;\n return 200 "healthy\\n";\n add_header Content-Type text/plain;\n }\n}\n' > /etc/nginx/conf.d/llama-upstream.conf - -# 
Create supervisor config for managing both nginx and servers -RUN mkdir -p /etc/supervisor/conf.d && printf '[supervisord]\nnodaemon=true\nlogfile=/var/log/supervisor/supervisord.log\n\n[program:nginx]\ncommand=/usr/sbin/nginx -g "daemon off;"\nautostart=true\nautorestart=true\nstderr_logfile=/var/log/nginx/error.log\nstdout_logfile=/var/log/nginx/access.log\n\n[program:llama-servers]\ncommand=/app/bin/start-multi-servers.sh\nautostart=true\nautorestart=false\nstderr_logfile=/var/log/llama-servers.log\nstdout_logfile=/var/log/llama-servers.log\n' > /etc/supervisor/conf.d/llama-multi.conf - -WORKDIR /app - -ENV PATH=/app:/app/bin:$PATH -ENV LD_LIBRARY_PATH=/app:$LD_LIBRARY_PATH -ENV HF_HUB_ENABLE_HF_TRANSFER=1 -ENV LLAMA_INSTANCES=2 -ENV LLAMA_BASE_PORT=9000 -ENV LLAMA_ARG_PARALLEL=32 -ENV LLAMA_ARG_THREADS=16 -ENV LLAMA_ARG_BATCH_SIZE=2048 -ENV LLAMA_ARG_CTX_SIZE=131072 -ENV LLAMA_ARG_HF_REPO=unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q2_K -ENV LLAMA_ARG_HOST=0.0.0.0 -ENV LLAMA_READY_TIMEOUT=600 - -ENTRYPOINT ["/usr/bin/supervisord"] -CMD ["-c", "/etc/supervisor/conf.d/llama-multi.conf"] diff --git a/Services/llamacpp-multi/README.md b/Services/llamacpp-multi/README.md new file mode 100644 index 0000000..d7e50e4 --- /dev/null +++ b/Services/llamacpp-multi/README.md @@ -0,0 +1,129 @@ +# LLaMACpp Multi-Instance Setup + +Guida per configurare e scalare il numero di istanze di llama-server con load balancing nginx. + +## Struttura Attuale + +- **4 istanze** di llama-server (porte 9000-9003) +- **Nginx** come load balancer (porta 8090) +- **Supervisor** per gestire tutti i processi + +## Aggiungere Istanze + +Se vuoi aumentare il numero di istanze, segui questi step: + +### 1. Modifica il Containerfile + +File: `llamacpp-multi.Containerfile` + +Cambia: +```dockerfile +ENV LLAMA_INSTANCES=4 +``` + +Con il numero di istanze desiderato (es. 6): +```dockerfile +ENV LLAMA_INSTANCES=6 +``` + +### 2. 
Aggiorna la Configurazione Nginx + +File: `llama-upstream.conf` + +Aggiungi i server sulle nuove porte nel blocco `upstream llama_backend`: + +```nginx +upstream llama_backend { + least_conn; + server 127.0.0.1:9000 max_fails=3 fail_timeout=30s; + server 127.0.0.1:9001 max_fails=3 fail_timeout=30s; + server 127.0.0.1:9002 max_fails=3 fail_timeout=30s; + server 127.0.0.1:9003 max_fails=3 fail_timeout=30s; + server 127.0.0.1:9004 max_fails=3 fail_timeout=30s; # NUOVO + server 127.0.0.1:9005 max_fails=3 fail_timeout=30s; # NUOVO +} +``` + +### 3. Aggiorna il Containerfile con le porte esposte + +File: `llamacpp-multi.Containerfile` + +Aggiungi le nuove porte: +```dockerfile +EXPOSE 8090 9000 9001 9002 9003 9004 9005 +``` + +### 4. Ricompila il Container + +```bash +cd /home/badstorm/Source/bdi/bdi_podman_serverconf/Services/llamacpp-multi +podman build -t llamacpp:vulkan-multi-amd64 -f llamacpp-multi.Containerfile . +``` + +### 5. Riavvia il Servizio + +```bash +systemctl restart llamacpp-multi +``` + +## Considerazioni di Risorse + +Ogni istanza consuma: +- **~8GB VRAM** (dipende dal modello e da `LLAMA_ARG_CTX_SIZE`) +- **~1-2 CPU core** (dipende dal carico) + +**Con GPU AMD Radeon (RENOIR):** +- 2 istanze: ✅ Stabile +- 4 istanze: ⚠️ Funziona ma monitorare memoria +- 6+ istanze: ❌ Probabilmente esaurisce la VRAM + +Monitora con: +```bash +podman stats llamacpp-multi +``` + +## Variabili di Ambiente Modificabili + +Nel file `.container` puoi sovrascrivere: + +```ini +Environment=LLAMA_ARG_PARALLEL=32 +Environment=LLAMA_ARG_THREADS=16 +Environment=LLAMA_ARG_BATCH_SIZE=2048 +Environment=LLAMA_ARG_CTX_SIZE=131072 +Environment=LLAMA_ARG_HF_REPO=unsloth/Qwen3-Coder-Next-GGUF:Q2_K_XL +Environment=LLAMA_READY_TIMEOUT=600 +``` + +## Testing + +Una volta avviate le istanze, testa: + +```bash +curl http://localhost:8090/v1/models +``` + +Dovresti vedere il modello listato se tutte le istanze sono pronte. 
+ +Test di carico (concurrent requests): +```bash +for i in {1..10}; do + curl -X POST http://localhost:8090/completion \ + -H "Content-Type: application/json" \ + -d '{"prompt": "Once upon a time", "n_predict": 64}' & +done +wait +``` + +## Troubleshooting + +**502 Bad Gateway:** +```bash +podman exec llamacpp-multi tail -f /var/log/llama-server-9000.log +``` + +**Timeout Ready:** +Aumenta `LLAMA_READY_TIMEOUT` se il modello impiega più di 10 minuti a caricare. + +**Out of Memory:** +Riduci `LLAMA_ARG_PARALLEL`, `LLAMA_ARG_BATCH_SIZE`, o `LLAMA_ARG_CTX_SIZE`. diff --git a/Services/llamacpp-multi/llama-multi.conf b/Services/llamacpp-multi/llama-multi.conf new file mode 100644 index 0000000..241ce74 --- /dev/null +++ b/Services/llamacpp-multi/llama-multi.conf @@ -0,0 +1,17 @@ +[supervisord] +nodaemon=true +logfile=/var/log/supervisor/supervisord.log + +[program:nginx] +command=/usr/sbin/nginx -g "daemon off;" +autostart=true +autorestart=true +stderr_logfile=/var/log/nginx/error.log +stdout_logfile=/var/log/nginx/access.log + +[program:llama-servers] +command=/app/bin/start-multi-servers.sh +autostart=true +autorestart=false +stderr_logfile=/var/log/llama-servers.log +stdout_logfile=/var/log/llama-servers.log diff --git a/Services/llamacpp-multi/llama-upstream.conf b/Services/llamacpp-multi/llama-upstream.conf new file mode 100644 index 0000000..8a279bd --- /dev/null +++ b/Services/llamacpp-multi/llama-upstream.conf @@ -0,0 +1,35 @@ +upstream llama_backend { + least_conn; + server 127.0.0.1:9000 max_fails=3 fail_timeout=30s; + server 127.0.0.1:9001 max_fails=3 fail_timeout=30s; + server 127.0.0.1:9002 max_fails=3 fail_timeout=30s; + server 127.0.0.1:9003 max_fails=3 fail_timeout=30s; +} + +server { + listen 8090; + server_name _; + + client_max_body_size 512M; + + location / { + proxy_pass http://llama_backend; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_set_header Host $host; + 
proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_buffering off; + proxy_request_buffering off; + proxy_read_timeout 600s; + proxy_connect_timeout 30s; + } + + location /health { + access_log off; + return 200 "healthy\n"; + add_header Content-Type text/plain; + } +} diff --git a/Services/llamacpp-multi/llamacpp-multi.Containerfile b/Services/llamacpp-multi/llamacpp-multi.Containerfile new file mode 100644 index 0000000..9f37af8 --- /dev/null +++ b/Services/llamacpp-multi/llamacpp-multi.Containerfile @@ -0,0 +1,62 @@ +### LLaMACpp Multi-Instance Container with Nginx Load Balancer +### Based on llama-throughput-lab for maximum throughput +### Multiple llama-server instances + nginx for load balancing +### +### BUILD: podman build -t llamacpp:vulkan-multi-amd64 -f llamacpp-multi.Containerfile . +### Export: podman save -o /home/badstorm/llamacpp-vulkan-multi-amd64.tar localhost/llamacpp:vulkan-multi-amd64 + + +FROM ubuntu:24.04 + +USER root +EXPOSE 8090 9000 9001 9002 9003 + +RUN apt-get update \ + && apt-get install -y curl tar grep sed git ffmpeg nano python3-pip python3 python3-wheel nginx supervisor \ + && pip install --break-system-packages --upgrade setuptools \ + && pip install --break-system-packages -U "huggingface_hub[cli]" \ + && if [ -f requirements.txt ]; then pip install --break-system-packages -r requirements.txt; fi \ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && rm -rf /var/lib/apt/lists/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +WORKDIR /app + +RUN VERSION=$(curl -s https://api.github.com/repos/ggml-org/llama.cpp/releases/latest | grep '"tag_name"' | head -1 | sed 's/.*"tag_name": "\([^"]*\)".*/\1/') \ + && echo "Last llama.cpp version: $VERSION" \ + && curl -L 
https://github.com/ggml-org/llama.cpp/releases/download/${VERSION}/llama-${VERSION}-bin-ubuntu-vulkan-x64.tar.gz -o llama.tar.gz \ + && tar -xzf llama.tar.gz -C . --strip-components=1 \ + && rm llama.tar.gz + +RUN chmod +x /app/llama-server + +# Copy startup script for multiple instances +COPY start-multi-servers.sh /app/bin/ +RUN chmod +x /app/bin/start-multi-servers.sh + +# Copy nginx config +COPY llama-upstream.conf /etc/nginx/conf.d/ + +# Copy supervisor config +COPY llama-multi.conf /etc/supervisor/conf.d/ + +WORKDIR /app + +ENV PATH=/app:/app/bin:$PATH +ENV LD_LIBRARY_PATH=/app:$LD_LIBRARY_PATH +ENV HF_HUB_ENABLE_HF_TRANSFER=1 +ENV LLAMA_INSTANCES=4 +ENV LLAMA_BASE_PORT=9000 +ENV LLAMA_ARG_PARALLEL=32 +ENV LLAMA_ARG_THREADS=16 +ENV LLAMA_ARG_BATCH_SIZE=2048 +ENV LLAMA_ARG_CTX_SIZE=131072 +ENV LLAMA_ARG_HF_REPO=unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q2_K +ENV LLAMA_ARG_HOST=0.0.0.0 +ENV LLAMA_READY_TIMEOUT=600 + +ENTRYPOINT ["/usr/bin/supervisord"] +CMD ["-c", "/etc/supervisor/conf.d/llama-multi.conf"] diff --git a/Services/llamacpp-multi.container b/Services/llamacpp-multi/llamacpp-multi.container similarity index 86% rename from Services/llamacpp-multi.container rename to Services/llamacpp-multi/llamacpp-multi.container index 633e21f..82b4c32 100644 --- a/Services/llamacpp-multi.container +++ b/Services/llamacpp-multi/llamacpp-multi.container @@ -14,15 +14,14 @@ PodmanArgs=--group-add=keep-groups --ipc=host SecurityLabelType=container_runtime_t # Multi-instance configuration (throughput optimized) -Environment=LLAMA_INSTANCES=2 +Environment=LLAMA_INSTANCES=4 Environment=LLAMA_BASE_PORT=9000 Environment=LLAMA_ARG_HOST=0.0.0.0 Environment=LLAMA_ARG_PARALLEL=32 Environment=LLAMA_ARG_THREADS=16 Environment=LLAMA_ARG_BATCH_SIZE=2048 -Environment=LLAMA_ARG_UBATCH=512 Environment=LLAMA_ARG_CTX_SIZE=131072 -Environment=LLAMA_ARG_HF_REPO=unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q2_K +Environment=LLAMA_ARG_HF_REPO=unsloth/Qwen3-Coder-Next-GGUF:Q2_K_XL # HF 
Environment=HF_HOME=/root/.cache/huggingface diff --git a/Services/llamacpp-multi/start-multi-servers.sh b/Services/llamacpp-multi/start-multi-servers.sh new file mode 100644 index 0000000..0161d1c --- /dev/null +++ b/Services/llamacpp-multi/start-multi-servers.sh @@ -0,0 +1,37 @@ +#!/bin/bash +set -e + +INSTANCES=${LLAMA_INSTANCES:-2} +BASE_PORT=${LLAMA_BASE_PORT:-9000} +READY_TIMEOUT=${LLAMA_READY_TIMEOUT:-600} + +echo "Starting $INSTANCES llama-server instances on ports $BASE_PORT-$((BASE_PORT+INSTANCES-1))" + +for ((i=0; i /var/log/llama-server-$PORT.log 2>&1 & + sleep 3 +done + +echo "Waiting for servers to be ready..." +for ((i=0; i /dev/null 2>&1; then + echo "Instance on port $PORT is ready" + break + fi + sleep 5 + elapsed=$((elapsed + 5)) + done + if [ $elapsed -ge $READY_TIMEOUT ]; then + echo "ERROR: Server on port $PORT did not become ready after ${READY_TIMEOUT}s" + fi +done + +echo "All instances ready. Monitoring logs..." +tail -f /var/log/llama-server-*.log & +wait