Minor fixes

2026-02-05 22:44:31 +01:00
Parent ceb4a9d3a1
@@ -34,7 +34,7 @@ RUN VERSION=$(curl -s https://api.github.com/repos/ggml-org/llama.cpp/releases/l
 RUN chmod +x /app/llama-server

 # Create startup script for multiple instances
-RUN mkdir -p /app/bin /var/log && printf '#!/bin/bash\nset -e\n\nINSTANCES=${LLAMA_INSTANCES:-2}\nBASE_PORT=${LLAMA_BASE_PORT:-9000}\nPARALLEL=${LLAMA_ARG_PARALLEL:-32}\nTHREADS=${LLAMA_ARG_THREADS:-16}\nBATCH_SIZE=${LLAMA_ARG_BATCH_SIZE:-2048}\nUBATCH=${LLAMA_ARG_UBATCH:-512}\nCTX_SIZE=${LLAMA_ARG_CTX_SIZE:-131072}\nHF_REPO=${LLAMA_ARG_HF_REPO:-unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q2_K}\nHOST=${LLAMA_ARG_HOST:-0.0.0.0}\n\necho "Starting $INSTANCES llama-server instances on ports $BASE_PORT-$((BASE_PORT+INSTANCES-1))"\n\nfor ((i=0; i<INSTANCES; i++)); do\n    PORT=$((BASE_PORT + i))\n    echo "Starting instance $((i+1))/$INSTANCES on port $PORT..."\n    /app/llama-server \\\n        --host $HOST \\\n        --port $PORT \\\n        --parallel $PARALLEL \\\n        --threads $THREADS \\\n        --batch-size $BATCH_SIZE \\\n        --ubatch $UBATCH \\\n        --ctx-size $CTX_SIZE \\\n        --hf-repo $HF_REPO \\\n        --no-mmap \\\n        --log-disable \\\n        > /var/log/llama-server-$PORT.log 2>&1 &\n    sleep 2\ndone\n\necho "All instances started. Monitoring logs..."\ntail -f /var/log/llama-server-*.log &\nwait\n' > /app/bin/start-multi-servers.sh && chmod +x /app/bin/start-multi-servers.sh
+RUN mkdir -p /app/bin /var/log && printf '#!/bin/bash\nset -e\n\n# Launch LLAMA_INSTANCES llama-server processes on consecutive ports, then\n# wait until each reports healthy or LLAMA_READY_TIMEOUT seconds elapse.\nINSTANCES=${LLAMA_INSTANCES:-2}\nBASE_PORT=${LLAMA_BASE_PORT:-9000}\nREADY_TIMEOUT=${LLAMA_READY_TIMEOUT:-600}\nnot_ready=0\n\necho "Starting $INSTANCES llama-server instances on ports $BASE_PORT-$((BASE_PORT+INSTANCES-1))"\n\nfor ((i=0; i<INSTANCES; i++)); do\n    PORT=$((BASE_PORT + i))\n    echo "Starting instance $((i+1))/$INSTANCES on port $PORT..."\n    # Only the port is set per instance; no flags are passed, so the remaining\n    # settings are expected to come from the LLAMA_ARG_* environment.\n    LLAMA_ARG_PORT=$PORT /app/llama-server \\\n        > /var/log/llama-server-$PORT.log 2>&1 &\n    sleep 3\ndone\n\necho "Waiting for servers to be ready..."\nfor ((i=0; i<INSTANCES; i++)); do\n    PORT=$((BASE_PORT + i))\n    elapsed=0\n    while [ $elapsed -lt $READY_TIMEOUT ]; do\n        # -f makes curl exit non-zero on HTTP errors; /health answers 503\n        # while the model is still loading, which must not count as ready.\n        if curl -sf http://127.0.0.1:$PORT/health > /dev/null 2>&1; then\n            echo "Instance on port $PORT is ready"\n            break\n        fi\n        sleep 5\n        elapsed=$((elapsed + 5))\n    done\n    if [ $elapsed -ge $READY_TIMEOUT ]; then\n        echo "ERROR: Server on port $PORT did not become ready after ${READY_TIMEOUT}s"\n        not_ready=$((not_ready + 1))\n    fi\ndone\n\n# Report the real outcome instead of unconditionally claiming success.\nif [ $not_ready -eq 0 ]; then\n    echo "All instances ready. Monitoring logs..."\nelse\n    echo "WARNING: $not_ready of $INSTANCES instances not ready. Monitoring logs..."\nfi\ntail -f /var/log/llama-server-*.log &\nwait\n' > /app/bin/start-multi-servers.sh && chmod +x /app/bin/start-multi-servers.sh

 # Create nginx config template
 RUN mkdir -p /etc/nginx/conf.d && printf 'upstream llama_backend {\n    least_conn;\n    server 127.0.0.1:9000 max_fails=3 fail_timeout=30s;\n    server 127.0.0.1:9001 max_fails=3 fail_timeout=30s;\n    server 127.0.0.1:9002 max_fails=3 fail_timeout=30s;\n    server 127.0.0.1:9003 max_fails=3 fail_timeout=30s;\n}\n\nserver {\n    listen 8090;\n    server_name _;\n    \n    client_max_body_size 512M;\n    \n    location / {\n        proxy_pass http://llama_backend;\n        proxy_http_version 1.1;\n        proxy_set_header Upgrade $http_upgrade;\n        proxy_set_header Connection "upgrade";\n        proxy_set_header Host $host;\n        proxy_set_header X-Real-IP $remote_addr;\n        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;\n        proxy_set_header X-Forwarded-Proto $scheme;\n        proxy_buffering off;\n        proxy_request_buffering off;\n        proxy_read_timeout 600s;\n        proxy_connect_timeout 30s;\n    }\n    \n    location /health {\n        access_log off;\n        return 200 "healthy\\n";\n        add_header Content-Type text/plain;\n    }\n}\n' > /etc/nginx/conf.d/llama-upstream.conf
@@ -52,10 +52,10 @@ ENV LLAMA_BASE_PORT=9000
 ENV LLAMA_ARG_PARALLEL=32
 ENV LLAMA_ARG_THREADS=16
 ENV LLAMA_ARG_BATCH_SIZE=2048
-ENV LLAMA_ARG_UBATCH=512
 ENV LLAMA_ARG_CTX_SIZE=131072
 ENV LLAMA_ARG_HF_REPO=unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q2_K
 ENV LLAMA_ARG_HOST=0.0.0.0
+ENV LLAMA_READY_TIMEOUT=600

 ENTRYPOINT ["/usr/bin/supervisord"]
 CMD ["-c", "/etc/supervisor/conf.d/llama-multi.conf"]
@@ -1,6 +1,6 @@
 [Container]
 ContainerName=llamacpp-multi
-Image=localhost/llamacpp:vulkan-amd64-multi
+Image=localhost/llamacpp:vulkan-multi-amd64
 #AutoUpdate=registry
 Network=internal.network
 PublishPort=8090:8090