# llamacpp-multi.container — Podman Quadlet unit (podman-systemd.unit(5))
# Runs a multi-instance llama.cpp server (Vulkan build, amd64, AMD GPU).
# After editing: `systemctl daemon-reload && systemctl restart llamacpp-multi`.

[Container]
ContainerName=llamacpp-multi
Image=localhost/llamacpp:vulkan-multi-amd64
# Auto-update disabled: the image is built locally (localhost/…), so there is
# no registry for podman-auto-update to pull from.
# AutoUpdate=registry
Network=internal.network
PublishPort=8090:8090

# Production - Lemonade uses the Hugging Face Hub for the models
Volume=/srv/containers/aitools/models:/root/.cache/llama.cpp

# ROCm tuning: pass the GPU render node through and keep supplementary
# groups so the container user can access /dev/dri; share host IPC.
AddDevice=/dev/dri/renderD128
PodmanArgs=--group-add=keep-groups --ipc=host
SecurityLabelType=container_runtime_t

# Multi-instance configuration (throughput optimized)
Environment=LLAMA_INSTANCES=4
Environment=LLAMA_BASE_PORT=9000
Environment=LLAMA_ARG_HOST=0.0.0.0
Environment=LLAMA_ARG_PARALLEL=32
Environment=LLAMA_ARG_THREADS=16
Environment=LLAMA_ARG_BATCH_SIZE=2048
Environment=LLAMA_ARG_CTX_SIZE=131072
Environment=LLAMA_ARG_HF_REPO=unsloth/Qwen3-Coder-Next-GGUF:Q2_K_XL

# Hugging Face cache location and credentials
Environment=HF_HOME=/root/.cache/huggingface
# SECURITY(review): plaintext API token committed in a unit file. Rotate this
# token and move it out of here — e.g. `EnvironmentFile=` pointing at a file
# with 0600 permissions, or a podman secret:
#   Secret=hf_token,type=env,target=HF_TOKEN
Environment=HF_TOKEN=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

[Service]
# Restart only on failure; allow up to 15 min for first-start model download.
Restart=on-failure
TimeoutStartSec=15m

[Install]
WantedBy=multi-user.target default.target