# llamacpp-multi.container — Podman Quadlet unit (podman-systemd.unit(5))
# Runs a multi-instance llama.cpp server (Vulkan build, amd64, AMD GPU).
# After editing: `systemctl daemon-reload && systemctl restart llamacpp-multi`.

[Container]
ContainerName=llamacpp-multi
Image=localhost/llamacpp:vulkan-multi-amd64
# Auto-update disabled: the image is built locally (localhost/…), so there is
# no registry for podman-auto-update to pull from.
# AutoUpdate=registry
Network=internal.network
PublishPort=8090:8090

# Production - Lemonade uses the Hugging Face Hub for the models
Volume=/srv/containers/aitools/models:/root/.cache/llama.cpp

# ROCm tuning: pass the GPU render node through and keep supplementary
# groups so the container user can access /dev/dri; share host IPC.
AddDevice=/dev/dri/renderD128
PodmanArgs=--group-add=keep-groups --ipc=host
SecurityLabelType=container_runtime_t

# Multi-instance configuration (throughput optimized)
Environment=LLAMA_INSTANCES=4
Environment=LLAMA_BASE_PORT=9000
Environment=LLAMA_ARG_HOST=0.0.0.0
Environment=LLAMA_ARG_PARALLEL=32
Environment=LLAMA_ARG_THREADS=16
Environment=LLAMA_ARG_BATCH_SIZE=2048
Environment=LLAMA_ARG_CTX_SIZE=131072
Environment=LLAMA_ARG_HF_REPO=unsloth/Qwen3-Coder-Next-GGUF:Q2_K_XL

# Hugging Face cache location and credentials
Environment=HF_HOME=/root/.cache/huggingface
# SECURITY(review): plaintext API token committed in a unit file. Rotate this
# token and move it out of here — e.g. `EnvironmentFile=` pointing at a file
# with 0600 permissions, or a podman secret:
#   Secret=hf_token,type=env,target=HF_TOKEN
Environment=HF_TOKEN=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

[Service]
# Restart only on failure; allow up to 15 min for first-start model download.
Restart=on-failure
TimeoutStartSec=15m

[Install]
WantedBy=multi-user.target default.target