From 57fe9133ded9f286301256939ddbfeff8ff69332 Mon Sep 17 00:00:00 2001
From: BadStorm
Date: Mon, 29 Jun 2026 10:29:35 +0200
Subject: [PATCH] Add embedding container

Run a dedicated llama.cpp server for embeddings on port 8091 and route
/v1/embeddings to it from nginx. The Hugging Face token is read from the
Podman secret "hf_token" instead of being stored in the unit file.

---
Reviewer notes (ignored by git am):
 - An earlier revision of this patch contained a hard-coded HF token.
   Treat that token as exposed and revoke it.
 - Before starting the unit, create the secret on the host:
     podman secret create hf_token /path/to/token-file
 - nginx context-line indentation was reconstructed; if the hunk does not
   apply cleanly, use `git apply --ignore-whitespace`.

 .../llamacpp/llamacpp-embedding.container     | 43 +++++++++++++++++++
 Services/llamacpp/llamacpp.nginx              | 19 ++++++++-
 2 files changed, 61 insertions(+), 1 deletion(-)
 create mode 100644 Services/llamacpp/llamacpp-embedding.container

diff --git a/Services/llamacpp/llamacpp-embedding.container b/Services/llamacpp/llamacpp-embedding.container
new file mode 100644
index 0000000..e1207f5
--- /dev/null
+++ b/Services/llamacpp/llamacpp-embedding.container
@@ -0,0 +1,43 @@
+[Container]
+ContainerName=llamacpp-embedding
+Image=localhost/llamacpp:vulkan-amd64
+Network=internal.network
+PublishPort=8091:8091
+
+# Same model directory as the chat container
+Volume=/srv/containers/aitools/models/hf:/root/.cache/huggingface/hub
+
+# GPU render node for the Vulkan image (same GPU as the chat container)
+AddDevice=/dev/dri/renderD128
+PodmanArgs=--group-add=keep-groups --ipc=host
+SecurityLabelType=container_runtime_t
+
+# Listen address and dedicated port of the embedding server
+Environment=LLAMA_ARG_HOST=0.0.0.0
+Environment=LLAMA_ARG_PORT=8091
+
+# Lightweight embedding model (~274MB at Q8)
+# NOTE(review): described by the author as multilingual / fine for Italian - confirm
+# Alternative: BAAI/bge-m3-GGUF for heavier multilingual workloads
+Environment=LLAMA_ARG_HF_REPO=nomic-ai/nomic-embed-text-v1.5-GGUF:nomic-embed-text-v1.5.Q8_0.gguf
+
+# Essential flag: start llama-server in embedding-only mode (env name is plural)
+Environment=LLAMA_ARG_EMBEDDINGS=true
+
+Environment=LLAMA_ARG_NO_MMAP=true
+# Reduced context: embeddings do not need 128k tokens
+Environment=LLAMA_ARG_CTX_SIZE=8192
+
+# Hugging Face cache location
+Environment=HF_HOME=/root/.cache/huggingface
+# HF_TOKEN is injected from the Podman secret "hf_token" so it never lives in git.
+# Create it once on the host: podman secret create hf_token /path/to/token-file
+Secret=hf_token,type=env,target=HF_TOKEN
+
+[Service]
+Restart=on-failure
+# Fast start-up: the model is small
+TimeoutStartSec=3m
+
+[Install]
+WantedBy=multi-user.target default.target
diff --git a/Services/llamacpp/llamacpp.nginx b/Services/llamacpp/llamacpp.nginx
index 5ea22b1..1a8c109 100644
--- a/Services/llamacpp/llamacpp.nginx
+++ b/Services/llamacpp/llamacpp.nginx
@@ -40,7 +40,24 @@ server {
     # Dimensione massima del body (es. per upload di file)
     client_max_body_size 512M;
 
-    # Proxy verso llama.cpp
+    # Embeddings: dedicated container (port 8091, lightweight model always in memory)
+    location /v1/embeddings {
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_set_header Authorization $http_authorization;
+
+        # Shorter timeouts: embedding requests are fast
+        proxy_connect_timeout 30s;
+        proxy_send_timeout 60s;
+        proxy_read_timeout 60s;
+
+        proxy_pass http://[ip_address]:8091;
+    }
+
+    # Chat / completions: main container (port 8090, Qwen3-Coder model)
     location / {
         proxy_http_version 1.1;
         proxy_set_header Upgrade $http_upgrade;